From 0de47a32139f79c8ad7ecf37243ccf11e36ae3f9 Mon Sep 17 00:00:00 2001
From: albert-de-montserrat
Date: Fri, 15 Dec 2023 10:39:13 +0100
Subject: [PATCH 1/3] cuda atomic add

---
 CUDA/atomic_add.cu | 127 +++++++++++++++++++++++++++++++++++++++++++++
 CUDA/atomic_add.jl | 102 ++++++++++++++++++++++++++++++++++++
 2 files changed, 229 insertions(+)
 create mode 100644 CUDA/atomic_add.cu
 create mode 100644 CUDA/atomic_add.jl

diff --git a/CUDA/atomic_add.cu b/CUDA/atomic_add.cu
new file mode 100644
index 0000000..f9694f0
--- /dev/null
+++ b/CUDA/atomic_add.cu
@@ -0,0 +1,127 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <cuda_runtime.h>
+
+#include <chrono>
+using namespace std::chrono;
+using nano_double = duration<double, std::nano>;
+
+#ifdef _WIN32
+#define EXPORT_API __declspec(dllexport)
+#else
+#define EXPORT_API
+#endif
+
+#define DAT double
+
+#define source(i) _source[i]
+#define indices(i, j) _indices[j * n + i]
+#define target1(i) _target1[i]
+#define target2(i) _target2[i]
+
+__global__ void no_atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i1 = indices(i, 0);
+    int i2 = indices(i, 1);
+    int i3 = indices(i, 2);
+    int i4 = indices(i, 3);
+    DAT v = source(i);
+    target1(i1) += v;
+    target1(i2) += v;
+    target1(i3) += v;
+    target1(i4) += v;
+    target2(i1) += v;
+    target2(i2) += v;
+    target2(i3) += v;
+    target2(i4) += v;
+}
+
+__global__ void atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i1 = indices(i, 0);
+    int i2 = indices(i, 1);
+    int i3 = indices(i, 2);
+    int i4 = indices(i, 3);
+    DAT v = source(i);
+    atomicAdd(&target1(i1), v);
+    atomicAdd(&target1(i2), v);
+    atomicAdd(&target1(i3), v);
+    atomicAdd(&target1(i4), v);
+    atomicAdd(&target2(i1), v);
+    atomicAdd(&target2(i2), v);
+    atomicAdd(&target2(i3), v);
+    atomicAdd(&target2(i4), v);
+}
+
+extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples) {
+    int i;
+    const int n = 1024;
+    const int bins = 64;
+    DAT *target1, *target2, *source;
+    int *indices;
+
+    srand((unsigned) time(NULL));
+
+    DAT *target1_h = (DAT *)malloc(bins * sizeof(DAT));
+    for (i = 0; i < bins; i++) {
+        target1_h[i] = (DAT)0.0;
+    }
+    DAT *target2_h = (DAT *)malloc(bins * sizeof(DAT));
+    for (i = 0; i < bins; i++) {
+        target2_h[i] = (DAT)0.0;
+    }
+    DAT *source_h = (DAT *)malloc(n * sizeof(DAT));
+    for (i = 0; i < n; i++) {
+        source_h[i] = static_cast<DAT>(rand()) / static_cast<DAT>(RAND_MAX);
+    }
+    int *indices_h = (int *)malloc(n * 4 * sizeof(int));
+    for (i = 0; i < (n * 4); i++) {
+        indices_h[i] = std::rand() % bins;
+    }
+
+    cudaMalloc(&target1, bins * sizeof(DAT));
+    cudaMalloc(&target2, bins * sizeof(DAT));
+    cudaMalloc(&source, n * sizeof(DAT));
+    cudaMalloc(&indices, n * 4 * sizeof(int));
+
+    cudaMemcpy(target1, target1_h, bins * sizeof(DAT), cudaMemcpyHostToDevice);
+    cudaMemcpy(target2, target2_h, bins * sizeof(DAT), cudaMemcpyHostToDevice);
+    cudaMemcpy( source,  source_h, n * sizeof(DAT), cudaMemcpyHostToDevice);
+    cudaMemcpy(indices, indices_h, n * 4 * sizeof(int), cudaMemcpyHostToDevice);
+
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    dim3 nthreads(256);
+    dim3 nblocks((n + nthreads.x - 1) / nthreads.x);
+
+    // for (int isample = 0; isample < nsamples; ++isample) {
+    //     auto timer = high_resolution_clock::now();
+    //     no_atomic_kernel<<<nblocks, nthreads, 0, stream>>>(target1, target2, source, indices, n);
+    //     cudaStreamSynchronize(stream);
+    //     auto elapsed = high_resolution_clock::now() - timer;
+    //     auto time_total = duration_cast<nano_double>(elapsed).count();
+    //     times[isample] = time_total;
+    // }
+
+    for (int isample = 0; isample < nsamples; ++isample) {
+        auto timer = high_resolution_clock::now();
+        atomic_kernel<<<nblocks, nthreads, 0, stream>>>(target1, target2, source, indices, n);
+        cudaStreamSynchronize(stream);
+        auto elapsed = high_resolution_clock::now() - timer;
+        auto time_total = duration_cast<nano_double>(elapsed).count();
+        times[isample] = time_total;
+    }
+
+    free(target1_h);
+    free(target2_h);
+    free(source_h);
+    free(indices_h);
+    cudaFree(target1);
+    cudaFree(target2);
+    cudaFree(source);
+    cudaFree(indices);
+
+    cudaStreamDestroy(stream);
+}
diff --git a/CUDA/atomic_add.jl b/CUDA/atomic_add.jl
new file mode 100644
index 0000000..fbed643
--- /dev/null
+++ b/CUDA/atomic_add.jl
@@ -0,0 +1,102 @@
+using CUDA
+using KernelAbstractions
+using BenchmarkTools
+using Libdl
+
+# function make_c_trial(nsamples)
+#     c_times = zeros(Float64, nsamples)
+#     c_gctimes = zeros(Float64, nsamples)
+#     c_memory = 0::Int64
+#     c_allocs = 0::Int64
+#     c_params = BenchmarkTools.DEFAULT_PARAMETERS
+#     c_params.samples = nsamples
+#     return BenchmarkTools.Trial(c_params, c_times, c_gctimes, c_memory, c_allocs)
+# end
+
+# INPUTS = Dict()
+
+# INPUTS["atomic"] = (
+#     c_samples=2000,
+# )
+
+function cuda_atomic_add!(target1, target2, source, indices)
+    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
+    i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4]
+    v = source[i]
+    CUDA.@atomic target1[i1] += v
+    CUDA.@atomic target1[i2] += v
+    CUDA.@atomic target1[i3] += v
+    CUDA.@atomic target1[i4] += v
+    CUDA.@atomic target2[i1] += v
+    CUDA.@atomic target2[i2] += v
+    CUDA.@atomic target2[i3] += v
+    CUDA.@atomic target2[i4] += v
+    return
+end
+
+@kernel function ka_atomic_add!(target1, target2, source, indices)
+    i = @index(Global, Linear)
+    i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4]
+    v = source[i]
+    KernelAbstractions.@atomic target1[i1] += v
+    KernelAbstractions.@atomic target1[i2] += v
+    KernelAbstractions.@atomic target1[i3] += v
+    KernelAbstractions.@atomic target1[i4] += v
+    KernelAbstractions.@atomic target2[i1] += v
+    KernelAbstractions.@atomic target2[i2] += v
+    KernelAbstractions.@atomic target2[i3] += v
+    KernelAbstractions.@atomic target2[i4] += v
+end
+
+function run_julia_benchmarks(::Type{DAT}) where DAT
+    n, bins = 1024, 64
+    target1 = CuArray(zeros(DAT, bins))
+    target2 = CuArray(zeros(DAT, bins))
+    source = CuArray(rand(DAT, n))
+    indices = CuArray(rand(1:bins, n, 4))
+
+    nthreads = 256
+    nblocks = cld(n, nthreads)
+
+    bm = @benchmark begin
+        @cuda threads=$nthreads blocks=$nblocks cuda_atomic_add!($target1, $target2, $source, $indices)
+        CUDA.synchronize()
+    end
+
+    bm_ka = @benchmark begin
+        ka_atomic_add!(CUDABackend(), 256, $n)($target1, $target2, $source, $indices)
+        KernelAbstractions.synchronize(CUDABackend())
+    end
+
+    CUDA.unsafe_free!(source)
+    CUDA.unsafe_free!(indices)
+    CUDA.unsafe_free!(target1)
+    CUDA.unsafe_free!(target2)
+
+    return (bm, bm_ka)
+end
+
+function run_c_benchmarks(lib, nsamples)
+    trial = make_c_trial(nsamples)
+
+    sym = Libdl.dlsym(lib, :run_benchmark)
+    @ccall $sym(trial.times::Ptr{Cdouble}, nsamples::Cint)::Cvoid
+
+    return trial
+end
+
+# Compile C benchmark
+libext = Sys.iswindows() ? "dll" : "so"
+libname = "atomic." * libext
+run(`hipcc -O3 -o $libname --shared -fPIC atomic.cu`)
+
+Libdl.dlopen("./$libname") do lib
+    group_n = BenchmarkGroup()
+    jb = run_julia_benchmarks(Float32)
+    group_n["julia"] = jb[1]
+    group_n["julia-ka"] = jb[2]
+    group_n["reference"] = run_c_benchmarks(lib, INPUTS["atomic"].c_samples)
+    display(group_n)
+end
+
+
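Note on the two kernels in PATCH 1/3: no_atomic_kernel updates the shared bins with plain read-modify-write accesses, so threads that draw the same random index race and silently drop contributions; that is why only atomic_kernel is timed and the first measurement loop is left commented out. A quick sanity check is to compare the device result after a single launch against a serial host reduction. The helper below is a sketch only, not part of the patch: the name check_atomic_kernel and the tolerance are illustrative, and it assumes the file's existing includes plus <math.h> for fabs.

/* Sketch (not in the patch): recompute the per-bin sums serially on the host
 * and compare them with target1_h after it has been copied back from the
 * device. Atomics make the floating-point summation order nondeterministic,
 * so compare with a tolerance rather than bitwise. */
static int check_atomic_kernel(const DAT *target1_h, const DAT *source_h,
                               const int *indices_h, int n, int bins) {
    DAT *ref = (DAT *)calloc(bins, sizeof(DAT));
    for (int i = 0; i < n; i++)
        for (int j = 0; j < 4; j++)
            ref[indices_h[j * n + i]] += source_h[i]; /* same layout as indices(i, j) */
    int ok = 1;
    for (int b = 0; b < bins; b++)
        if (fabs((double)(ref[b] - target1_h[b])) > 1e-10) ok = 0;
    free(ref);
    return ok;
}
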
From cc084fac54806918a0f778189be57696b24a513f Mon Sep 17 00:00:00 2001
From: albert-de-montserrat
Date: Fri, 15 Dec 2023 10:53:09 +0100
Subject: [PATCH 2/3] up

---
 CUDA/atomic_add.jl | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/CUDA/atomic_add.jl b/CUDA/atomic_add.jl
index fbed643..003f0d1 100644
--- a/CUDA/atomic_add.jl
+++ b/CUDA/atomic_add.jl
@@ -3,21 +3,21 @@ using KernelAbstractions
 using BenchmarkTools
 using Libdl
 
-# function make_c_trial(nsamples)
-#     c_times = zeros(Float64, nsamples)
-#     c_gctimes = zeros(Float64, nsamples)
-#     c_memory = 0::Int64
-#     c_allocs = 0::Int64
-#     c_params = BenchmarkTools.DEFAULT_PARAMETERS
-#     c_params.samples = nsamples
-#     return BenchmarkTools.Trial(c_params, c_times, c_gctimes, c_memory, c_allocs)
-# end
-
-# INPUTS = Dict()
-
-# INPUTS["atomic"] = (
-#     c_samples=2000,
-# )
+function make_c_trial(nsamples)
+    c_times = zeros(Float64, nsamples)
+    c_gctimes = zeros(Float64, nsamples)
+    c_memory = 0::Int64
+    c_allocs = 0::Int64
+    c_params = BenchmarkTools.DEFAULT_PARAMETERS
+    c_params.samples = nsamples
+    return BenchmarkTools.Trial(c_params, c_times, c_gctimes, c_memory, c_allocs)
+end
+
+INPUTS = Dict()
+
+INPUTS["atomic"] = (
+    c_samples=2000,
+)
 
 function cuda_atomic_add!(target1, target2, source, indices)
     i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
@@ -64,7 +64,7 @@
     end
 
     bm_ka = @benchmark begin
-        ka_atomic_add!(CUDABackend(), 256, $n)($target1, $target2, $source, $indices)
+        ka_atomic_add!($CUDABackend(), 256, $n)($target1, $target2, $source, $indices)
         KernelAbstractions.synchronize(CUDABackend())
     end
 
@@ -87,16 +87,14 @@
 
 # Compile C benchmark
 libext = Sys.iswindows() ? "dll" : "so"
-libname = "atomic." * libext
-run(`hipcc -O3 -o $libname --shared -fPIC atomic.cu`)
+libname = "atomic_add." * libext
+run(`nvcc -O3 -o $libname --shared -Xcompiler -fPIC atomic_add.cu`)
 
 Libdl.dlopen("./$libname") do lib
     group_n = BenchmarkGroup()
     jb = run_julia_benchmarks(Float32)
     group_n["julia"] = jb[1]
     group_n["julia-ka"] = jb[2]
-    group_n["reference"] = run_c_benchmarks(lib, INPUTS["atomic"].c_samples)
+    group_n["reference"] = run_c_benchmarks(lib, INPUTS["atomic_add"].c_samples)
     display(group_n)
-end
-
-
+end
\ No newline at end of file
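A remark on the measurement loop in run_benchmark (unchanged since PATCH 1/3): each chrono sample spans the host-side kernel launch plus cudaStreamSynchronize, so launch latency is folded into every sample. When device-only time is of interest, CUDA events are the usual alternative. The loop below is a sketch of a drop-in replacement for the timed loop, under the assumption that the samples should stay in nanoseconds to match the chrono-based ones; the start/stop names are illustrative.

/* Sketch: time the kernel on the device with CUDA events instead of host
 * chrono timers. cudaEventElapsedTime reports milliseconds. */
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
for (int isample = 0; isample < nsamples; ++isample) {
    cudaEventRecord(start, stream);
    atomic_kernel<<<nblocks, nthreads, 0, stream>>>(target1, target2, source, indices, n);
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    times[isample] = (double)ms * 1.0e6; /* ms -> ns, matching nano_double */
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
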
From 467a26e87747e927782c287fb7228c0921c7fb64 Mon Sep 17 00:00:00 2001
From: albert-de-montserrat
Date: Fri, 15 Dec 2023 14:36:53 +0100
Subject: [PATCH 3/3] fix nvcc compilation flag

---
 CUDA/atomic_add.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CUDA/atomic_add.jl b/CUDA/atomic_add.jl
index 003f0d1..f660382 100644
--- a/CUDA/atomic_add.jl
+++ b/CUDA/atomic_add.jl
@@ -15,7 +15,7 @@ end
 
 INPUTS = Dict()
 
-INPUTS["atomic"] = (
+INPUTS["atomic_add"] = (
     c_samples=2000,
 )
 
@@ -88,7 +88,7 @@ end
 # Compile C benchmark
 libext = Sys.iswindows() ? "dll" : "so"
 libname = "atomic_add." * libext
-run(`nvcc -O3 -o $libname --shared -Xcompiler -fPIC atomic_add.cu`)
+run(`nvcc -O3 -o $libname --shared -Xcompiler -fPIC -arch=sm_60 atomic_add.cu`)
 
 Libdl.dlopen("./$libname") do lib
     group_n = BenchmarkGroup()
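
Background for the -arch=sm_60 flag added in PATCH 3/3: the hardware atomicAdd overload for double only exists on compute capability 6.0 and newer, and nvcc's default target architecture is older than that, so without the flag the atomicAdd(&target1(i1), v) calls with DAT defined as double fail to compile; this is presumably the error behind the commit message "fix nvcc compilation flag". For devices below sm_60, the CUDA C++ Programming Guide documents a software fallback built on 64-bit atomicCAS; a sketch of that well-known pattern:

/* Software atomicAdd for double on devices below sm_60, following the CUDA
 * C++ Programming Guide: reinterpret the slot as a 64-bit integer and retry
 * the compare-and-swap until no other thread has updated it in between.
 * The integer comparison also avoids looping forever on NaN payloads. */
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ double atomicAdd(double *address, double val) {
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif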