diff --git a/CUDA/atomic_add.cu b/CUDA/atomic_add.cu new file mode 100644 index 0000000..f9694f0 --- /dev/null +++ b/CUDA/atomic_add.cu @@ -0,0 +1,127 @@ +#include +#include +#include +#include + +#include +using namespace std::chrono; +using nano_double = duration; + +#ifdef _WIN32 +#define EXPORT_API __declspec(dllexport) +#else +#define EXPORT_API +#endif + +#define DAT double + +#define source(i) _source[i] +#define indices(i, j) _indices[j * n + i] +#define target1(i) _target1[i] +#define target2(i) _target2[i] + +__global__ void no_atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int i1 = indices(i, 0); + int i2 = indices(i, 1); + int i3 = indices(i, 2); + int i4 = indices(i, 3); + DAT v = source(i); + target1(i1) += v; + target1(i2) += v; + target1(i3) += v; + target1(i4) += v; + target2(i1) += v; + target2(i2) += v; + target2(i3) += v; + target2(i4) += v; +} + +__global__ void atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int i1 = indices(i, 0); + int i2 = indices(i, 1); + int i3 = indices(i, 2); + int i4 = indices(i, 3); + DAT v = source(i); + atomicAdd(&target1(i1), v); + atomicAdd(&target1(i2), v); + atomicAdd(&target1(i3), v); + atomicAdd(&target1(i4), v); + atomicAdd(&target2(i1), v); + atomicAdd(&target2(i2), v); + atomicAdd(&target2(i3), v); + atomicAdd(&target2(i4), v); +} + +extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples) { + int i; + const int n = 1024; + const int bins = 64; + DAT *target1, *target2, *source; + int *indices; + + srand((unsigned) time(NULL)); + + DAT *target1_h = (DAT *)malloc(bins * sizeof(DAT)); + for (i = 0; i < bins; i++) { + target1_h[i] = (DAT)0.0; + } + DAT *target2_h = (DAT *)malloc(bins * sizeof(DAT)); + for (i = 0; i < bins; i++) { + target2_h[i] = (DAT)0.0; + } + DAT *source_h = (DAT *)malloc(n * sizeof(DAT)); + for (i = 0; i < n; i++) { + source_h[i] = static_cast(rand()) / static_cast(RAND_MAX); + } + int *indices_h = (int *)malloc(n * 4 * sizeof(int)); + for (i = 0; i < (n * 4); i++) { + indices_h[i] = std::rand() % bins; + } + + cudaMalloc(&target1, bins * sizeof(DAT)); + cudaMalloc(&target2, bins * sizeof(DAT)); + cudaMalloc(&source, n * sizeof(DAT)); + cudaMalloc(&indices, n * 4 * sizeof(int)); + + cudaMemcpy(target1, target1_h, bins * sizeof(DAT), cudaMemcpyHostToDevice); + cudaMemcpy(target2, target2_h, bins * sizeof(DAT), cudaMemcpyHostToDevice); + cudaMemcpy( source, source_h, n * sizeof(DAT), cudaMemcpyHostToDevice); + cudaMemcpy(indices, indices_h, n * 4 * sizeof(int), cudaMemcpyHostToDevice); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + dim3 nthreads(256); + dim3 nblocks((n + nthreads.x - 1) / nthreads.x); + + // for (int isample = 0; isample < nsamples; ++isample) { + // auto timer = high_resolution_clock::now(); + // hipLaunchKernelGGL(no_atomic_kernel, nblocks, nthreads, 0, stream, target1, target2, source, indices, n); + // hipStreamSynchronize(stream); + // auto elapsed = high_resolution_clock::now() - timer; + // auto time_total = duration_cast(elapsed).count(); + // times[isample] = time_total; + // } + + for (int isample = 0; isample < nsamples; ++isample) { + auto timer = high_resolution_clock::now(); + atomic_kernel<<>>(target1, target2, source, indices, n); + cudaStreamSynchronize(stream); + auto elapsed = high_resolution_clock::now() - timer; + auto time_total = duration_cast(elapsed).count(); + times[isample] = time_total; + } + + free(target1_h); + free(target2_h); + free(source_h); + free(indices_h); + cudaFree(target1); + cudaFree(target2); + cudaFree(source); + cudaFree(indices); + + cudaStreamDestroy(stream); +} diff --git a/CUDA/atomic_add.jl b/CUDA/atomic_add.jl new file mode 100644 index 0000000..f660382 --- /dev/null +++ b/CUDA/atomic_add.jl @@ -0,0 +1,100 @@ +using CUDA +using KernelAbstractions +using BenchmarkTools +using Libdl + +function make_c_trial(nsamples) + c_times = zeros(Float64, nsamples) + c_gctimes = zeros(Float64, nsamples) + c_memory = 0::Int64 + c_allocs = 0::Int64 + c_params = BenchmarkTools.DEFAULT_PARAMETERS + c_params.samples = nsamples + return BenchmarkTools.Trial(c_params, c_times, c_gctimes, c_memory, c_allocs) +end + +INPUTS = Dict() + +INPUTS["atomic_add"] = ( + c_samples=2000, +) + +function cuda_atomic_add!(target1, target2, source, indices) + i = threadIdx().x + (blockIdx().x - 1) * gridDim().x + i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4] + v = source[i] + CUDA.@atomic target1[i1] += v + CUDA.@atomic target1[i2] += v + CUDA.@atomic target1[i3] += v + CUDA.@atomic target1[i4] += v + CUDA.@atomic target2[i1] += v + CUDA.@atomic target2[i2] += v + CUDA.@atomic target2[i3] += v + CUDA.@atomic target2[i4] += v + return +end + +@kernel function ka_atomic_add!(target1, target2, source, indices) + i = @index(Global, Linear) + i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4] + v = source[i] + KernelAbstractions.@atomic target1[i1] += v + KernelAbstractions.@atomic target1[i2] += v + KernelAbstractions.@atomic target1[i3] += v + KernelAbstractions.@atomic target1[i4] += v + KernelAbstractions.@atomic target2[i1] += v + KernelAbstractions.@atomic target2[i2] += v + KernelAbstractions.@atomic target2[i3] += v + KernelAbstractions.@atomic target2[i4] += v +end + +function run_julia_benchmarks(::Type{DAT}) where DAT + n, bins = 1024, 64 + target1 = CuArray(zeros(DAT, bins)) + target2 = CuArray(zeros(DAT, bins)) + source = CuArray(rand(DAT, n)) + indices = CuArray(rand(1:bins, n, 4)) + + nthreads = 256 + nblocks = cld.(n, nthreads) + + bm = @benchmark begin + @cuda threads=$nthreads blocks=$nblocks cuda_atomic_add!($target1, $target2, $source, $indices) + CUDA.synchronize() + end + + bm_ka = @benchmark begin + ka_atomic_add!($CUDABackend(), 256, $n)($target1, $target2, $source, $indices) + KernelAbstractions.synchronize(CUDABackend()) + end + + CUDA.unsafe_free!(source) + CUDA.unsafe_free!(indices) + CUDA.unsafe_free!(target1) + CUDA.unsafe_free!(target2) + + return (bm, bm_ka) +end + +function run_c_benchmarks(lib, nsamples) + trial = make_c_trial(nsamples) + + sym = Libdl.dlsym(lib, :run_benchmark) + @ccall $sym(trial.times::Ptr{Cdouble}, nsamples::Cint)::Cvoid + + return trial +end + +# Compile C benchmark +libext = Sys.iswindows() ? "dll" : "so" +libname = "atomic_add." * libext +run(`nvcc -O3 -o $libname --shared -Xcompiler -fPIC -arch=sm_60 atomic_add.cu`) + +Libdl.dlopen("./$libname") do lib + group_n = BenchmarkGroup() + jb = run_julia_benchmarks(Float32) + group_n["julia"] = jb[1] + group_n["julia-ka"] = jb[2] + group_n["reference"] = run_c_benchmarks(lib, INPUTS["atomic_add"].c_samples) + display(group_n) +end \ No newline at end of file