From 7f02c3124714ce5f461d96864d683ac87e6e424d Mon Sep 17 00:00:00 2001 From: Stefano Pigozzi Date: Fri, 2 Dec 2022 00:50:44 +0100 Subject: [PATCH] Add comments and improvements --- atax/.bench.sh | 6 ++++-- atax/Makefile | 2 +- atax/atax.cu | 41 +++++++++++++++++++++++++---------------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/atax/.bench.sh b/atax/.bench.sh index a875cf7..47a5b7f 100755 --- a/atax/.bench.sh +++ b/atax/.bench.sh @@ -8,7 +8,7 @@ run_benchmarks() { do exet=$(./atax.elf 2> /dev/null) totalt=$(awk "BEGIN{print $totalt+$exet}") - echo -n "." + echo -n "*" # echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds" done @@ -16,7 +16,7 @@ run_benchmarks() { echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds" } -for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET +for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET do for c in $(seq 0 3) do @@ -34,7 +34,9 @@ do echo "Flags: $cxxflags" make --silent "clean" + echo -n "C" make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf" + echo -n "B" run_benchmarks done diff --git a/atax/Makefile b/atax/Makefile index 396c9c8..310dbad 100644 --- a/atax/Makefile +++ b/atax/Makefile @@ -40,4 +40,4 @@ bench: ./.bench.sh clean: - rm *.elf + rm -f *.elf diff --git a/atax/atax.cu b/atax/atax.cu index 83c104d..b686e78 100644 --- a/atax/atax.cu +++ b/atax/atax.cu @@ -304,10 +304,13 @@ __global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y // Have each thread perform the previously determined number of iterations for(int stride = 0; stride < perThread; stride++) { + // Iterate over x; y is not parallelized unsigned int x = threads * stride + blockThreadIdx; + // Prevent the thread from accessing unallocated memory if(x < NX) { + // The same tmp as earlier DATA_TYPE tmp = 0; for (unsigned int y = 0; y < NX; y++) @@ -317,6 +320,8 @@ __global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y for (unsigned int y = 0; y < NX; y++) { + // THIS DOES NOT WORK ON THE NANO, AS IT IS TOO OLD TO SUPPORT ATOMIC ADDITION WITH DOUBLES! + // If you want to use the Nano, swap this for something else, or change atax.hu to use float instead of double atomicAdd(&Y[x], A[a_index(x, y)] * tmp); } } @@ -394,8 +399,10 @@ __host__ int main(int argc, char** argv) DATA_TYPE* A; DATA_TYPE* X; DATA_TYPE* Y; - DATA_TYPE* host_A = new DATA_TYPE[NX * NY]; - DATA_TYPE* host_X = new DATA_TYPE[NY]; + #ifdef HPC_DEBUG + DATA_TYPE* host_A = new DATA_TYPE[NX * NY]; + DATA_TYPE* host_X = new DATA_TYPE[NY]; + #endif DATA_TYPE* host_Y = new DATA_TYPE[NX]; print_debug("[CUDA] Allocating A..."); @@ -428,7 +435,7 @@ __host__ int main(int argc, char** argv) #endif print_debug("[Init] Initializing..."); - init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y); + init_array_cuda<<<32, 32>>>((DATA_TYPE*) A, (DATA_TYPE*) X, (DATA_TYPE*) Y); if(cudaError_t err = cudaGetLastError()) { print_cudaError(err, "[Init] Failed to execute kernel!"); @@ -442,22 +449,24 @@ __host__ int main(int argc, char** argv) #endif print_debug("[Kernel] Running..."); - kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y); + kernel_atax_cuda<<<32, 32>>>((DATA_TYPE*) A, (DATA_TYPE*) X, (DATA_TYPE*) Y); print_debug("[Kernel] Complete!"); - print_debug("[CUDA] Copying A back..."); - if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) { - print_cudaError(err, "[CUDA] Could copy A back!"); - return 1; - }; - print_debug("[CUDA] Copied A back!"); + #ifdef HPC_DEBUG + print_debug("[CUDA] Copying A back..."); + if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) { + print_cudaError(err, "[CUDA] Could copy A back!"); + return 1; + }; + print_debug("[CUDA] Copied A back!"); - print_debug("[CUDA] Copying X back..."); - if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) { - print_cudaError(err, "[CUDA] Could copy X back!"); - return 1; - }; - print_debug("[CUDA] Copied X back!"); + print_debug("[CUDA] Copying X back..."); + if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) { + print_cudaError(err, "[CUDA] Could copy X back!"); + return 1; + }; + print_debug("[CUDA] Copied X back!"); + #endif print_debug("[CUDA] Copying Y back..."); if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {