1
Fork 0
mirror of https://github.com/Steffo99/unimore-hpc-assignments.git synced 2024-11-22 16:14:24 +00:00

Add comments and improvements

This commit is contained in:
Steffo 2022-12-02 00:50:44 +01:00
parent ef30e88e01
commit 7f02c31247
Signed by: steffo
GPG key ID: 6965406171929D01
3 changed files with 30 additions and 19 deletions

View file

@ -8,7 +8,7 @@ run_benchmarks() {
do do
exet=$(./atax.elf 2> /dev/null) exet=$(./atax.elf 2> /dev/null)
totalt=$(awk "BEGIN{print $totalt+$exet}") totalt=$(awk "BEGIN{print $totalt+$exet}")
echo -n "." echo -n "*"
# echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds" # echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
done done
@ -16,7 +16,7 @@ run_benchmarks() {
echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds" echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
} }
for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
do do
for c in $(seq 0 3) for c in $(seq 0 3)
do do
@ -34,7 +34,9 @@ do
echo "Flags: $cxxflags" echo "Flags: $cxxflags"
make --silent "clean" make --silent "clean"
echo -n "C"
make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf" make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
echo -n "B"
run_benchmarks run_benchmarks
done done

View file

@ -40,4 +40,4 @@ bench:
./.bench.sh ./.bench.sh
clean: clean:
rm *.elf rm -f *.elf

View file

@ -304,10 +304,13 @@ __global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y
// Have each thread perform the previously determined number of iterations // Have each thread perform the previously determined number of iterations
for(int stride = 0; stride < perThread; stride++) for(int stride = 0; stride < perThread; stride++)
{ {
// Iterate over x; y is not parallelized
unsigned int x = threads * stride + blockThreadIdx; unsigned int x = threads * stride + blockThreadIdx;
// Prevent the thread from accessing unallocated memory
if(x < NX) if(x < NX)
{ {
// The same tmp as earlier
DATA_TYPE tmp = 0; DATA_TYPE tmp = 0;
for (unsigned int y = 0; y < NX; y++) for (unsigned int y = 0; y < NX; y++)
@ -317,6 +320,8 @@ __global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y
for (unsigned int y = 0; y < NX; y++) for (unsigned int y = 0; y < NX; y++)
{ {
// NOTE: This does not work on the Nano: atomicAdd on double requires compute capability 6.0 or newer, and the Nano's GPU is too old to provide it.
// If you want to use the Nano, swap this for something else, or change atax.hu to use float instead of double.
atomicAdd(&Y[x], A[a_index(x, y)] * tmp); atomicAdd(&Y[x], A[a_index(x, y)] * tmp);
} }
} }
@ -394,8 +399,10 @@ __host__ int main(int argc, char** argv)
DATA_TYPE* A; DATA_TYPE* A;
DATA_TYPE* X; DATA_TYPE* X;
DATA_TYPE* Y; DATA_TYPE* Y;
#ifdef HPC_DEBUG
DATA_TYPE* host_A = new DATA_TYPE[NX * NY]; DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
DATA_TYPE* host_X = new DATA_TYPE[NY]; DATA_TYPE* host_X = new DATA_TYPE[NY];
#endif
DATA_TYPE* host_Y = new DATA_TYPE[NX]; DATA_TYPE* host_Y = new DATA_TYPE[NX];
print_debug("[CUDA] Allocating A..."); print_debug("[CUDA] Allocating A...");
@ -428,7 +435,7 @@ __host__ int main(int argc, char** argv)
#endif #endif
print_debug("[Init] Initializing..."); print_debug("[Init] Initializing...");
init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y); init_array_cuda<<<32, 32>>>((DATA_TYPE*) A, (DATA_TYPE*) X, (DATA_TYPE*) Y);
if(cudaError_t err = cudaGetLastError()) if(cudaError_t err = cudaGetLastError())
{ {
print_cudaError(err, "[Init] Failed to execute kernel!"); print_cudaError(err, "[Init] Failed to execute kernel!");
@ -442,9 +449,10 @@ __host__ int main(int argc, char** argv)
#endif #endif
print_debug("[Kernel] Running..."); print_debug("[Kernel] Running...");
kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y); kernel_atax_cuda<<<32, 32>>>((DATA_TYPE*) A, (DATA_TYPE*) X, (DATA_TYPE*) Y);
print_debug("[Kernel] Complete!"); print_debug("[Kernel] Complete!");
#ifdef HPC_DEBUG
print_debug("[CUDA] Copying A back..."); print_debug("[CUDA] Copying A back...");
if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) { if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
print_cudaError(err, "[CUDA] Could copy A back!"); print_cudaError(err, "[CUDA] Could copy A back!");
@ -458,6 +466,7 @@ __host__ int main(int argc, char** argv)
return 1; return 1;
}; };
print_debug("[CUDA] Copied X back!"); print_debug("[CUDA] Copied X back!");
#endif
print_debug("[CUDA] Copying Y back..."); print_debug("[CUDA] Copying Y back...");
if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) { if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {