1
Fork 0
mirror of https://github.com/Steffo99/unimore-hpc-assignments.git synced 2024-11-22 08:04:25 +00:00

Add comments and improvements

This commit is contained in:
Steffo 2022-12-02 00:50:44 +01:00
parent ef30e88e01
commit 7f02c31247
Signed by: steffo
GPG key ID: 6965406171929D01
3 changed files with 30 additions and 19 deletions

View file

@ -8,7 +8,7 @@ run_benchmarks() {
do
exet=$(./atax.elf 2> /dev/null)
totalt=$(awk "BEGIN{print $totalt+$exet}")
echo -n "."
echo -n "*"
# echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
done
@ -16,7 +16,7 @@ run_benchmarks() {
echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
}
for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET
for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
do
for c in $(seq 0 3)
do
@ -34,7 +34,9 @@ do
echo "Flags: $cxxflags"
make --silent "clean"
echo -n "C"
make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
echo -n "B"
run_benchmarks
done

View file

@ -40,4 +40,4 @@ bench:
./.bench.sh
clean:
rm *.elf
rm -f *.elf

View file

@ -304,10 +304,13 @@ __global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y
// Have each thread perform the previously determined number of iterations
for(int stride = 0; stride < perThread; stride++)
{
// Iterate over x; y is not parallelized
unsigned int x = threads * stride + blockThreadIdx;
// Prevent the thread from accessing unallocated memory
if(x < NX)
{
// The same tmp as earlier
DATA_TYPE tmp = 0;
for (unsigned int y = 0; y < NX; y++)
@ -317,6 +320,8 @@ __global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y
for (unsigned int y = 0; y < NX; y++)
{
// NOTE: atomicAdd() on double operands requires a GPU of compute capability 6.0 or newer, so THIS DOES NOT WORK ON THE NANO, which is too old to support atomic addition with doubles!
// If you want to use the Nano, swap this for something else, or change atax.hu to use float instead of double.
atomicAdd(&Y[x], A[a_index(x, y)] * tmp);
}
}
@ -394,8 +399,10 @@ __host__ int main(int argc, char** argv)
DATA_TYPE* A;
DATA_TYPE* X;
DATA_TYPE* Y;
DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
DATA_TYPE* host_X = new DATA_TYPE[NY];
#ifdef HPC_DEBUG
DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
DATA_TYPE* host_X = new DATA_TYPE[NY];
#endif
DATA_TYPE* host_Y = new DATA_TYPE[NX];
print_debug("[CUDA] Allocating A...");
@ -428,7 +435,7 @@ __host__ int main(int argc, char** argv)
#endif
print_debug("[Init] Initializing...");
init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
init_array_cuda<<<32, 32>>>((DATA_TYPE*) A, (DATA_TYPE*) X, (DATA_TYPE*) Y);
if(cudaError_t err = cudaGetLastError())
{
print_cudaError(err, "[Init] Failed to execute kernel!");
@ -442,22 +449,24 @@ __host__ int main(int argc, char** argv)
#endif
print_debug("[Kernel] Running...");
kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
kernel_atax_cuda<<<32, 32>>>((DATA_TYPE*) A, (DATA_TYPE*) X, (DATA_TYPE*) Y);
print_debug("[Kernel] Complete!");
print_debug("[CUDA] Copying A back...");
if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
print_cudaError(err, "[CUDA] Could copy A back!");
return 1;
};
print_debug("[CUDA] Copied A back!");
#ifdef HPC_DEBUG
print_debug("[CUDA] Copying A back...");
if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
print_cudaError(err, "[CUDA] Could copy A back!");
return 1;
};
print_debug("[CUDA] Copied A back!");
print_debug("[CUDA] Copying X back...");
if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) {
print_cudaError(err, "[CUDA] Could copy X back!");
return 1;
};
print_debug("[CUDA] Copied X back!");
print_debug("[CUDA] Copying X back...");
if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) {
print_cudaError(err, "[CUDA] Could copy X back!");
return 1;
};
print_debug("[CUDA] Copied X back!");
#endif
print_debug("[CUDA] Copying Y back...");
if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {