mirror of
https://github.com/Steffo99/unimore-hpc-assignments.git
synced 2024-11-22 16:14:24 +00:00
Add comments and improvements
This commit is contained in:
parent
ef30e88e01
commit
7f02c31247
3 changed files with 30 additions and 19 deletions
|
@ -8,7 +8,7 @@ run_benchmarks() {
|
||||||
do
|
do
|
||||||
exet=$(./atax.elf 2> /dev/null)
|
exet=$(./atax.elf 2> /dev/null)
|
||||||
totalt=$(awk "BEGIN{print $totalt+$exet}")
|
totalt=$(awk "BEGIN{print $totalt+$exet}")
|
||||||
echo -n "."
|
echo -n "*"
|
||||||
# echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
|
# echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
|
||||||
done
|
done
|
||||||
|
|
||||||
|
@ -16,7 +16,7 @@ run_benchmarks() {
|
||||||
echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
|
echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
|
||||||
}
|
}
|
||||||
|
|
||||||
for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET
|
for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
|
||||||
do
|
do
|
||||||
for c in $(seq 0 3)
|
for c in $(seq 0 3)
|
||||||
do
|
do
|
||||||
|
@ -34,7 +34,9 @@ do
|
||||||
|
|
||||||
echo "Flags: $cxxflags"
|
echo "Flags: $cxxflags"
|
||||||
make --silent "clean"
|
make --silent "clean"
|
||||||
|
echo -n "C"
|
||||||
make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
|
make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
|
||||||
|
echo -n "B"
|
||||||
|
|
||||||
run_benchmarks
|
run_benchmarks
|
||||||
done
|
done
|
||||||
|
|
|
@ -40,4 +40,4 @@ bench:
|
||||||
./.bench.sh
|
./.bench.sh
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm *.elf
|
rm -f *.elf
|
||||||
|
|
13
atax/atax.cu
13
atax/atax.cu
|
@ -304,10 +304,13 @@ __global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y
|
||||||
// Have each thread perform the previously determined number of iterations
|
// Have each thread perform the previously determined number of iterations
|
||||||
for(int stride = 0; stride < perThread; stride++)
|
for(int stride = 0; stride < perThread; stride++)
|
||||||
{
|
{
|
||||||
|
// Iterate over x; y is not parallelized
|
||||||
unsigned int x = threads * stride + blockThreadIdx;
|
unsigned int x = threads * stride + blockThreadIdx;
|
||||||
|
|
||||||
|
// Prevent the thread from accessing unallocated memory
|
||||||
if(x < NX)
|
if(x < NX)
|
||||||
{
|
{
|
||||||
|
// The same tmp as earlier
|
||||||
DATA_TYPE tmp = 0;
|
DATA_TYPE tmp = 0;
|
||||||
|
|
||||||
for (unsigned int y = 0; y < NX; y++)
|
for (unsigned int y = 0; y < NX; y++)
|
||||||
|
@ -317,6 +320,8 @@ __global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y
|
||||||
|
|
||||||
for (unsigned int y = 0; y < NX; y++)
|
for (unsigned int y = 0; y < NX; y++)
|
||||||
{
|
{
|
||||||
|
// NOTE: this does not work on the Nano, as it is too old to support atomic addition with doubles (atomicAdd on double requires compute capability 6.0 or higher)!
|
||||||
|
// If you want to use the Nano, swap this atomicAdd for something else (e.g. an atomicCAS-based double addition), or change atax.hu to use float instead of double
|
||||||
atomicAdd(&Y[x], A[a_index(x, y)] * tmp);
|
atomicAdd(&Y[x], A[a_index(x, y)] * tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -394,8 +399,10 @@ __host__ int main(int argc, char** argv)
|
||||||
DATA_TYPE* A;
|
DATA_TYPE* A;
|
||||||
DATA_TYPE* X;
|
DATA_TYPE* X;
|
||||||
DATA_TYPE* Y;
|
DATA_TYPE* Y;
|
||||||
|
#ifdef HPC_DEBUG
|
||||||
DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
|
DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
|
||||||
DATA_TYPE* host_X = new DATA_TYPE[NY];
|
DATA_TYPE* host_X = new DATA_TYPE[NY];
|
||||||
|
#endif
|
||||||
DATA_TYPE* host_Y = new DATA_TYPE[NX];
|
DATA_TYPE* host_Y = new DATA_TYPE[NX];
|
||||||
|
|
||||||
print_debug("[CUDA] Allocating A...");
|
print_debug("[CUDA] Allocating A...");
|
||||||
|
@ -428,7 +435,7 @@ __host__ int main(int argc, char** argv)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
print_debug("[Init] Initializing...");
|
print_debug("[Init] Initializing...");
|
||||||
init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
|
init_array_cuda<<<32, 32>>>((DATA_TYPE*) A, (DATA_TYPE*) X, (DATA_TYPE*) Y);
|
||||||
if(cudaError_t err = cudaGetLastError())
|
if(cudaError_t err = cudaGetLastError())
|
||||||
{
|
{
|
||||||
print_cudaError(err, "[Init] Failed to execute kernel!");
|
print_cudaError(err, "[Init] Failed to execute kernel!");
|
||||||
|
@ -442,9 +449,10 @@ __host__ int main(int argc, char** argv)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
print_debug("[Kernel] Running...");
|
print_debug("[Kernel] Running...");
|
||||||
kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
|
kernel_atax_cuda<<<32, 32>>>((DATA_TYPE*) A, (DATA_TYPE*) X, (DATA_TYPE*) Y);
|
||||||
print_debug("[Kernel] Complete!");
|
print_debug("[Kernel] Complete!");
|
||||||
|
|
||||||
|
#ifdef HPC_DEBUG
|
||||||
print_debug("[CUDA] Copying A back...");
|
print_debug("[CUDA] Copying A back...");
|
||||||
if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
|
if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
|
||||||
print_cudaError(err, "[CUDA] Could copy A back!");
|
print_cudaError(err, "[CUDA] Could copy A back!");
|
||||||
|
@ -458,6 +466,7 @@ __host__ int main(int argc, char** argv)
|
||||||
return 1;
|
return 1;
|
||||||
};
|
};
|
||||||
print_debug("[CUDA] Copied X back!");
|
print_debug("[CUDA] Copied X back!");
|
||||||
|
#endif
|
||||||
|
|
||||||
print_debug("[CUDA] Copying Y back...");
|
print_debug("[CUDA] Copying Y back...");
|
||||||
if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {
|
if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {
|
||||||
|
|
Loading…
Reference in a new issue