mirror of https://github.com/Steffo99/unimore-hpc-assignments.git synced 2024-11-25 17:44:23 +00:00

Improve debugging tooling

Stefano Pigozzi 2022-12-01 01:12:07 +01:00
parent f919c9f9ce
commit 6b770e1ef2
2 changed files with 82 additions and 31 deletions

File 1 of 2: Makefile

@@ -23,9 +23,10 @@ NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
$(NVCC) $(NVCFLAGS) -c $< -o $@
.PHONY: bench clean
.PHONY: bench clean dev
all: atax.elf
dev: atax.elf
./atax.elf
bench:
./.bench.sh

File 2 of 2: CUDA source (atax)

@@ -31,6 +31,9 @@
// TODO: Remove this, as it will be set by .bench.sh
#define HPC_USE_STRIDE
// Create macro for debug logging
#define debug(txt) std::cerr << txt << std::endl
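The macro assumes `<iostream>` is already included by the surrounding code. As a sketch of a possible refinement (not part of this commit), the logging could be made easy to silence for benchmark runs by guarding it behind a hypothetical HPC_DEBUG flag:

#include <iostream>

// Hypothetical variant, not in this commit: logging only happens when
// HPC_DEBUG is defined, so benchmark runs pay no extra stderr cost.
#ifdef HPC_DEBUG
#define debug(txt) std::cerr << txt << std::endl
#else
#define debug(txt)
#endif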
/**
* Initialize the arrays to be used in the computation:
@@ -75,26 +78,6 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
}
#endif
/**
* Initialize the arrays to be used in the computation:
*
* - `X` is filled with multiples of `M_PI`;
* - `Y` is zeroed;
* - `A` is filled with sample data.
*
* It is called by the host, runs on the device, and calls the other init_arrays on the device.
*/
#ifdef HPC_USE_CUDA
__global__ static void init_array_cuda(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
{
unsigned int threads = gridDim.x * blockDim.x;
init_array_cuda_x(X, threads);
init_array_cuda_y(Y, threads);
init_array_cuda_a(A, threads);
}
#endif
/**
* Initialize the `X` array.
*
@@ -161,9 +144,40 @@ __device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
* Runs on the device.
*/
#ifdef HPC_USE_CUDA
__device__ static void init_array_cuda_a(DATA_TYPE** A, unsigned int threads)
__device__ static void init_array_cuda_a(DATA_TYPE* A, unsigned int threads)
{
// Find how many elements should be written in total
unsigned int elements = NX * NY;
// Find how many iterations should be performed by each thread
unsigned int perThread = elements / threads;
// Find the index of the current thread, even if threads span multiple blocks
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
/* TODO */
}
#endif
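A minimal sketch of how the /* TODO */ body above might be completed, with each thread filling its own contiguous slice of perThread cells; the value written to each cell is a placeholder, not a formula taken from this repository:

// Sketch only, using the elements / perThread / blockThreadIdx values computed above.
for(unsigned int offset = 0; offset < perThread; offset++)
{
    unsigned int i = blockThreadIdx * perThread + offset;
    if(i < elements)
    {
        A[i] = ((DATA_TYPE) i) / elements;  // placeholder value, not the real sample data
    }
}
// Note: if elements is not an exact multiple of threads, the last
// elements % threads cells are left untouched by this per-thread split.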
/**
* Initialize the arrays to be used in the computation:
*
* - `X` is filled with multiples of `M_PI`;
* - `Y` is zeroed;
* - `A` is filled with sample data.
*
* Beware that `A` here is a flat array, not a matrix of row pointers, so elements are accessed via `[y * NX + x]` (I think?).
*
* It is called by the host, runs on the device, and calls the other init_arrays on the device.
*/
#ifdef HPC_USE_CUDA
__global__ static void init_array_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
{
unsigned int threads = gridDim.x * blockDim.x;
init_array_cuda_x(X, threads);
init_array_cuda_y(Y, threads);
init_array_cuda_a(A, threads);
}
#endif
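For context, threads inside these init kernels depends only on the launch configuration; with the <<<32, 32>>> launch used further down in main, the arithmetic is:

// gridDim.x  = 32   (blocks in the grid)
// blockDim.x = 32   (threads per block)
// threads    = 32 * 32 = 1024 cooperating threads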
@@ -226,8 +240,14 @@ __host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
*/
__host__ int main(int argc, char** argv)
{
debug("Starting main...");
#ifndef HPC_USE_CUDA
debug("[Mode] Host-only");
debug("[Pointers] Allocating...");
// A[NX][NY]
DATA_TYPE** A = new DATA_TYPE*[NX] {};
for(unsigned int x = 0; x < NX; x++)
@@ -241,56 +261,86 @@ __host__ int main(int argc, char** argv)
// Y[NX]
DATA_TYPE* Y = new DATA_TYPE[NX] {};
debug("[Pointers] Allocated!");
#ifdef HPC_INCLUDE_INIT
debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
debug("[Init] Initializing...");
init_array(A, X, Y);
debug("[Init] Initialized!");
#ifndef HPC_INCLUDE_INIT
debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
debug("[Kernel] Running...");
kernel_atax(A, X, Y);
debug("[Kernel] Completed!");
debug("[Benchmark] Stopping...");
polybench_stop_instruments;
polybench_print_instruments;
debug("[Benchmark] Complete!");
debug("[Verify] Printing...")
polybench_prevent_dce(
print_array(Y)
);
debug("[Verify] Done!")
#else
debug("[Mode] Host-and-device, CUDA");
DATA_TYPE** A;
debug("[Pointers] Allocating...");
DATA_TYPE* A;
DATA_TYPE* X;
DATA_TYPE* Y;
if(cudaMalloc(&A, sizeof(DATA_TYPE) * NX * NY))
debug("[CUDA] Allocating A...");
if(cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY))
{
std::cerr << "Could not allocate A on the device\n";
debug("[CUDA] Could not allocate A!");
return 1;
}
debug("[CUDA] Allocated A!");
if(cudaMalloc(&X, sizeof(DATA_TYPE) * NY))
debug("[CUDA] Allocating X...");
if(cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
{
std::cerr << "Could not allocate X on the device\n";
debug("[CUDA] Could not allocate X!");
return 1;
}
debug("[CUDA] Allocated X!");
if(cudaMalloc(&Y, sizeof(DATA_TYPE) * NX))
debug("[CUDA] Allocating Y...");
if(cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
{
std::cerr << "Could not allocate Y on the device\n";
debug("[CUDA] Could not allocate Y!");
return 1;
}
debug("[CUDA] Allocated Y!");
#ifdef POLYBENCH_INCLUDE_INIT
debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
init_array_cuda<<<1, 1>>>(A, X, Y);
debug("[Init] Initializing...");
init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
if(cudaGetLastError())
{
debug("[Init] Failed to execute kernel!");
return 1;
}
debug("[Init] Initialized!");
#ifndef POLYBENCH_INCLUDE_INIT
debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
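One caveat, offered as a suggestion rather than as part of the commit: cudaGetLastError() right after the <<<32, 32>>> launch above only reports launch and configuration failures, because kernels execute asynchronously; errors raised while init_array_cuda actually runs only become visible after a synchronization, roughly like this:

// Hypothetical stricter check, not in this commit: synchronizing makes
// execution-time errors from the kernel visible as well.
cudaError_t launchErr = cudaGetLastError();
cudaError_t syncErr = cudaDeviceSynchronize();
if(launchErr != cudaSuccess || syncErr != cudaSuccess)
{
    debug("[Init] Kernel error: " << cudaGetErrorString(launchErr != cudaSuccess ? launchErr : syncErr));
    return 1;
}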