Mirror of https://github.com/Steffo99/unimore-hpc-assignments.git

Massive commit with undocumented changes

it's not like anybody but me reads the `git log` anyways

or does someone? in that case, hello, whoever you are! 👋
Stefano Pigozzi 2022-11-30 23:43:32 +01:00
parent 40affe04f5
commit f919c9f9ce
2 changed files with 164 additions and 7 deletions


@@ -18,18 +18,23 @@ run_benchmarks() {
for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
do
for c in $(seq 0 3)
for c in $(seq 0 7)
do
cxxflags="-D$dataset"
if (( $c & 1 ))
then
cxxflags="$cxxflags -DPOLYBENCH_INCLUDE_INIT"
cxxflags="$cxxflags -DHPC_INCLUDE_INIT"
fi
if (( $c & 2 ))
then
cxxflags="$cxxflags -DPOLYBENCH_USE_CUDA"
cxxflags="$cxxflags -DHPC_USE_CUDA"
fi
if (( $c & 4 ))
then
cxxflags="$cxxflags -DHPC_USE_STRIDE"
fi
echo "Flags: $cxxflags"


@@ -2,6 +2,7 @@
#include <unistd.h>
#include <string.h>
#include <math.h>
#include <iostream>
/* Include polybench common header. */
#include "polybench.hu"
@@ -17,14 +18,30 @@
#define M_PI 3.141
#endif
// Default if CUDA_NTHREADS is not set
#ifndef CUDA_NTHREADS
#define CUDA_NTHREADS 128
#endif
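// CUDA_NTHREADS is presumably meant to be the block size for the kernel launches
// in main(); a minimal sketch of how it could be used (the grid size computation
// and the use of NX as the element count are assumptions, not code from this
// commit, which currently launches with <<<1, 1>>>):
//
//     unsigned int blocks = (NX + CUDA_NTHREADS - 1) / CUDA_NTHREADS;
//     init_array_cuda<<<blocks, CUDA_NTHREADS>>>(A, X, Y);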
// Enable syntax highlighting for the CUDA mode
// TODO: Remove this, as it will be set by .bench.sh
#define HPC_USE_CUDA
// Enable syntax highlighting for the stride mode
// TODO: Remove this, as it will be set by .bench.sh
#define HPC_USE_STRIDE
/**
* Initialize the arrays to be used in the computation:
*
* - `x` is filled with multiples of `M_PI`;
* - `X` is filled with multiples of `M_PI`;
* - `Y` is zeroed;
* - `A` is filled with sample data.
*
* To be called on the CPU (uses the `__host__` qualifier).
*/
#ifndef HPC_USE_CUDA
__host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
{
/* X = [ 3.14, 6.28, 9.42, ... ] */
@@ -56,7 +73,99 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
}
}
}
#endif
/**
* Initialize the arrays to be used in the computation:
*
* - `X` is filled with multiples of `M_PI`;
* - `Y` is zeroed;
* - `A` is filled with sample data.
*
* It is launched by the host, runs on the device, and calls the per-array `init_array_cuda_*` initializers on the device.
*/
#ifdef HPC_USE_CUDA
// Forward declarations of the per-array device initializers defined below,
// so init_array_cuda can call them before their definitions appear
__device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads);
__device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads);
__device__ static void init_array_cuda_a(DATA_TYPE** A, unsigned int threads);
__global__ static void init_array_cuda(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
{
unsigned int threads = gridDim.x * blockDim.x;
init_array_cuda_x(X, threads);
init_array_cuda_y(Y, threads);
init_array_cuda_a(A, threads);
}
#endif
/**
* Initialize the `X` array.
*
* Runs on the device.
*/
#ifdef HPC_USE_CUDA
__device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
{
// Find how many iterations should be performed by each thread, rounding up so no element is left out
unsigned int perThread = (NY + threads - 1) / threads;
// Find the global index of the current thread, even if threads span multiple blocks
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
// Have each thread perform the previously determined number of iterations
for(int stride = 0; stride < perThread; stride++) {
// Find the index of the current iteration, advancing by the total thread count at each step
// This is equal to `y` of the init_array function
int iterationIdx = blockThreadIdx + stride * threads;
// Prevent the thread from accessing unallocated memory
if(iterationIdx < NY) {
// Set the array element
X[iterationIdx] = iterationIdx * M_PI;
}
}
}
#endif
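// Worked example of the coverage above, with purely illustrative numbers (not
// taken from this commit): if NY = 4096 and threads = 128, then perThread = 32,
// and thread 5 initializes X[5], X[133], X[261], ... for 32 elements, each one
// separated by the total thread count.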
/**
* Initialize the `Y` array.
*
* Runs on the device.
*/
#ifdef HPC_USE_CUDA
__device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
{
// Find how many iterations should be performed by each thread, rounding up so no element is left out
unsigned int perThread = (NX + threads - 1) / threads;
// Find the global index of the current thread, even if threads span multiple blocks
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
// Have each thread perform the previously determined number of iterations
for(int stride = 0; stride < perThread; stride++) {
// Find the index of the current iteration, advancing by the total thread count at each step
// This is equal to `x` of the init_array function
int iterationIdx = blockThreadIdx + stride * threads;
// Prevent the thread from accessing unallocated memory
if(iterationIdx < NX) {
// Set the array element
Y[iterationIdx] = 0;
}
}
}
#endif
/**
* Initialize the `A` array.
*
* Runs on the device.
*/
#ifdef HPC_USE_CUDA
__device__ static void init_array_cuda_a(DATA_TYPE** A, unsigned int threads)
{
}
#endif
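// The body above is still empty in this commit. A minimal sketch of what it might
// look like, assuming A actually refers to the flat NX*NY buffer allocated with a
// single cudaMalloc in main() (hence the cast to DATA_TYPE*), and assuming a
// placeholder fill formula, since the host-side "sample data" expression is not
// visible in this hunk:
//
//     DATA_TYPE* flatA = (DATA_TYPE*) A;
//     unsigned int perThread = (NX * NY + threads - 1) / threads;
//     int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
//     for(int stride = 0; stride < perThread; stride++) {
//         int iterationIdx = blockThreadIdx + stride * threads;
//         if(iterationIdx < NX * NY) {
//             int x = iterationIdx / NY;
//             int y = iterationIdx % NY;
//             // Assumed sample-data formula, only for illustration
//             flatA[iterationIdx] = ((DATA_TYPE) x * (y + 1)) / NX;
//         }
//     }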
/**
* Print the given array.
@@ -117,7 +226,7 @@ __host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
*/
__host__ int main(int argc, char** argv)
{
#ifndef POLYBENCH_USE_CUDA
#ifndef HPC_USE_CUDA
// A[NX][NY]
DATA_TYPE** A = new DATA_TYPE*[NX] {};
@@ -132,13 +241,13 @@ __host__ int main(int argc, char** argv)
// Y[NX]
DATA_TYPE* Y = new DATA_TYPE[NX] {};
#ifdef POLYBENCH_INCLUDE_INIT
#ifdef HPC_INCLUDE_INIT
polybench_start_instruments;
#endif
init_array(A, X, Y);
#ifndef POLYBENCH_INCLUDE_INIT
#ifndef HPC_INCLUDE_INIT
polybench_start_instruments;
#endif
@@ -153,7 +262,50 @@ __host__ int main(int argc, char** argv)
#else
DATA_TYPE** A;
DATA_TYPE* X;
DATA_TYPE* Y;
if(cudaMalloc(&A, sizeof(DATA_TYPE) * NX * NY))
{
std::cerr << "Could not allocate A on the device\n";
return 1;
}
if(cudaMalloc(&X, sizeof(DATA_TYPE) * NY))
{
std::cerr << "Could not allocate X on the device\n";
return 1;
}
if(cudaMalloc(&Y, sizeof(DATA_TYPE) * NX))
{
std::cerr << "Could not allocate Y on the device\n";
return 1;
}
#ifdef HPC_INCLUDE_INIT
polybench_start_instruments;
#endif
init_array_cuda<<<1, 1>>>(A, X, Y);
#ifndef HPC_INCLUDE_INIT
polybench_start_instruments;
#endif
// kernel_atax_cuda<<<1, 1>>>();
polybench_stop_instruments;
polybench_print_instruments;
// Y = cudaMemcpy();
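// A minimal sketch of the copy-back hinted at above, assuming a separate host
// buffer `hostY` (not present in this commit) to receive the device results:
//
//     DATA_TYPE* hostY = new DATA_TYPE[NX] {};
//     cudaMemcpy(hostY, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost);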
/*
polybench_prevent_dce(
print_array(Y)
);
*/
#endif