diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..c6453fa
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,6 @@
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+indent_style = tab
diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 13566b8..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Editor-based HTTP Client requests
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml
deleted file mode 100644
index 9de6c18..0000000
--- a/.idea/codeStyles/Project.xml
+++ /dev/null
@@ -1,40 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml
deleted file mode 100644
index 79ee123..0000000
--- a/.idea/codeStyles/codeStyleConfig.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/customTargets.xml b/.idea/customTargets.xml
deleted file mode 100644
index 0d91871..0000000
--- a/.idea/customTargets.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/discord.xml b/.idea/discord.xml
deleted file mode 100644
index 8cf359d..0000000
--- a/.idea/discord.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
deleted file mode 100644
index 4696d94..0000000
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/makefile.xml b/.idea/makefile.xml
deleted file mode 100644
index 952d6c0..0000000
--- a/.idea/makefile.xml
+++ /dev/null
@@ -1,25 +0,0 @@
-
-
-
-
-
-
-
-
- all
-
-
-
-
-
-
-
-
- clean
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index d135688..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,20 +0,0 @@
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 45b35be..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/runConfigurations/atax_acc.xml b/.idea/runConfigurations/atax_acc.xml
deleted file mode 100644
index ebaf15b..0000000
--- a/.idea/runConfigurations/atax_acc.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/unimore-hpc-1.iml b/.idea/unimore-hpc-1.iml
deleted file mode 100644
index d6ebd48..0000000
--- a/.idea/unimore-hpc-1.iml
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/unimore-hpc-12.iml b/.idea/unimore-hpc-12.iml
deleted file mode 100644
index 771e54c..0000000
--- a/.idea/unimore-hpc-12.iml
+++ /dev/null
@@ -1,2 +0,0 @@
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 35eb1dd..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000..4ec76b3
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,106 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1669932513703
+
+
+ 1669932513703
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
index b7a1887..3616aa3 100644
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@@ -6,7 +6,7 @@
"${workspaceFolder}/**"
],
"defines": [],
- "compilerPath": "/usr/local/cuda-10.0/bin/nvcc",
+ "compilerPath": "/opt/cuda/bin/nvcc",
"cStandard": "c11",
"cppStandard": "c++14",
"configurationProvider": "ms-vscode.makefile-tools"
diff --git a/.vscode/launch.json b/.vscode/launch.json
index b2e9531..a9bd71b 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -15,6 +15,7 @@
"environment": [],
"externalConsole": false,
"MIMode": "gdb",
+ "miDebuggerPath": "/usr/bin/gdb",
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
diff --git a/.vscode/settings.json b/.vscode/settings.json
index a728723..f7ad1fb 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,5 +1,39 @@
{
"files.associations": {
- "*.hu": "cuda-cpp"
+ "*.hu": "cuda-cpp",
+ "array": "cpp",
+ "*.tcc": "cpp",
+ "cctype": "cpp",
+ "clocale": "cpp",
+ "cmath": "cpp",
+ "compare": "cpp",
+ "concepts": "cpp",
+ "cstdarg": "cpp",
+ "cstdint": "cpp",
+ "cstdio": "cpp",
+ "cstdlib": "cpp",
+ "cwchar": "cpp",
+ "cwctype": "cpp",
+ "unordered_map": "cpp",
+ "vector": "cpp",
+ "exception": "cpp",
+ "functional": "cpp",
+ "initializer_list": "cpp",
+ "iosfwd": "cpp",
+ "iostream": "cpp",
+ "istream": "cpp",
+ "limits": "cpp",
+ "new": "cpp",
+ "numbers": "cpp",
+ "ostream": "cpp",
+ "stdexcept": "cpp",
+ "streambuf": "cpp",
+ "string": "cpp",
+ "string_view": "cpp",
+ "system_error": "cpp",
+ "tuple": "cpp",
+ "type_traits": "cpp",
+ "typeinfo": "cpp",
+ "utility": "cpp"
}
}
\ No newline at end of file
diff --git a/atax/.bench.sh b/atax/.bench.sh
index 9d3b769..a875cf7 100755
--- a/atax/.bench.sh
+++ b/atax/.bench.sh
@@ -1,12 +1,12 @@
#!/bin/bash
run_benchmarks() {
- runs=25
+ runs=3
totalt=0.0
for i in $(seq $runs)
do
- exet=$(./atax.elf)
+ exet=$(./atax.elf 2> /dev/null)
totalt=$(awk "BEGIN{print $totalt+$exet}")
echo -n "."
# echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
@@ -16,9 +16,9 @@ run_benchmarks() {
echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
}
-for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
+for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET
do
- for c in $(seq 0 7)
+ for c in $(seq 0 3)
do
cxxflags="-D$dataset"
@@ -32,12 +32,8 @@ do
cxxflags="$cxxflags -DHPC_USE_CUDA"
fi
- if (( $c & 2 ))
- then
- cxxflags="$cxxflags -DHPC_USE_STRIDE"
- fi
-
echo "Flags: $cxxflags"
+ make --silent "clean"
make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
run_benchmarks
diff --git a/atax/Makefile b/atax/Makefile
index 2978ab6..396c9c8 100644
--- a/atax/Makefile
+++ b/atax/Makefile
@@ -5,16 +5,25 @@ MAKEFLAGS+= -r
CXXFLAGS+= -DPOLYBENCH_TIME
# -O3 applies all compiler optimization, improving from 800ms to 300ms
CXXFLAGS+= -O3
+# Enable this to view the contents of the arrays
+CXXFLAGS+= -DHPC_DEBUG
+# Enable this to use CUDA
+CXXFLAGS+= -DHPC_USE_CUDA
# Extend CFLAGS with command line parameters
CXXFLAGS+= ${EXTRA_CXXFLAGS}
# Select the location of the local CUDA install
-CUDA_HOME:=/usr/local/cuda-10.0
+# CUDA_HOME:=/usr/local/cuda-10.0
+CUDA_HOME:=/opt/cuda
# Specify the directory of the nvc compiler
NVCC:=$(CUDA_HOME)/bin/nvcc
# Specify the flags for the nvc compiler
NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
+# Optimize for @Steffo's NVIDIA GTX 1070
+NVCFLAGS+= -arch=compute_61
+NVCFLAGS+= -code=sm_61
+
%.elf: %.cu.o polybench.cu.o
$(NVCC) $(NVCFLAGS) $^ -o $@ $(LDFLAGS)
@@ -23,13 +32,12 @@ NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
$(NVCC) $(NVCFLAGS) -c $< -o $@
-.PHONY: bench clean dev
+all: atax.elf
-dev: atax.elf
- ./atax.elf
+.PHONY: bench clean
bench:
./.bench.sh
clean:
- rm *.elf *.cu.o
+ rm *.elf
diff --git a/atax/atax.cu b/atax/atax.cu
index 933431e..83c104d 100644
--- a/atax/atax.cu
+++ b/atax/atax.cu
@@ -3,6 +3,7 @@
#include
#include
#include
+#include
/* Include polybench common header. */
#include "polybench.hu"
@@ -23,16 +24,34 @@
#define CUDA_NTHREADS 128
#endif
-// Enable syntax highlighting for the CUDA mode
-// TODO: Remove this, as it will be set by .bench.sh
-#define HPC_USE_CUDA
-// Enable syntax highlighting for the stride mode
-// TODO: Remove this, as it will be set by .bench.sh
-#define HPC_USE_STRIDE
+/**
+ * Given a `x` and a `y`, compute the relative index of the element in the `A` matrix.
+ */
+__host__ __device__ inline static unsigned int a_index(unsigned int x, unsigned int y) {
+ return x * NY + y;
+}
-// Create macro for debug logging
-#define debug(txt) std::cerr << txt << std::endl
+/**
+ * Log a debug message.
+ */
+__host__ inline static void print_debug(std::string txt) {
+ #ifdef HPC_DEBUG
+ std::cerr << txt << std::endl;
+ #endif
+}
+
+/**
+ * Log an error message.
+ */
+#ifdef HPC_USE_CUDA
+__host__ inline static void print_cudaError(cudaError_t err, std::string txt) {
+ #ifdef HPC_DEBUG
+ std::cerr << txt;
+ fprintf( stderr, ": error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(err) );
+ #endif
+}
+#endif
/**
@@ -45,7 +64,7 @@
* To be called on the CPU (uses the `__host__` qualifier).
*/
#ifndef HPC_USE_CUDA
-__host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
+__host__ static void init_array(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
{
/* X = [ 3.14, 6.28, 9.42, ... ] */
for (unsigned int y = 0; y < NY; y++)
@@ -72,7 +91,7 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
{
for (unsigned int y = 0; y < NY; y++)
{
- A[x][y] = (DATA_TYPE)(x * (y + 1)) / NX;
+ A[a_index(x, y)] = (DATA_TYPE)(x * (y + 1)) / NX;
}
}
}
@@ -87,20 +106,21 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
__device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
{
// Find how many iterations should be performed by each thread
- unsigned int perThread = NY / threads;
+ unsigned int perThread = NY / threads + 1;
// Find the index of the current thread, even if threads span multiple blocks
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
// Have each thread perform the previously determined number of iterations
- for(int stride = 0; stride < perThread; stride++) {
+ for(int stride = 0; stride < perThread; stride++)
+ {
// Find the index of the current iteration
// This is equal to `y` of the init_array function
- int iterationIdx = blockThreadIdx * stride;
+ unsigned int iterationIdx = threads * stride + blockThreadIdx;
// Prevent the thread from accessing unallocated memory
- if(iterationIdx < NY) {
-
+ if(iterationIdx < NY)
+ {
// Set the array element
X[iterationIdx] = iterationIdx * M_PI;
}
@@ -117,20 +137,21 @@ __device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
__device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
{
// Find how many iterations should be performed by each thread
- unsigned int perThread = NX / threads;
+ unsigned int perThread = NX / threads + 1;
// Find the index of the current thread, even if threads span multiple blocks
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
// Have each thread perform the previously determined number of iterations
- for(int stride = 0; stride < perThread; stride++) {
+ for(int stride = 0; stride < perThread; stride++)
+ {
// Find the index of the current iteration
// This is equal to `y` of the init_array function
- int iterationIdx = blockThreadIdx * stride;
+ unsigned int iterationIdx = threads * stride + blockThreadIdx;
// Prevent the thread from accessing unallocated memory
- if(iterationIdx < NX) {
-
+ if(iterationIdx < NX)
+ {
// Set the array element
Y[iterationIdx] = 0;
}
@@ -150,12 +171,29 @@ __device__ static void init_array_cuda_a(DATA_TYPE* A, unsigned int threads)
unsigned int elements = NX * NY;
// Find how many iterations should be performed by each thread
- unsigned int perThread = elements / threads;
+ unsigned int perThread = elements / threads + 1;
// Find the index of the current thread, even if threads span multiple blocks
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
- /* TODO */
+ // Have each thread perform the previously determined number of iterations
+ for(int stride = 0; stride < perThread; stride++)
+ {
+ // Find the index of the current iteration
+ // This is equal to `y` of the init_array function
+ unsigned int iterationIdx = threads * stride + blockThreadIdx;
+
+ // Determine current x and y
+ unsigned int y = iterationIdx % NY;
+ unsigned int x = iterationIdx / NY;
+
+ // Prevent the thread from accessing unallocated memory
+ if(iterationIdx < elements)
+ {
+ // Set the array element
+ A[iterationIdx] = (DATA_TYPE)(x * (y + 1)) / NX;
+ }
+ }
}
#endif
@@ -188,11 +226,11 @@ __global__ static void init_array_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
*
* To be called on the CPU (uses the `__host__` qualifier).
*/
-__host__ static void print_array(DATA_TYPE* Y)
+__host__ static void print_array(DATA_TYPE* Z, unsigned int size)
{
- for (unsigned int x = 0; x < NX; x++)
+ for (unsigned int z = 0; z < size; z++)
{
- fprintf(stderr, DATA_PRINTF_MODIFIER, Y[x]);
+ fprintf(stderr, DATA_PRINTF_MODIFIER, Z[z]);
}
fprintf(stderr, "\n");
}
@@ -212,25 +250,79 @@ __host__ static void print_array(DATA_TYPE* Y)
*
* Parallelizing this is the goal of the assignment.
*
- * Currently to be called on the CPU (uses the `__host__` qualifier), but we may probably want to change that soon.
+ * To be called on the CPU (uses the `__host__` qualifier).
*/
-__host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
+#ifndef HPC_USE_CUDA
+__host__ static void kernel_atax(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
{
- for (unsigned int x = 0; x < NX; x++)
+ for (unsigned int x = 0; x < NY; x++)
{
DATA_TYPE tmp = 0;
- for (unsigned int y = 0; y < NY; y++)
+ for (unsigned int y = 0; y < NX; y++)
{
- tmp += A[x][y] * X[y];
+ tmp += A[a_index(x, y)] * X[y];
}
- for (unsigned int y = 0; y < NY; y++)
+ for (unsigned int y = 0; y < NX; y++)
{
- Y[y] += A[x][y] * tmp;
+ Y[x] += A[a_index(x, y)] * tmp;
}
}
}
+#endif
+
+
+/**
+ * Compute ATAX :
+ * - A is the input matrix
+ * - X is an input vector
+ * - Y is the result vector
+ *
+ * In particular:
+ * ```
+ * A * (A * X) = Y
+ * ```
+ * Wait, there's no transposition here?!?
+ *
+ * Parallelizing this is the goal of the assignment.
+ *
+ * To be called on the device as a kernel (uses the `__global__` qualifier).
+ */
+#ifdef HPC_USE_CUDA
+__global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
+{
+ // Find out how many threads there are
+ unsigned int threads = gridDim.x * blockDim.x;
+
+ // Find how many iterations should be performed by each thread
+ unsigned int perThread = NX / threads + 1;
+
+ // Find the index of the current thread, even if threads span multiple blocks
+ unsigned int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Have each thread perform the previously determined number of iterations
+ for(int stride = 0; stride < perThread; stride++)
+ {
+ unsigned int x = threads * stride + blockThreadIdx;
+
+ if(x < NX)
+ {
+ DATA_TYPE tmp = 0;
+
+ for (unsigned int y = 0; y < NX; y++)
+ {
+ tmp += A[a_index(x, y)] * X[y];
+ }
+
+ for (unsigned int y = 0; y < NX; y++)
+ {
+ atomicAdd(&Y[x], A[a_index(x, y)] * tmp);
+ }
+ }
+ }
+}
+#endif
/**
@@ -240,143 +332,180 @@ __host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
*/
__host__ int main(int argc, char** argv)
{
- debug("Starting main...");
+ print_debug("[Main] Starting...");
+ std::cerr << "[Main] NX is: " << NX << std::endl;
+ std::cerr << "[Main] NY is: " << NY << std::endl;
#ifndef HPC_USE_CUDA
- debug("[Mode] Host-only");
+ print_debug("[Mode] Host-only");
- debug("[Pointers] Allocating...");
+ print_debug("[Pointers] Allocating...");
- // A[NX][NY]
- DATA_TYPE** A = new DATA_TYPE*[NX] {};
- for(unsigned int x = 0; x < NX; x++)
- {
- A[x] = new DATA_TYPE[NY] {};
- }
+ DATA_TYPE* A = new DATA_TYPE[NX * NY];
+ DATA_TYPE* X = new DATA_TYPE[NY];
+ DATA_TYPE* Y = new DATA_TYPE[NX];
- // X[NY]
- DATA_TYPE* X = new DATA_TYPE[NY] {};
-
- // Y[NX]
- DATA_TYPE* Y = new DATA_TYPE[NX] {};
-
- debug("[Pointers] Allocated!");
+ print_debug("[Pointers] Allocated!");
#ifdef HPC_INCLUDE_INIT
- debug("[Benchmark] Starting...");
+ print_debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
- debug("[Init] Initializing...");
+ print_debug("[Init] Initializing...");
init_array(A, X, Y);
- debug("[Init] Initialized!");
+ print_debug("[Init] Initialized!");
#ifndef HPC_INCLUDE_INIT
- debug("[Benchmark] Starting...");
+ print_debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
- debug("[Kernel] Running...");
+ print_debug("[Kernel] Running...");
kernel_atax(A, X, Y);
- debug("[Kernel] Completed!");
+ print_debug("[Kernel] Completed!");
- debug("[Benchmark] Stopping...");
+ print_debug("[Benchmark] Stopping...");
polybench_stop_instruments;
polybench_print_instruments;
- debug("[Benchmark] Complete!");
+ print_debug("[Benchmark] Complete!");
- debug("[Verify] Printing...")
+ #ifdef HPC_DEBUG
+ print_debug("[Debug] Displaying A:");
+ print_array(A, NX * NY);
+ print_debug("[Debug] Displaying X:");
+ print_array(X, NY);
+ print_debug("[Debug] Displaying Y:");
+ print_array(Y, NX);
+ #endif
+
+ print_debug("[Verify] Printing...");
polybench_prevent_dce(
- print_array(Y)
+ print_array(Y, NX)
);
- debug("[Verify] Done!")
+ print_debug("[Verify] Done!");
#else
- debug("[Mode] Host-and-device, CUDA");
+ print_debug("[Mode] Host-and-device, CUDA");
- debug("[Pointers] Allocating...");
+ print_debug("[Pointers] Allocating...");
DATA_TYPE* A;
DATA_TYPE* X;
DATA_TYPE* Y;
+ DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
+ DATA_TYPE* host_X = new DATA_TYPE[NY];
+ DATA_TYPE* host_Y = new DATA_TYPE[NX];
- debug("[CUDA] Allocating A...");
- if(cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY))
+ print_debug("[CUDA] Allocating A...");
+ if(cudaError_t err = cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY))
{
- debug("[CUDA] Could not allocate A!");
+ print_cudaError(err, "[CUDA] Could not allocate A!");
return 1;
}
- debug("[CUDA] Allocated A!");
+ print_debug("[CUDA] Allocated A!");
- debug("[CUDA] Allocating X...");
- if(cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
+ print_debug("[CUDA] Allocating X...");
+ if(cudaError_t err = cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
{
- debug("[CUDA] Could not allocate X!");
+ print_cudaError(err, "[CUDA] Could not allocate X!");
return 1;
}
- debug("[CUDA] Allocated X!");
+ print_debug("[CUDA] Allocated X!");
- debug("[CUDA] Allocating Y...");
- if(cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
+ print_debug("[CUDA] Allocating Y...");
+ if(cudaError_t err = cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
{
- debug("[CUDA] Could not allocate Y!");
+ print_cudaError(err, "[CUDA] Could not allocate Y!");
return 1;
}
- debug("[CUDA] Allocated Y!");
+ print_debug("[CUDA] Allocated Y!");
#ifdef POLYBENCH_INCLUDE_INIT
- debug("[Benchmark] Starting...");
+ print_debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
- debug("[Init] Initializing...");
+ print_debug("[Init] Initializing...");
init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
- if(cudaGetLastError())
+ if(cudaError_t err = cudaGetLastError())
{
- debug("[Init] Failed to execute kernel!");
+ print_cudaError(err, "[Init] Failed to execute kernel!");
return 1;
}
- debug("[Init] Initialized!");
+ print_debug("[Init] Complete!");
#ifndef POLYBENCH_INCLUDE_INIT
- debug("[Benchmark] Starting...");
+ print_debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
- // kernel_atax_cuda<<<1, 1>>>();
+ print_debug("[Kernel] Running...");
+ kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
+ print_debug("[Kernel] Complete!");
+ print_debug("[CUDA] Copying A back...");
+ if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
+ print_cudaError(err, "[CUDA] Could copy A back!");
+ return 1;
+ };
+ print_debug("[CUDA] Copied A back!");
+
+ print_debug("[CUDA] Copying X back...");
+ if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) {
+ print_cudaError(err, "[CUDA] Could copy X back!");
+ return 1;
+ };
+ print_debug("[CUDA] Copied X back!");
+
+ print_debug("[CUDA] Copying Y back...");
+ if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {
+ print_cudaError(err, "[CUDA] Could copy Y back!");
+ return 1;
+ };
+ print_debug("[CUDA] Copied Y back!");
+
+ print_debug("[Benchmark] Stopping...");
polybench_stop_instruments;
polybench_print_instruments;
+ print_debug("[Benchmark] Complete!");
- // Y = cudaMemcpy();
-
- debug("[CUDA] Freeing A...");
- if(cudaFree(A)) {
- debug("[CUDA] Could not free A!");
+ print_debug("[CUDA] Freeing A...");
+ if(cudaError_t err = cudaFree(A)) {
+ print_cudaError(err, "[CUDA] Could not free A!");
return 1;
}
- debug("[CUDA] Freed A!");
+ print_debug("[CUDA] Freed A!");
- debug("[CUDA] Freeing X...");
- if(cudaFree(X)) {
- debug("[CUDA] Could not free X!");
+ print_debug("[CUDA] Freeing X...");
+ if(cudaError_t err = cudaFree(X)) {
+ print_cudaError(err, "[CUDA] Could not free X!");
return 1;
}
- debug("[CUDA] Freed X!");
+ print_debug("[CUDA] Freed X!");
- debug("[CUDA] Freeing Y...");
- if(cudaFree(Y)) {
- debug("[CUDA] Could not free Y!");
+ print_debug("[CUDA] Freeing Y...");
+ if(cudaError_t err = cudaFree(Y)) {
+ print_cudaError(err, "[CUDA] Could not free Y!");
return 1;
}
- debug("[CUDA] Freed Y!");
+ print_debug("[CUDA] Freed Y!");
- /*
+ #ifdef HPC_DEBUG
+ print_debug("[Debug] Displaying A:");
+ print_array(host_A, NX * NY);
+ print_debug("[Debug] Displaying X:");
+ print_array(host_X, NY);
+ print_debug("[Debug] Displaying Y:");
+ print_array(host_Y, NX);
+ #endif
+
+ print_debug("[Verify] Printing...");
polybench_prevent_dce(
- print_array(Y)
+ print_array(host_Y, NX)
);
- */
+ print_debug("[Verify] Done!");
#endif
diff --git a/atax/atax.hu b/atax/atax.hu
index 280f2a2..9ce4a68 100644
--- a/atax/atax.hu
+++ b/atax/atax.hu
@@ -3,8 +3,12 @@
#define ATAX_H
/* Default to STANDARD_DATASET. */
- #if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
- #define STANDARD_DATASET
+ #if !defined(NANO_DATASET) && !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
+ #ifdef HPC_DEBUG
+ #define NANO_DATASET
+ #else
+ #define EXTRALARGE_DATASET
+ #endif
#endif
/* Do not define anything if the user manually defines the size. */
@@ -12,6 +16,11 @@
/* Define the possible dataset sizes. */
+ #ifdef NANO_DATASET
+ #define NX 3
+ #define NY 5
+ #endif
+
#ifdef MINI_DATASET
#define NX 32
#define NY 32