Get everything to work

2025-02-18 18:13:58 +00:00 · 2022-12-02 00:15:33 +01:00 · 2022-12-02 00:15:33 +01:00 · ef30e88e01
commit ef30e88e01
parent ae73536d76
22 changed files with 404 additions and 278 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -0,0 +1,6 @@
 root = true
 [*]
 end_of_line = lf
 insert_final_newline = true
 indent_style = tab
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -1,8 +0,0 @@
 # Default ignored files
 /shelf/
 /workspace.xml
 # Editor-based HTTP Client requests
 /httpRequests/
 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml
--- a/.idea/codeStyles/Project.xml
+++ b/.idea/codeStyles/Project.xml
@ -1,40 +0,0 @@
 <component name="ProjectCodeStyleConfiguration">
  <code_scheme name="Project" version="173">
    <Objective-C>
      <option name="INDENT_CLASS_MEMBERS" value="8" />
      <option name="INDENT_VISIBILITY_KEYWORDS" value="4" />
      <option name="INDENT_PREPROCESSOR_DIRECTIVE" value="4" />
      <option name="INDENT_DIRECTIVE_AS_CODE" value="true" />
      <option name="SPACE_BEFORE_INIT_LIST" value="true" />
      <option name="SPACE_BEFORE_POINTER_IN_DECLARATION" value="false" />
      <option name="SPACE_AFTER_POINTER_IN_DECLARATION" value="true" />
      <option name="SPACE_BEFORE_REFERENCE_IN_DECLARATION" value="false" />
      <option name="SPACE_AFTER_REFERENCE_IN_DECLARATION" value="true" />
      <option name="DISCHARGED_SHORT_TERNARY_OPERATOR" value="true" />
    </Objective-C>
    <clangFormatSettings>
      <option name="ENABLED" value="true" />
    </clangFormatSettings>
    <files>
      <extensions>
        <pair source="cpp" header="hpp" fileNamingConvention="NONE" />
        <pair source="c" header="h" fileNamingConvention="NONE" />
        <pair source="cu" header="cuh" fileNamingConvention="NONE" />
      </extensions>
    </files>
    <codeStyleSettings language="Markdown">
      <indentOptions>
        <option name="INDENT_SIZE" value="3" />
        <option name="TAB_SIZE" value="3" />
      </indentOptions>
    </codeStyleSettings>
    <codeStyleSettings language="ObjectiveC">
      <indentOptions>
        <option name="INDENT_SIZE" value="2" />
        <option name="TAB_SIZE" value="2" />
        <option name="SMART_TABS" value="true" />
        <option name="KEEP_INDENTS_ON_EMPTY_LINES" value="true" />
      </indentOptions>
    </codeStyleSettings>
  </code_scheme>
 </component>
--- a/.idea/codeStyles/codeStyleConfig.xml
+++ b/.idea/codeStyles/codeStyleConfig.xml
@ -1,5 +0,0 @@
 <component name="ProjectCodeStyleConfiguration">
  <state>
    <option name="USE_PER_PROJECT_SETTINGS" value="true" />
  </state>
 </component>
--- a/.idea/customTargets.xml
+++ b/.idea/customTargets.xml
@ -1,15 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="CLionExternalBuildManager">
    <target id="4d1b5109-3338-4779-bba7-8def1c68abbb" name="All" defaultType="MAKE">
      <configuration id="c01c91f7-5730-4713-b432-50125cbe22a7" name="All">
        <build type="MAKE">
          <make targetName="all" />
        </build>
        <clean type="MAKE">
          <make targetName="clean" />
        </clean>
      </configuration>
    </target>
  </component>
 </project>
--- a/.idea/discord.xml
+++ b/.idea/discord.xml
@ -1,7 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="DiscordProjectSettings">
    <option name="show" value="PROJECT" />
    <option name="description" value="" />
  </component>
 </project>
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@ -1,11 +0,0 @@
 <component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
    <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
      <option name="processCode" value="true" />
      <option name="processLiterals" value="true" />
      <option name="processComments" value="true" />
    </inspection_tool>
  </profile>
 </component>
--- a/.idea/makefile.xml
+++ b/.idea/makefile.xml
@ -1,25 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="MakefileBuildTargetsManager">
    <user-build-targets>
      <build-target name="all">
        <build-configurations>
          <build-configuration>
            <make-targets>
              <make-target>all</make-target>
            </make-targets>
          </build-configuration>
        </build-configurations>
      </build-target>
      <build-target name="clean">
        <build-configurations>
          <build-configuration>
            <make-targets>
              <make-target>clean</make-target>
            </make-targets>
          </build-configuration>
        </build-configurations>
      </build-target>
    </user-build-targets>
  </component>
 </project>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,20 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ExternalStorageConfigurationManager" enabled="true" />
  <component name="MakefileSettings">
    <option name="linkedExternalProjectsSettings">
      <MakefileProjectSettings>
        <option name="externalProjectPath" value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
        <option name="modules">
          <set>
            <option value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
          </set>
        </option>
        <option name="version" value="2" />
      </MakefileProjectSettings>
    </option>
  </component>
  <component name="MakefileWorkspace" PROJECT_DIR="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax">
    <contentRoot DIR="$PROJECT_DIR$" />
  </component>
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -1,8 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/unimore-hpc-1.iml" filepath="$PROJECT_DIR$/.idea/unimore-hpc-1.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/runConfigurations/atax_acc.xml
+++ b/.idea/runConfigurations/atax_acc.xml
@ -1,7 +0,0 @@
 <component name="ProjectRunConfigurationManager">
  <configuration default="false" name="atax_acc" type="CLionNativeAppRunConfigurationType" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" WORKING_DIR="file://$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" PASS_PARENT_ENVS_2="true" PROJECT_NAME="unimore-hpc-1" TARGET_NAME="all" CONFIG_NAME="all" version="1" RUN_PATH="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax/atax_acc">
    <method v="2">
      <option name="CLION.COMPOUND.BUILD" enabled="true" />
    </method>
  </configuration>
 </component>
--- a/.idea/unimore-hpc-1.iml
+++ b/.idea/unimore-hpc-1.iml
@ -1,9 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/unimore-hpc-12.iml
+++ b/.idea/unimore-hpc-12.iml
@ -1,2 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module classpath="External" type="CPP_MODULE" version="4" />
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -1,6 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
 </project>
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@ -0,0 +1,106 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="AutoImportSettings">
    <option name="autoReloadType" value="SELECTIVE" />
  </component>
  <component name="CMakePresetLoader"><![CDATA[{
  "useNewFormat": true
 }]]></component>
  <component name="CMakeRunConfigurationManager">
    <generated>
      <config projectName="unimore-hpc-assignments" targetName="atax.elf" />
    </generated>
  </component>
  <component name="CMakeSettings">
    <configurations />
  </component>
  <component name="ChangeListManager">
    <list default="true" id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="">
      <change beforePath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/.vscode/settings.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/settings.json" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/atax/.bench.sh" beforeDir="false" afterPath="$PROJECT_DIR$/atax/.bench.sh" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/atax/Makefile" beforeDir="false" afterPath="$PROJECT_DIR$/atax/Makefile" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/atax/atax.cu" beforeDir="false" afterPath="$PROJECT_DIR$/atax/atax.cu" afterDir="false" />
    </list>
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
    <option name="LAST_RESOLUTION" value="IGNORE" />
  </component>
  <component name="Git.Settings">
    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
  </component>
  <component name="MakefileLocalSettings">
    <option name="projectSyncType">
      <map>
        <entry key="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" value="RE_IMPORT" />
      </map>
    </option>
  </component>
  <component name="MarkdownSettingsMigration">
    <option name="stateVersion" value="1" />
  </component>
  <component name="ProjectId" id="2IKa1Pp6YCa8Ycz7UwOe8DKGQjU" />
  <component name="ProjectViewState">
    <option name="hideEmptyMiddlePackages" value="true" />
    <option name="showLibraryContents" value="true" />
  </component>
  <component name="PropertiesComponent"><![CDATA[{
  "keyToString": {
    "RunOnceActivity.OpenProjectViewOnStart": "true",
    "RunOnceActivity.ShowReadmeOnStart": "true",
    "RunOnceActivity.cidr.known.project.marker": "true",
    "SHARE_PROJECT_CONFIGURATION_FILES": "true",
    "WebServerToolWindowFactoryState": "false",
    "cf.first.check.clang-format": "false",
    "cidr.known.project.marker": "true",
    "last_opened_file_path": "/home/steffo/Workspaces/Steffo99/unimore-hpc-assignments/atax",
    "nodejs_package_manager_path": "npm",
    "settings.editor.selected.configurable": "MakefileSettings"
  },
  "keyToStringList": {
    "GitStage.ChangesTree.GroupingKeys": [
      "directory",
      "module",
      "repository"
    ]
  }
 }]]></component>
  <component name="RunManager">
    <configuration default="true" type="CLionExternalRunConfiguration" factoryName="Application" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" PASS_PARENT_ENVS_2="true">
      <method v="2">
        <option name="CLION.EXTERNAL.BUILD" enabled="true" />
      </method>
    </configuration>
  </component>
  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
  <component name="TaskManager">
    <task active="true" id="Default" summary="Default task">
      <changelist id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="" />
      <created>1669932513703</created>
      <option name="number" value="Default" />
      <option name="presentableId" value="Default" />
      <updated>1669932513703</updated>
      <workItem from="1669932517554" duration="325000" />
    </task>
    <servers />
  </component>
  <component name="TypeScriptGeneratedFilesManager">
    <option name="version" value="3" />
  </component>
  <component name="Vcs.Log.Tabs.Properties">
    <option name="TAB_STATES">
      <map>
        <entry key="MAIN">
          <value>
            <State />
          </value>
        </entry>
      </map>
    </option>
  </component>
  <component name="XSLT-Support.FileAssociations.UIState">
    <expand />
    <select />
  </component>
 </project>
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@ -6,7 +6,7 @@
                "${workspaceFolder}/**"
            ],
            "defines": [],
-            "compilerPath": "/usr/local/cuda-10.0/bin/nvcc",
+            "compilerPath": "/opt/cuda/bin/nvcc",
            "cStandard": "c11",
            "cppStandard": "c++14",
            "configurationProvider": "ms-vscode.makefile-tools"
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -15,6 +15,7 @@
            "environment": [],
            "externalConsole": false,
            "MIMode": "gdb",
            "miDebuggerPath": "/usr/bin/gdb",
            "setupCommands": [
                {
                    "description": "Enable pretty-printing for gdb",
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -1,5 +1,39 @@
 {
    "files.associations": {
-        "*.hu": "cuda-cpp"
+        "*.hu": "cuda-cpp",
        "array": "cpp",
        "*.tcc": "cpp",
        "cctype": "cpp",
        "clocale": "cpp",
        "cmath": "cpp",
        "compare": "cpp",
        "concepts": "cpp",
        "cstdarg": "cpp",
        "cstdint": "cpp",
        "cstdio": "cpp",
        "cstdlib": "cpp",
        "cwchar": "cpp",
        "cwctype": "cpp",
        "unordered_map": "cpp",
        "vector": "cpp",
        "exception": "cpp",
        "functional": "cpp",
        "initializer_list": "cpp",
        "iosfwd": "cpp",
        "iostream": "cpp",
        "istream": "cpp",
        "limits": "cpp",
        "new": "cpp",
        "numbers": "cpp",
        "ostream": "cpp",
        "stdexcept": "cpp",
        "streambuf": "cpp",
        "string": "cpp",
        "string_view": "cpp",
        "system_error": "cpp",
        "tuple": "cpp",
        "type_traits": "cpp",
        "typeinfo": "cpp",
        "utility": "cpp"
    }
 }
--- a/atax/.bench.sh
+++ b/atax/.bench.sh
@ -1,12 +1,12 @@
 #!/bin/bash
 run_benchmarks() {
-    runs=25
+    runs=3
    totalt=0.0
    for i in $(seq $runs)
    do
-        exet=$(./atax.elf)
+        exet=$(./atax.elf 2> /dev/null)
        totalt=$(awk "BEGIN{print $totalt+$exet}")
        echo -n "."
        # echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
@ -16,9 +16,9 @@ run_benchmarks() {
    echo "  Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
 }
-for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
+for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET
 do
-    for c in $(seq 0 7)
+    for c in $(seq 0 3)
    do
        cxxflags="-D$dataset"
@ -32,12 +32,8 @@ do
            cxxflags="$cxxflags -DHPC_USE_CUDA"
        fi
        if (( $c & 2 ))
        then
            cxxflags="$cxxflags -DHPC_USE_STRIDE"
        fi
        echo "Flags: $cxxflags"
        make --silent "clean"
        make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
        run_benchmarks
--- a/atax/Makefile
+++ b/atax/Makefile
@ -5,16 +5,25 @@ MAKEFLAGS+= -r
 CXXFLAGS+= -DPOLYBENCH_TIME
 # -O3 applies all compiler optimization, improving from 800ms to 300ms
 CXXFLAGS+= -O3
 # Enable this to view the contents of the arrays
 CXXFLAGS+= -DHPC_DEBUG
 # Enable this to use CUDA
 CXXFLAGS+= -DHPC_USE_CUDA
 # Extend CFLAGS with command line parameters
 CXXFLAGS+= ${EXTRA_CXXFLAGS}
 # Select the location of the local CUDA install
-CUDA_HOME:=/usr/local/cuda-10.0
+# CUDA_HOME:=/usr/local/cuda-10.0
 CUDA_HOME:=/opt/cuda
 # Specify the directory of the nvc compiler
 NVCC:=$(CUDA_HOME)/bin/nvcc
 # Specify the flags for the nvc compiler
 NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
 # Optimize for @Steffo's NVIDIA GTX 1070
 NVCFLAGS+= -arch=compute_61
 NVCFLAGS+= -code=sm_61
 %.elf: %.cu.o polybench.cu.o
 	$(NVCC) $(NVCFLAGS) $^ -o $@ $(LDFLAGS)
@ -23,13 +32,12 @@ NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
 	$(NVCC) $(NVCFLAGS) -c $< -o $@
-.PHONY: bench clean dev
+all: atax.elf
-dev: atax.elf
+.PHONY: bench clean
 	./atax.elf
 bench:
 	./.bench.sh
 clean:
-	rm *.elf *.cu.o
+	rm *.elf
--- a/atax/atax.cu
+++ b/atax/atax.cu
@ -3,6 +3,7 @@
 #include <string.h>
 #include <math.h>
 #include <iostream>
 #include <string>
 /* Include polybench common header. */
 #include "polybench.hu"
@ -23,16 +24,34 @@
 	#define CUDA_NTHREADS 128
 #endif
 // Enable syntax highlighting for the CUDA mode
 // TODO: Remove this, as it will be set by .bench.sh
 #define HPC_USE_CUDA
-// Enable syntax highlighting for the stride mode
+/**
-// TODO: Remove this, as it will be set by .bench.sh
+ * Given a `x` and a `y`, compute the relative index of the element in the `A` matrix.
-#define HPC_USE_STRIDE
+ */
 __host__ __device__ inline static unsigned int a_index(unsigned int x, unsigned int y) {
 	return x * NY + y;
 }
-// Create macro for debug logging
+/**
-#define debug(txt) std::cerr << txt << std::endl
+ * Log a debug message.
 */
 __host__ inline static void print_debug(std::string txt) {
 	#ifdef HPC_DEBUG
 		std::cerr << txt << std::endl;
 	#endif
 }
 /**
 * Log an error message.
 */
 #ifdef HPC_USE_CUDA
 __host__ inline static void print_cudaError(cudaError_t err, std::string txt) {
 	#ifdef HPC_DEBUG
 		std::cerr << txt;
 		fprintf( stderr, ": error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(err) );
 	#endif
 }
 #endif
 /**
@ -45,7 +64,7 @@
 * To be called on the CPU (uses the `__host__` qualifier).
 */
 #ifndef HPC_USE_CUDA
-__host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
+__host__ static void init_array(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
 {
 	/* X = [ 3.14, 6.28, 9.42, ... ] */
 	for (unsigned int y = 0; y < NY; y++) 
@ -72,7 +91,7 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
 	{
 		for (unsigned int y = 0; y < NY; y++) 
 		{
-			A[x][y] = (DATA_TYPE)(x * (y + 1)) / NX;
+			A[a_index(x, y)] = (DATA_TYPE)(x * (y + 1)) / NX;
 		}
 	}
 }
@ -87,20 +106,21 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
 __device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
 {
 	// Find how many iterations should be performed by each thread
-	unsigned int perThread = NY / threads;
+	unsigned int perThread = NY / threads + 1;
 	// Find the index of the current thread, even if threads span multiple blocks
 	int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
 	// Have each thread perform the previously determined number of iterations
-	for(int stride = 0; stride < perThread; stride++) {
+	for(int stride = 0; stride < perThread; stride++)
 	{
 		// Find the index of the current iteration
 		// This is equal to `y` of the init_array function
-		int iterationIdx = blockThreadIdx * stride;
+		unsigned int iterationIdx = threads * stride + blockThreadIdx;
 		// Prevent the thread from accessing unallocated memory
-		if(iterationIdx < NY) {
+		if(iterationIdx < NY)
-
+		{
 			// Set the array element
 			X[iterationIdx] = iterationIdx * M_PI;
 		}
@ -117,20 +137,21 @@ __device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
 __device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
 {
 	// Find how many iterations should be performed by each thread
-	unsigned int perThread = NX / threads;
+	unsigned int perThread = NX / threads + 1;
 	// Find the index of the current thread, even if threads span multiple blocks
 	int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
 	// Have each thread perform the previously determined number of iterations
-	for(int stride = 0; stride < perThread; stride++) {
+	for(int stride = 0; stride < perThread; stride++) 
 	{
 		// Find the index of the current iteration
 		// This is equal to `y` of the init_array function
-		int iterationIdx = blockThreadIdx * stride;
+		unsigned int iterationIdx = threads * stride + blockThreadIdx;
 		// Prevent the thread from accessing unallocated memory
-		if(iterationIdx < NX) {
+		if(iterationIdx < NX) 
-
+		{
 			// Set the array element
 			Y[iterationIdx] = 0;
 		}
@ -150,12 +171,29 @@ __device__ static void init_array_cuda_a(DATA_TYPE* A, unsigned int threads)
 	unsigned int elements = NX * NY;
 	// Find how many iterations should be performed by each thread
-	unsigned int perThread = elements / threads;
+	unsigned int perThread = elements / threads + 1;
 	// Find the index of the current thread, even if threads span multiple blocks
 	int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
-	/* TODO */
+	// Have each thread perform the previously determined number of iterations
 	for(int stride = 0; stride < perThread; stride++) 
 	{
 		// Find the index of the current iteration
 		// This is equal to `y` of the init_array function
 		unsigned int iterationIdx = threads * stride + blockThreadIdx;
 		// Determine current x and y
 		unsigned int y = iterationIdx % NY;
 		unsigned int x = iterationIdx / NY;
 		// Prevent the thread from accessing unallocated memory
 		if(iterationIdx < elements) 
 		{
 			// Set the array element
 			A[iterationIdx] = (DATA_TYPE)(x * (y + 1)) / NX;
 		}
 	}
 }
 #endif
@ -188,11 +226,11 @@ __global__ static void init_array_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
 * 
 * To be called on the CPU (uses the `__host__` qualifier).
 */
-__host__ static void print_array(DATA_TYPE* Y)
+__host__ static void print_array(DATA_TYPE* Z, unsigned int size)
 {
-	for (unsigned int x = 0; x < NX; x++) 
+	for (unsigned int z = 0; z < size; z++) 
 	{
-		fprintf(stderr, DATA_PRINTF_MODIFIER, Y[x]);
+		fprintf(stderr, DATA_PRINTF_MODIFIER, Z[z]);
 	}
 	fprintf(stderr, "\n");
 }
@ -212,25 +250,79 @@ __host__ static void print_array(DATA_TYPE* Y)
 * 
 * Parallelizing this is the goal of the assignment.
 * 
- * Currently to be called on the CPU (uses the `__host__` qualifier), but we may probably want to change that soon.
+ * To be called on the CPU (uses the `__host__` qualifier).
 */
-__host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
+#ifndef HPC_USE_CUDA
 __host__ static void kernel_atax(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
 {
-	for (unsigned int x = 0; x < NX; x++) 
+	for (unsigned int x = 0; x < NY; x++) 
 	{
 		DATA_TYPE tmp = 0;
-		for (unsigned int y = 0; y < NY; y++) 
+		for (unsigned int y = 0; y < NX; y++) 
 		{
-			tmp += A[x][y] * X[y];
+			tmp += A[a_index(x, y)] * X[y];
 		}
-		for (unsigned int y = 0; y < NY; y++) 
+		for (unsigned int y = 0; y < NX; y++) 
 		{
-			Y[y] += A[x][y] * tmp;
+			Y[x] += A[a_index(x, y)] * tmp;
 		}
 	}
 }
 #endif
 /**
 * Compute ATAX :
 * - A is the input matrix
 * - X is an input vector
 * - Y is the result vector
 * 
 * In particular:
 * ```
 * A * (A * X) = Y
 * ```
 * Wait, there's no transposition here?!?
 * 
 * Parallelizing this is the goal of the assignment.
 * 
 * To be called on the device as a kernel (uses the `__global__` qualifier).
 */
 #ifdef HPC_USE_CUDA
 __global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
 {
 	// Find out how many threads there are
 	unsigned int threads = gridDim.x * blockDim.x;
 	// Find how many iterations should be performed by each thread
 	unsigned int perThread = NX / threads + 1;
 	// Find the index of the current thread, even if threads span multiple blocks
 	unsigned int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
 	// Have each thread perform the previously determined number of iterations
 	for(int stride = 0; stride < perThread; stride++) 
 	{
 		unsigned int x = threads * stride + blockThreadIdx;
 		if(x < NX) 
 		{
 			DATA_TYPE tmp = 0;
 			for (unsigned int y = 0; y < NX; y++) 
 			{
 				tmp += A[a_index(x, y)] * X[y];
 			}
 			for (unsigned int y = 0; y < NX; y++) 
 			{
 				atomicAdd(&Y[x], A[a_index(x, y)] * tmp);
 			}
 		}
 	}
 }
 #endif
 /**
@ -240,143 +332,180 @@ __host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
 */
 __host__ int main(int argc, char** argv)
 {
-	debug("Starting main...");
+	print_debug("[Main] Starting...");
 	std::cerr << "[Main] NX is: " << NX << std::endl;
 	std::cerr << "[Main] NY is: " << NY << std::endl;
 	#ifndef HPC_USE_CUDA
-		debug("[Mode] Host-only");
+		print_debug("[Mode] Host-only");
-		debug("[Pointers] Allocating...");
+		print_debug("[Pointers] Allocating...");
-		// A[NX][NY]
+		DATA_TYPE* A = new DATA_TYPE[NX * NY];
-		DATA_TYPE** A = new DATA_TYPE*[NX] {};
+		DATA_TYPE* X = new DATA_TYPE[NY];
-		for(unsigned int x = 0; x < NX; x++)
+		DATA_TYPE* Y = new DATA_TYPE[NX];
 		{
 			A[x] = new DATA_TYPE[NY] {};
 		}
-		// X[NY]
+		print_debug("[Pointers] Allocated!");
 		DATA_TYPE* X = new DATA_TYPE[NY] {};
 		// Y[NX]
 		DATA_TYPE* Y = new DATA_TYPE[NX] {};
 		debug("[Pointers] Allocated!");
 		#ifdef HPC_INCLUDE_INIT
-			debug("[Benchmark] Starting...");
+			print_debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif
-		debug("[Init] Initializing...");
+		print_debug("[Init] Initializing...");
 		init_array(A, X, Y);
-		debug("[Init] Initialized!");
+		print_debug("[Init] Initialized!");
 		#ifndef HPC_INCLUDE_INIT
-			debug("[Benchmark] Starting...");
+			print_debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif
-		debug("[Kernel] Running...");
+		print_debug("[Kernel] Running...");
 		kernel_atax(A, X, Y);
-		debug("[Kernel] Completed!");
+		print_debug("[Kernel] Completed!");
-		debug("[Benchmark] Stopping...");
+		print_debug("[Benchmark] Stopping...");
 		polybench_stop_instruments;
 		polybench_print_instruments;
-		debug("[Benchmark] Complete!");
+		print_debug("[Benchmark] Complete!");
-		debug("[Verify] Printing...")
+		#ifdef HPC_DEBUG
 			print_debug("[Debug] Displaying A:");
 			print_array(A, NX * NY);
 			print_debug("[Debug] Displaying X:");
 			print_array(X, NY);
 			print_debug("[Debug] Displaying Y:");
 			print_array(Y, NX);
 		#endif
 		print_debug("[Verify] Printing...");
 		polybench_prevent_dce(
-			print_array(Y)
+			print_array(Y, NX)
 		);
-		debug("[Verify] Done!")
+		print_debug("[Verify] Done!");
 	#else
-		debug("[Mode] Host-and-device, CUDA");
+		print_debug("[Mode] Host-and-device, CUDA");
-		debug("[Pointers] Allocating...");
+		print_debug("[Pointers] Allocating...");
 		DATA_TYPE* A;
 		DATA_TYPE* X;
 		DATA_TYPE* Y;
 		DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
 		DATA_TYPE* host_X = new DATA_TYPE[NY];
 		DATA_TYPE* host_Y = new DATA_TYPE[NX];
-		debug("[CUDA] Allocating A...");
+		print_debug("[CUDA] Allocating A...");
-		if(cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY)) 
+		if(cudaError_t err = cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY)) 
 		{
-			debug("[CUDA] Could not allocate A!");
+			print_cudaError(err, "[CUDA] Could not allocate A!");
 			return 1;
 		}
-		debug("[CUDA] Allocated A!");
+		print_debug("[CUDA] Allocated A!");
-		debug("[CUDA] Allocating X...");
+		print_debug("[CUDA] Allocating X...");
-		if(cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
+		if(cudaError_t err = cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
 		{
-			debug("[CUDA] Could not allocate X!");
+			print_cudaError(err, "[CUDA] Could not allocate X!");
 			return 1;
 		}
-		debug("[CUDA] Allocated X!");
+		print_debug("[CUDA] Allocated X!");
-		debug("[CUDA] Allocating Y...");
+		print_debug("[CUDA] Allocating Y...");
-		if(cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
+		if(cudaError_t err = cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
 		{
-			debug("[CUDA] Could not allocate Y!");
+			print_cudaError(err, "[CUDA] Could not allocate Y!");
 			return 1;
 		}
-		debug("[CUDA] Allocated Y!");
+		print_debug("[CUDA] Allocated Y!");
 		#ifdef POLYBENCH_INCLUDE_INIT
-			debug("[Benchmark] Starting...");
+			print_debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif
-		debug("[Init] Initializing...");
+		print_debug("[Init] Initializing...");
 		init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
-		if(cudaGetLastError())
+		if(cudaError_t err = cudaGetLastError())
 		{
-			debug("[Init] Failed to execute kernel!");
+			print_cudaError(err, "[Init] Failed to execute kernel!");
 			return 1;
 		}
-		debug("[Init] Initialized!");
+		print_debug("[Init] Complete!");
 		#ifndef POLYBENCH_INCLUDE_INIT
-			debug("[Benchmark] Starting...");
+			print_debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif
-		// kernel_atax_cuda<<<1, 1>>>();
+		print_debug("[Kernel] Running...");
 		kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
 		print_debug("[Kernel] Complete!");
 		print_debug("[CUDA] Copying A back...");
 		if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
 			print_cudaError(err, "[CUDA] Could copy A back!");
 			return 1;
 		};
 		print_debug("[CUDA] Copied A back!");
 		print_debug("[CUDA] Copying X back...");
 		if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) {
 			print_cudaError(err, "[CUDA] Could copy X back!");
 			return 1;
 		};
 		print_debug("[CUDA] Copied X back!");
 		print_debug("[CUDA] Copying Y back...");
 		if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {
 			print_cudaError(err, "[CUDA] Could copy Y back!");
 			return 1;
 		};
 		print_debug("[CUDA] Copied Y back!");
 		print_debug("[Benchmark] Stopping...");
 		polybench_stop_instruments;
 		polybench_print_instruments;
 		print_debug("[Benchmark] Complete!");
-		// Y = cudaMemcpy();
+		print_debug("[CUDA] Freeing A...");
-
+		if(cudaError_t err = cudaFree(A)) {
-		debug("[CUDA] Freeing A...");
+			print_cudaError(err, "[CUDA] Could not free A!");
 		if(cudaFree(A)) {
 			debug("[CUDA] Could not free A!");
 			return 1;
 		}
-		debug("[CUDA] Freed A!");
+		print_debug("[CUDA] Freed A!");
-		debug("[CUDA] Freeing X...");
+		print_debug("[CUDA] Freeing X...");
-		if(cudaFree(X)) {
+		if(cudaError_t err = cudaFree(X)) {
-			debug("[CUDA] Could not free X!");
+			print_cudaError(err, "[CUDA] Could not free X!");
 			return 1;
 		}
-		debug("[CUDA] Freed X!");
+		print_debug("[CUDA] Freed X!");
-		debug("[CUDA] Freeing Y...");
+		print_debug("[CUDA] Freeing Y...");
-		if(cudaFree(Y)) {
+		if(cudaError_t err = cudaFree(Y)) {
-			debug("[CUDA] Could not free Y!");
+			print_cudaError(err, "[CUDA] Could not free Y!");
 			return 1;
 		}
-		debug("[CUDA] Freed Y!");
+		print_debug("[CUDA] Freed Y!");
-		/*
+		#ifdef HPC_DEBUG
 			print_debug("[Debug] Displaying A:");
 			print_array(host_A, NX * NY);
 			print_debug("[Debug] Displaying X:");
 			print_array(host_X, NY);
 			print_debug("[Debug] Displaying Y:");
 			print_array(host_Y, NX);
 		#endif
 		print_debug("[Verify] Printing...");
 		polybench_prevent_dce(
-			print_array(Y)
+			print_array(host_Y, NX)
 		);
-		*/
+		print_debug("[Verify] Done!");
 	#endif
--- a/atax/atax.hu
+++ b/atax/atax.hu
@ -3,8 +3,12 @@
 	#define ATAX_H
 	/* Default to STANDARD_DATASET. */
-	#if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
+	#if !defined(NANO_DATASET) && !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
-		#define STANDARD_DATASET
+		#ifdef HPC_DEBUG
 			#define NANO_DATASET
 		#else
 			#define EXTRALARGE_DATASET
 		#endif
 	#endif
 	/* Do not define anything if the user manually defines the size. */
@ -12,6 +16,11 @@
 		/* Define the possible dataset sizes. */
 		#ifdef NANO_DATASET
 			#define NX 3
 			#define NY 5
 		#endif
 		#ifdef MINI_DATASET
 			#define NX 32
 			#define NY 32
		`@ -1,2 +0,0 @@`
			`<?xml version="1.0" encoding="UTF-8"?>`
			`<module classpath="External" type="CPP_MODULE" version="4" />`