Get everything to work

2025-02-18 18:13:58 +00:00 · 2022-12-02 00:15:33 +01:00 · 2022-12-02 00:15:33 +01:00 · ef30e88e01
commit ef30e88e01
parent ae73536d76
22 changed files with 404 additions and 278 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -0,0 +1,6 @@
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+indent_style = tab
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -1,8 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Editor-based HTTP Client requests
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
--- a/.idea/codeStyles/Project.xml
+++ b/.idea/codeStyles/Project.xml
@ -1,40 +0,0 @@
-<component name="ProjectCodeStyleConfiguration">
-  <code_scheme name="Project" version="173">
-    <Objective-C>
-      <option name="INDENT_CLASS_MEMBERS" value="8" />
-      <option name="INDENT_VISIBILITY_KEYWORDS" value="4" />
-      <option name="INDENT_PREPROCESSOR_DIRECTIVE" value="4" />
-      <option name="INDENT_DIRECTIVE_AS_CODE" value="true" />
-      <option name="SPACE_BEFORE_INIT_LIST" value="true" />
-      <option name="SPACE_BEFORE_POINTER_IN_DECLARATION" value="false" />
-      <option name="SPACE_AFTER_POINTER_IN_DECLARATION" value="true" />
-      <option name="SPACE_BEFORE_REFERENCE_IN_DECLARATION" value="false" />
-      <option name="SPACE_AFTER_REFERENCE_IN_DECLARATION" value="true" />
-      <option name="DISCHARGED_SHORT_TERNARY_OPERATOR" value="true" />
-    </Objective-C>
-    <clangFormatSettings>
-      <option name="ENABLED" value="true" />
-    </clangFormatSettings>
-    <files>
-      <extensions>
-        <pair source="cpp" header="hpp" fileNamingConvention="NONE" />
-        <pair source="c" header="h" fileNamingConvention="NONE" />
-        <pair source="cu" header="cuh" fileNamingConvention="NONE" />
-      </extensions>
-    </files>
-    <codeStyleSettings language="Markdown">
-      <indentOptions>
-        <option name="INDENT_SIZE" value="3" />
-        <option name="TAB_SIZE" value="3" />
-      </indentOptions>
-    </codeStyleSettings>
-    <codeStyleSettings language="ObjectiveC">
-      <indentOptions>
-        <option name="INDENT_SIZE" value="2" />
-        <option name="TAB_SIZE" value="2" />
-        <option name="SMART_TABS" value="true" />
-        <option name="KEEP_INDENTS_ON_EMPTY_LINES" value="true" />
-      </indentOptions>
-    </codeStyleSettings>
-  </code_scheme>
-</component>
--- a/.idea/codeStyles/codeStyleConfig.xml
+++ b/.idea/codeStyles/codeStyleConfig.xml
@ -1,5 +0,0 @@
-<component name="ProjectCodeStyleConfiguration">
-  <state>
-    <option name="USE_PER_PROJECT_SETTINGS" value="true" />
-  </state>
-</component>
--- a/.idea/customTargets.xml
+++ b/.idea/customTargets.xml
@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="CLionExternalBuildManager">
-    <target id="4d1b5109-3338-4779-bba7-8def1c68abbb" name="All" defaultType="MAKE">
-      <configuration id="c01c91f7-5730-4713-b432-50125cbe22a7" name="All">
-        <build type="MAKE">
-          <make targetName="all" />
-        </build>
-        <clean type="MAKE">
-          <make targetName="clean" />
-        </clean>
-      </configuration>
-    </target>
-  </component>
-</project>
--- a/.idea/discord.xml
+++ b/.idea/discord.xml
@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="DiscordProjectSettings">
-    <option name="show" value="PROJECT" />
-    <option name="description" value="" />
-  </component>
-</project>
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@ -1,11 +0,0 @@
-<component name="InspectionProjectProfileManager">
-  <profile version="1.0">
-    <option name="myName" value="Project Default" />
-    <inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
-    <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
-      <option name="processCode" value="true" />
-      <option name="processLiterals" value="true" />
-      <option name="processComments" value="true" />
-    </inspection_tool>
-  </profile>
-</component>
--- a/.idea/makefile.xml
+++ b/.idea/makefile.xml
@ -1,25 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="MakefileBuildTargetsManager">
-    <user-build-targets>
-      <build-target name="all">
-        <build-configurations>
-          <build-configuration>
-            <make-targets>
-              <make-target>all</make-target>
-            </make-targets>
-          </build-configuration>
-        </build-configurations>
-      </build-target>
-      <build-target name="clean">
-        <build-configurations>
-          <build-configuration>
-            <make-targets>
-              <make-target>clean</make-target>
-            </make-targets>
-          </build-configuration>
-        </build-configurations>
-      </build-target>
-    </user-build-targets>
-  </component>
-</project>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ExternalStorageConfigurationManager" enabled="true" />
-  <component name="MakefileSettings">
-    <option name="linkedExternalProjectsSettings">
-      <MakefileProjectSettings>
-        <option name="externalProjectPath" value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
-        <option name="modules">
-          <set>
-            <option value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
-          </set>
-        </option>
-        <option name="version" value="2" />
-      </MakefileProjectSettings>
-    </option>
-  </component>
-  <component name="MakefileWorkspace" PROJECT_DIR="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax">
-    <contentRoot DIR="$PROJECT_DIR$" />
-  </component>
-</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectModuleManager">
-    <modules>
-      <module fileurl="file://$PROJECT_DIR$/.idea/unimore-hpc-1.iml" filepath="$PROJECT_DIR$/.idea/unimore-hpc-1.iml" />
-    </modules>
-  </component>
-</project>
--- a/.idea/runConfigurations/atax_acc.xml
+++ b/.idea/runConfigurations/atax_acc.xml
@ -1,7 +0,0 @@
-<component name="ProjectRunConfigurationManager">
-  <configuration default="false" name="atax_acc" type="CLionNativeAppRunConfigurationType" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" WORKING_DIR="file://$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" PASS_PARENT_ENVS_2="true" PROJECT_NAME="unimore-hpc-1" TARGET_NAME="all" CONFIG_NAME="all" version="1" RUN_PATH="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax/atax_acc">
-    <method v="2">
-      <option name="CLION.COMPOUND.BUILD" enabled="true" />
-    </method>
-  </configuration>
-</component>
--- a/.idea/unimore-hpc-1.iml
+++ b/.idea/unimore-hpc-1.iml
@ -1,9 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<module type="JAVA_MODULE" version="4">
-  <component name="NewModuleRootManager" inherit-compiler-output="true">
-    <exclude-output />
-    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="inheritedJdk" />
-    <orderEntry type="sourceFolder" forTests="false" />
-  </component>
-</module>
--- a/.idea/unimore-hpc-12.iml
+++ b/.idea/unimore-hpc-12.iml
@ -1,2 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<module classpath="External" type="CPP_MODULE" version="4" />
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="VcsDirectoryMappings">
-    <mapping directory="" vcs="Git" />
-  </component>
-</project>
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="AutoImportSettings">
+    <option name="autoReloadType" value="SELECTIVE" />
+  </component>
+  <component name="CMakePresetLoader"><![CDATA[{
+  "useNewFormat": true
+}]]></component>
+  <component name="CMakeRunConfigurationManager">
+    <generated>
+      <config projectName="unimore-hpc-assignments" targetName="atax.elf" />
+    </generated>
+  </component>
+  <component name="CMakeSettings">
+    <configurations />
+  </component>
+  <component name="ChangeListManager">
+    <list default="true" id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="">
+      <change beforePath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/.vscode/settings.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/settings.json" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/atax/.bench.sh" beforeDir="false" afterPath="$PROJECT_DIR$/atax/.bench.sh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/atax/Makefile" beforeDir="false" afterPath="$PROJECT_DIR$/atax/Makefile" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/atax/atax.cu" beforeDir="false" afterPath="$PROJECT_DIR$/atax/atax.cu" afterDir="false" />
+    </list>
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="MakefileLocalSettings">
+    <option name="projectSyncType">
+      <map>
+        <entry key="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" value="RE_IMPORT" />
+      </map>
+    </option>
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectId" id="2IKa1Pp6YCa8Ycz7UwOe8DKGQjU" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "RunOnceActivity.OpenProjectViewOnStart": "true",
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "RunOnceActivity.cidr.known.project.marker": "true",
+    "SHARE_PROJECT_CONFIGURATION_FILES": "true",
+    "WebServerToolWindowFactoryState": "false",
+    "cf.first.check.clang-format": "false",
+    "cidr.known.project.marker": "true",
+    "last_opened_file_path": "/home/steffo/Workspaces/Steffo99/unimore-hpc-assignments/atax",
+    "nodejs_package_manager_path": "npm",
+    "settings.editor.selected.configurable": "MakefileSettings"
+  },
+  "keyToStringList": {
+    "GitStage.ChangesTree.GroupingKeys": [
+      "directory",
+      "module",
+      "repository"
+    ]
+  }
+}]]></component>
+  <component name="RunManager">
+    <configuration default="true" type="CLionExternalRunConfiguration" factoryName="Application" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" PASS_PARENT_ENVS_2="true">
+      <method v="2">
+        <option name="CLION.EXTERNAL.BUILD" enabled="true" />
+      </method>
+    </configuration>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="" />
+      <created>1669932513703</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1669932513703</updated>
+      <workItem from="1669932517554" duration="325000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+  <component name="Vcs.Log.Tabs.Properties">
+    <option name="TAB_STATES">
+      <map>
+        <entry key="MAIN">
+          <value>
+            <State />
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
+  <component name="XSLT-Support.FileAssociations.UIState">
+    <expand />
+    <select />
+  </component>
+</project>
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@ -6,7 +6,7 @@
                "${workspaceFolder}/**"
            ],
            "defines": [],
-            "compilerPath": "/usr/local/cuda-10.0/bin/nvcc",
+            "compilerPath": "/opt/cuda/bin/nvcc",
            "cStandard": "c11",
            "cppStandard": "c++14",
            "configurationProvider": "ms-vscode.makefile-tools"
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -15,6 +15,7 @@
            "environment": [],
            "externalConsole": false,
            "MIMode": "gdb",
+            "miDebuggerPath": "/usr/bin/gdb",
            "setupCommands": [
                {
                    "description": "Enable pretty-printing for gdb",
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -1,5 +1,39 @@
 {
    "files.associations": {
-        "*.hu": "cuda-cpp"
+        "*.hu": "cuda-cpp",
+        "array": "cpp",
+        "*.tcc": "cpp",
+        "cctype": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "compare": "cpp",
+        "concepts": "cpp",
+        "cstdarg": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "unordered_map": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "functional": "cpp",
+        "initializer_list": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "new": "cpp",
+        "numbers": "cpp",
+        "ostream": "cpp",
+        "stdexcept": "cpp",
+        "streambuf": "cpp",
+        "string": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "typeinfo": "cpp",
+        "utility": "cpp"
    }
 }
--- a/atax/.bench.sh
+++ b/atax/.bench.sh
@ -1,12 +1,12 @@
 #!/bin/bash

 run_benchmarks() {
-    runs=25
+    runs=3
    totalt=0.0

    for i in $(seq $runs)
    do
-        exet=$(./atax.elf)
+        exet=$(./atax.elf 2> /dev/null)
        totalt=$(awk "BEGIN{print $totalt+$exet}")
        echo -n "."
        # echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
@ -16,9 +16,9 @@ run_benchmarks() {
    echo "  Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
 }

-for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
+for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET
 do
-    for c in $(seq 0 7)
+    for c in $(seq 0 3)
    do
        cxxflags="-D$dataset"

@ -32,12 +32,8 @@ do
            cxxflags="$cxxflags -DHPC_USE_CUDA"
        fi

-        if (( $c & 2 ))
-        then
-            cxxflags="$cxxflags -DHPC_USE_STRIDE"
-        fi
-
        echo "Flags: $cxxflags"
+        make --silent "clean"
        make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"

        run_benchmarks
--- a/atax/Makefile
+++ b/atax/Makefile
@ -5,16 +5,25 @@ MAKEFLAGS+= -r
 CXXFLAGS+= -DPOLYBENCH_TIME
 # -O3 applies all compiler optimization, improving from 800ms to 300ms
 CXXFLAGS+= -O3
+# Enable this to view the contents of the arrays
+CXXFLAGS+= -DHPC_DEBUG
+# Enable this to use CUDA
+CXXFLAGS+= -DHPC_USE_CUDA
 # Extend CFLAGS with command line parameters
 CXXFLAGS+= ${EXTRA_CXXFLAGS}

 # Select the location of the local CUDA install
-CUDA_HOME:=/usr/local/cuda-10.0
+# CUDA_HOME:=/usr/local/cuda-10.0
+CUDA_HOME:=/opt/cuda
 # Specify the directory of the nvc compiler
 NVCC:=$(CUDA_HOME)/bin/nvcc
 # Specify the flags for the nvc compiler
 NVCFLAGS:=$(CXXFLAGS) $(NVOPT)

+# Optimize for @Steffo's NVIDIA GTX 1070
+NVCFLAGS+= -arch=compute_61
+NVCFLAGS+= -code=sm_61
+

 %.elf: %.cu.o polybench.cu.o
 	$(NVCC) $(NVCFLAGS) $^ -o $@ $(LDFLAGS)
@ -23,13 +32,12 @@ NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
 	$(NVCC) $(NVCFLAGS) -c $< -o $@


-.PHONY: bench clean dev
+all: atax.elf

-dev: atax.elf
-	./atax.elf
+.PHONY: bench clean

 bench:
 	./.bench.sh

 clean:
-	rm *.elf *.cu.o
+	rm *.elf
--- a/atax/atax.cu
+++ b/atax/atax.cu
@ -3,6 +3,7 @@
 #include <string.h>
 #include <math.h>
 #include <iostream>
+#include <string>

 /* Include polybench common header. */
 #include "polybench.hu"
@ -23,16 +24,34 @@
 	#define CUDA_NTHREADS 128
 #endif

-// Enable syntax highlighting for the CUDA mode
-// TODO: Remove this, as it will be set by .bench.sh
-#define HPC_USE_CUDA

-// Enable syntax highlighting for the stride mode
-// TODO: Remove this, as it will be set by .bench.sh
-#define HPC_USE_STRIDE
+/**
+ * Translate a two-dimensional `(x, y)` coordinate of the `A` matrix into an offset inside its flat buffer.
+ * 
+ * Each increment of `x` skips `NY` elements, while `y` selects an element inside that group.
+ * 
+ * Can be called both on the CPU and on the GPU (uses the `__host__` and `__device__` qualifiers).
+ */
+__host__ __device__ inline static unsigned int a_index(unsigned int x, unsigned int y)
+{
+	// Offset of the first element having the given `x`
+	const unsigned int base = x * NY;
+
+	return base + y;
+}

-// Create macro for debug logging
-#define debug(txt) std::cerr << txt << std::endl
+/**
+ * Log a debug message to the standard error stream, followed by a newline.
+ * 
+ * Does nothing unless the program is compiled with `HPC_DEBUG` defined.
+ * 
+ * The message is taken by constant reference, so that it is not copied on every call.
+ * 
+ * To be called on the CPU (uses the `__host__` qualifier).
+ */
+__host__ inline static void print_debug(const std::string& txt) {
+	#ifdef HPC_DEBUG
+		std::cerr << txt << std::endl;
+	#else
+		// Nothing is logged in non-debug builds; mark the parameter as used to silence compiler warnings
+		(void) txt;
+	#endif
+}
+
+/**
+ * Log a CUDA error to the standard error stream.
+ * 
+ * The line is composed of `txt`, followed by the location of the error and by the description that `cudaGetErrorString` gives for `err`.
+ * 
+ * Does nothing unless the program is compiled with `HPC_DEBUG` defined.
+ * 
+ * NOTE: `__FILE__` and `__LINE__` are expanded inside this function, so the reported location is always this helper, not the place where the failing call was made.
+ * 
+ * To be called on the CPU (uses the `__host__` qualifier).
+ */
+#ifdef HPC_USE_CUDA
+__host__ inline static void print_cudaError(cudaError_t err, const std::string& txt) {
+	#ifdef HPC_DEBUG
+		// Emit the whole line with a single call, so that the message is not split between two different stream APIs
+		fprintf( stderr, "%s: error in file '%s' in line %i: %s.\n", txt.c_str(), __FILE__, __LINE__, cudaGetErrorString(err) );
+	#else
+		// Nothing is logged in non-debug builds; mark the parameters as used to silence compiler warnings
+		(void) err;
+		(void) txt;
+	#endif
+}
+#endif


 /**
@ -45,7 +64,7 @@
 * To be called on the CPU (uses the `__host__` qualifier).
 */
 #ifndef HPC_USE_CUDA
-__host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
+__host__ static void init_array(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
 {
 	/* X = [ 3.14, 6.28, 9.42, ... ] */
 	for (unsigned int y = 0; y < NY; y++) 
@ -72,7 +91,7 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
 	{
 		for (unsigned int y = 0; y < NY; y++) 
 		{
-			A[x][y] = (DATA_TYPE)(x * (y + 1)) / NX;
+			A[a_index(x, y)] = (DATA_TYPE)(x * (y + 1)) / NX;
 		}
 	}
 }
@ -87,20 +106,21 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
 __device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
 {
 	// Find how many iterations should be performed by each thread
-	unsigned int perThread = NY / threads;
+	unsigned int perThread = NY / threads + 1;

 	// Find the index of the current thread, even if threads span multiple blocks
 	int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
 	
 	// Have each thread perform the previously determined number of iterations
-	for(int stride = 0; stride < perThread; stride++) {
+	for(int stride = 0; stride < perThread; stride++)
+	{
 		// Find the index of the current iteration
 		// This is equal to `y` of the init_array function
-		int iterationIdx = blockThreadIdx * stride;
+		unsigned int iterationIdx = threads * stride + blockThreadIdx;

 		// Prevent the thread from accessing unallocated memory
-		if(iterationIdx < NY) {
-
+		if(iterationIdx < NY)
+		{
 			// Set the array element
 			X[iterationIdx] = iterationIdx * M_PI;
 		}
@ -117,20 +137,21 @@ __device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
 __device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
 {
 	// Find how many iterations should be performed by each thread
-	unsigned int perThread = NX / threads;
+	unsigned int perThread = NX / threads + 1;

 	// Find the index of the current thread, even if threads span multiple blocks
 	int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
 	
 	// Have each thread perform the previously determined number of iterations
-	for(int stride = 0; stride < perThread; stride++) {
+	for(int stride = 0; stride < perThread; stride++) 
+	{
 		// Find the index of the current iteration
 		// This is equal to `y` of the init_array function
-		int iterationIdx = blockThreadIdx * stride;
+		unsigned int iterationIdx = threads * stride + blockThreadIdx;

 		// Prevent the thread from accessing unallocated memory
-		if(iterationIdx < NX) {
-
+		if(iterationIdx < NX) 
+		{
 			// Set the array element
 			Y[iterationIdx] = 0;
 		}
@ -150,12 +171,29 @@ __device__ static void init_array_cuda_a(DATA_TYPE* A, unsigned int threads)
 	unsigned int elements = NX * NY;

 	// Find how many iterations should be performed by each thread
-	unsigned int perThread = elements / threads;
+	unsigned int perThread = elements / threads + 1;

 	// Find the index of the current thread, even if threads span multiple blocks
 	int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;

-	/* TODO */
+	// Have each thread perform the previously determined number of iterations
+	for(int stride = 0; stride < perThread; stride++) 
+	{
+		// Find the index of the current iteration
+		// This is the flat index of the element inside `A`, from which `x` and `y` are derived
+		unsigned int iterationIdx = threads * stride + blockThreadIdx;
+
+		// Determine current x and y
+		unsigned int y = iterationIdx % NY;
+		unsigned int x = iterationIdx / NY;
+
+		// Prevent the thread from accessing unallocated memory
+		if(iterationIdx < elements) 
+		{
+			// Set the array element
+			A[iterationIdx] = (DATA_TYPE)(x * (y + 1)) / NX;
+		}
+	}
 }
 #endif

@ -188,11 +226,11 @@ __global__ static void init_array_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
 * 
 * To be called on the CPU (uses the `__host__` qualifier).
 */
-__host__ static void print_array(DATA_TYPE* Y)
+__host__ static void print_array(DATA_TYPE* Z, unsigned int size)
 {
-	for (unsigned int x = 0; x < NX; x++) 
+	// Walk the array with a cursor, stopping right after its `size`-th element
+	for (const DATA_TYPE* cursor = Z; cursor != Z + size; cursor++)
 	{
-		fprintf(stderr, DATA_PRINTF_MODIFIER, Y[x]);
+		// Print the current element using the format selected by `DATA_PRINTF_MODIFIER`
+		fprintf(stderr, DATA_PRINTF_MODIFIER, *cursor);
 	}
+
+	// Terminate the line once all the elements have been printed
 	fprintf(stderr, "\n");
 }
@ -212,25 +250,79 @@ __host__ static void print_array(DATA_TYPE* Y)
 * 
 * Parallelizing this is the goal of the assignment.
 * 
- * Currently to be called on the CPU (uses the `__host__` qualifier), but we may probably want to change that soon.
+ * To be called on the CPU (uses the `__host__` qualifier).
 */
-__host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
+#ifndef HPC_USE_CUDA
+__host__ static void kernel_atax(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
 {
+	// `A` is a flat buffer addressed via `a_index`, so `x` must stay below `NX` and `y` below `NY`;
+	// this also keeps `X[y]` (which has `NY` elements) and `Y[x]` (which has `NX` elements) in bounds
 	for (unsigned int x = 0; x < NX; x++) 
 	{
 		DATA_TYPE tmp = 0;
 		
+		// Accumulate the product between the elements of `A` having the current `x` and `X`
 		for (unsigned int y = 0; y < NY; y++) 
 		{
-			tmp += A[x][y] * X[y];
+			tmp += A[a_index(x, y)] * X[y];
 		}
 		
+		// Add the same elements of `A`, scaled by `tmp`, to `Y[x]`
 		for (unsigned int y = 0; y < NY; y++) 
 		{
-			Y[y] += A[x][y] * tmp;
+			Y[x] += A[a_index(x, y)] * tmp;
 		}
 	}
 }
+#endif
+
+
+/**
+ * Compute ATAX :
+ * - A is the input matrix, a flat buffer addressed via `a_index`, with `x` below `NX` and `y` below `NY`
+ * - X is an input vector, read at indices below `NY`
+ * - Y is the result vector, written at indices below `NX`
+ * 
+ * In particular, for every `x`:
+ * ```
+ * tmp   = sum over y of ( A[x][y] * X[y] )
+ * Y[x] += sum over y of ( A[x][y] * tmp )
+ * ```
+ * NOTE(review): as already remarked by the author, there's no transposition here, so this does not look like `A^T * (A * X)`; confirm the intended formula.
+ * 
+ * Parallelizing this is the goal of the assignment.
+ * 
+ * To be called on the device as a kernel (uses the `__global__` qualifier).
+ * 
+ * Only the `x` dimension of the grid and of the blocks is considered, so a one-dimensional launch is expected.
+ * `Y` is accumulated into, so it must have been initialized before the launch.
+ */
+#ifdef HPC_USE_CUDA
+__global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
+{
+	// Find out how many threads there are
+	const unsigned int threads = gridDim.x * blockDim.x;
+
+	// Find the index of the current thread, even if threads span multiple blocks
+	const unsigned int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// Have each thread process the values of `x` equal to its own index plus a multiple of `threads`;
+	// every `x` is therefore handled by exactly one thread, and no atomic operation is needed on `Y[x]`
+	for(unsigned int x = blockThreadIdx; x < NX; x += threads)
+	{
+		DATA_TYPE tmp = 0;
+
+		// There are `NY` elements of `A` for each `x`, as established by `a_index`
+		for (unsigned int y = 0; y < NY; y++)
+		{
+			tmp += A[a_index(x, y)] * X[y];
+		}
+
+		// Accumulate in a local variable, starting from the current value of `Y[x]`, and write it back once
+		DATA_TYPE result = Y[x];
+
+		for (unsigned int y = 0; y < NY; y++)
+		{
+			result += A[a_index(x, y)] * tmp;
+		}
+
+		Y[x] = result;
+	}
+}
+#endif


 /**
@ -240,143 +332,180 @@ __host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
 */
 __host__ int main(int argc, char** argv)
 {
-	debug("Starting main...");
+	print_debug("[Main] Starting...");
+	std::cerr << "[Main] NX is: " << NX << std::endl;
+	std::cerr << "[Main] NY is: " << NY << std::endl;

 	#ifndef HPC_USE_CUDA

-		debug("[Mode] Host-only");
+		print_debug("[Mode] Host-only");

-		debug("[Pointers] Allocating...");
+		print_debug("[Pointers] Allocating...");

-		// A[NX][NY]
-		DATA_TYPE** A = new DATA_TYPE*[NX] {};
-		for(unsigned int x = 0; x < NX; x++)
-		{
-			A[x] = new DATA_TYPE[NY] {};
-		}
+		DATA_TYPE* A = new DATA_TYPE[NX * NY];
+		DATA_TYPE* X = new DATA_TYPE[NY];
+		DATA_TYPE* Y = new DATA_TYPE[NX];

-		// X[NY]
-		DATA_TYPE* X = new DATA_TYPE[NY] {};
-
-		// Y[NX]
-		DATA_TYPE* Y = new DATA_TYPE[NX] {};
-
-		debug("[Pointers] Allocated!");
+		print_debug("[Pointers] Allocated!");

 		#ifdef HPC_INCLUDE_INIT
-			debug("[Benchmark] Starting...");
+			print_debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif

-		debug("[Init] Initializing...");
+		print_debug("[Init] Initializing...");
 		init_array(A, X, Y);
-		debug("[Init] Initialized!");
+		print_debug("[Init] Initialized!");

 		#ifndef HPC_INCLUDE_INIT
-			debug("[Benchmark] Starting...");
+			print_debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif

-		debug("[Kernel] Running...");
+		print_debug("[Kernel] Running...");
 		kernel_atax(A, X, Y);
-		debug("[Kernel] Completed!");
+		print_debug("[Kernel] Completed!");

-		debug("[Benchmark] Stopping...");
+		print_debug("[Benchmark] Stopping...");
 		polybench_stop_instruments;
 		polybench_print_instruments;
-		debug("[Benchmark] Complete!");
+		print_debug("[Benchmark] Complete!");

-		debug("[Verify] Printing...")
+		#ifdef HPC_DEBUG
+			print_debug("[Debug] Displaying A:");
+			print_array(A, NX * NY);
+			print_debug("[Debug] Displaying X:");
+			print_array(X, NY);
+			print_debug("[Debug] Displaying Y:");
+			print_array(Y, NX);
+		#endif
+
+		print_debug("[Verify] Printing...");
 		polybench_prevent_dce(
-			print_array(Y)
+			print_array(Y, NX)
 		);
-		debug("[Verify] Done!")
+		print_debug("[Verify] Done!");

 	#else
 	
-		debug("[Mode] Host-and-device, CUDA");
+		print_debug("[Mode] Host-and-device, CUDA");

-		debug("[Pointers] Allocating...");
+		print_debug("[Pointers] Allocating...");
 		DATA_TYPE* A;
 		DATA_TYPE* X;
 		DATA_TYPE* Y;
+		DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
+		DATA_TYPE* host_X = new DATA_TYPE[NY];
+		DATA_TYPE* host_Y = new DATA_TYPE[NX];
 		
-		debug("[CUDA] Allocating A...");
-		if(cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY)) 
+		print_debug("[CUDA] Allocating A...");
+		if(cudaError_t err = cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY)) 
 		{
-			debug("[CUDA] Could not allocate A!");
+			print_cudaError(err, "[CUDA] Could not allocate A!");
 			return 1;
 		}
-		debug("[CUDA] Allocated A!");
+		print_debug("[CUDA] Allocated A!");
 		
-		debug("[CUDA] Allocating X...");
-		if(cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
+		print_debug("[CUDA] Allocating X...");
+		if(cudaError_t err = cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
 		{
-			debug("[CUDA] Could not allocate X!");
+			print_cudaError(err, "[CUDA] Could not allocate X!");
 			return 1;
 		}
-		debug("[CUDA] Allocated X!");
+		print_debug("[CUDA] Allocated X!");

-		debug("[CUDA] Allocating Y...");
-		if(cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
+		print_debug("[CUDA] Allocating Y...");
+		if(cudaError_t err = cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
 		{
-			debug("[CUDA] Could not allocate Y!");
+			print_cudaError(err, "[CUDA] Could not allocate Y!");
 			return 1;
 		}
-		debug("[CUDA] Allocated Y!");
+		print_debug("[CUDA] Allocated Y!");

 		#ifdef POLYBENCH_INCLUDE_INIT
-			debug("[Benchmark] Starting...");
+			print_debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif

-		debug("[Init] Initializing...");
+		print_debug("[Init] Initializing...");
 		init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
-		if(cudaGetLastError())
+		if(cudaError_t err = cudaGetLastError())
 		{
-			debug("[Init] Failed to execute kernel!");
+			print_cudaError(err, "[Init] Failed to execute kernel!");
 			return 1;
 		}
-		debug("[Init] Initialized!");
+		print_debug("[Init] Complete!");

 		#ifndef POLYBENCH_INCLUDE_INIT
-			debug("[Benchmark] Starting...");
+			print_debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif

-		// kernel_atax_cuda<<<1, 1>>>();
+		print_debug("[Kernel] Running...");
+		kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
+		print_debug("[Kernel] Complete!");

+		print_debug("[CUDA] Copying A back...");
+		if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
+			print_cudaError(err, "[CUDA] Could copy A back!");
+			return 1;
+		};
+		print_debug("[CUDA] Copied A back!");
+
+		print_debug("[CUDA] Copying X back...");
+		if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) {
+			print_cudaError(err, "[CUDA] Could copy X back!");
+			return 1;
+		};
+		print_debug("[CUDA] Copied X back!");
+
+		print_debug("[CUDA] Copying Y back...");
+		if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {
+			print_cudaError(err, "[CUDA] Could copy Y back!");
+			return 1;
+		};
+		print_debug("[CUDA] Copied Y back!");
+
+		print_debug("[Benchmark] Stopping...");
 		polybench_stop_instruments;
 		polybench_print_instruments;
+		print_debug("[Benchmark] Complete!");

-		// Y = cudaMemcpy();
-
-		debug("[CUDA] Freeing A...");
-		if(cudaFree(A)) {
-			debug("[CUDA] Could not free A!");
+		print_debug("[CUDA] Freeing A...");
+		if(cudaError_t err = cudaFree(A)) {
+			print_cudaError(err, "[CUDA] Could not free A!");
 			return 1;
 		}
-		debug("[CUDA] Freed A!");
+		print_debug("[CUDA] Freed A!");

-		debug("[CUDA] Freeing X...");
-		if(cudaFree(X)) {
-			debug("[CUDA] Could not free X!");
+		print_debug("[CUDA] Freeing X...");
+		if(cudaError_t err = cudaFree(X)) {
+			print_cudaError(err, "[CUDA] Could not free X!");
 			return 1;
 		}
-		debug("[CUDA] Freed X!");
+		print_debug("[CUDA] Freed X!");

-		debug("[CUDA] Freeing Y...");
-		if(cudaFree(Y)) {
-			debug("[CUDA] Could not free Y!");
+		print_debug("[CUDA] Freeing Y...");
+		if(cudaError_t err = cudaFree(Y)) {
+			print_cudaError(err, "[CUDA] Could not free Y!");
 			return 1;
 		}
-		debug("[CUDA] Freed Y!");
+		print_debug("[CUDA] Freed Y!");

-		/*
+		#ifdef HPC_DEBUG
+			print_debug("[Debug] Displaying A:");
+			print_array(host_A, NX * NY);
+			print_debug("[Debug] Displaying X:");
+			print_array(host_X, NY);
+			print_debug("[Debug] Displaying Y:");
+			print_array(host_Y, NX);
+		#endif
+
+		print_debug("[Verify] Printing...");
 		polybench_prevent_dce(
-			print_array(Y)
+			print_array(host_Y, NX)
 		);
-		*/
+		print_debug("[Verify] Done!");

 	#endif

--- a/atax/atax.hu
+++ b/atax/atax.hu
@ -3,8 +3,12 @@
 	#define ATAX_H

 	/* Default to STANDARD_DATASET. */
-	#if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
-		#define STANDARD_DATASET
+	#if !defined(NANO_DATASET) && !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
+		#ifdef HPC_DEBUG
+			#define NANO_DATASET
+		#else
+			#define EXTRALARGE_DATASET
+		#endif
 	#endif

 	/* Do not define anything if the user manually defines the size. */
@ -12,6 +16,11 @@

 		/* Define the possible dataset sizes. */

+		#ifdef NANO_DATASET
+			#define NX 3
+			#define NY 5
+		#endif
+
 		#ifdef MINI_DATASET
 			#define NX 32
 			#define NY 32