1
Fork 0
mirror of https://github.com/Steffo99/unimore-hpc-assignments.git synced 2024-11-25 09:34:23 +00:00

Get everything to work

This commit is contained in:
Steffo 2022-12-02 00:15:33 +01:00
parent ae73536d76
commit ef30e88e01
Signed by: steffo
GPG key ID: 6965406171929D01
22 changed files with 404 additions and 278 deletions

6
.editorconfig Normal file
View file

@ -0,0 +1,6 @@
root = true
[*]
end_of_line = lf
insert_final_newline = true
indent_style = tab

8
.idea/.gitignore vendored
View file

@ -1,8 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View file

@ -1,40 +0,0 @@
<component name="ProjectCodeStyleConfiguration">
<code_scheme name="Project" version="173">
<Objective-C>
<option name="INDENT_CLASS_MEMBERS" value="8" />
<option name="INDENT_VISIBILITY_KEYWORDS" value="4" />
<option name="INDENT_PREPROCESSOR_DIRECTIVE" value="4" />
<option name="INDENT_DIRECTIVE_AS_CODE" value="true" />
<option name="SPACE_BEFORE_INIT_LIST" value="true" />
<option name="SPACE_BEFORE_POINTER_IN_DECLARATION" value="false" />
<option name="SPACE_AFTER_POINTER_IN_DECLARATION" value="true" />
<option name="SPACE_BEFORE_REFERENCE_IN_DECLARATION" value="false" />
<option name="SPACE_AFTER_REFERENCE_IN_DECLARATION" value="true" />
<option name="DISCHARGED_SHORT_TERNARY_OPERATOR" value="true" />
</Objective-C>
<clangFormatSettings>
<option name="ENABLED" value="true" />
</clangFormatSettings>
<files>
<extensions>
<pair source="cpp" header="hpp" fileNamingConvention="NONE" />
<pair source="c" header="h" fileNamingConvention="NONE" />
<pair source="cu" header="cuh" fileNamingConvention="NONE" />
</extensions>
</files>
<codeStyleSettings language="Markdown">
<indentOptions>
<option name="INDENT_SIZE" value="3" />
<option name="TAB_SIZE" value="3" />
</indentOptions>
</codeStyleSettings>
<codeStyleSettings language="ObjectiveC">
<indentOptions>
<option name="INDENT_SIZE" value="2" />
<option name="TAB_SIZE" value="2" />
<option name="SMART_TABS" value="true" />
<option name="KEEP_INDENTS_ON_EMPTY_LINES" value="true" />
</indentOptions>
</codeStyleSettings>
</code_scheme>
</component>

View file

@ -1,5 +0,0 @@
<component name="ProjectCodeStyleConfiguration">
<state>
<option name="USE_PER_PROJECT_SETTINGS" value="true" />
</state>
</component>

View file

@ -1,15 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CLionExternalBuildManager">
<target id="4d1b5109-3338-4779-bba7-8def1c68abbb" name="All" defaultType="MAKE">
<configuration id="c01c91f7-5730-4713-b432-50125cbe22a7" name="All">
<build type="MAKE">
<make targetName="all" />
</build>
<clean type="MAKE">
<make targetName="clean" />
</clean>
</configuration>
</target>
</component>
</project>

View file

@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DiscordProjectSettings">
<option name="show" value="PROJECT" />
<option name="description" value="" />
</component>
</project>

View file

@ -1,11 +0,0 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
<option name="processCode" value="true" />
<option name="processLiterals" value="true" />
<option name="processComments" value="true" />
</inspection_tool>
</profile>
</component>

View file

@ -1,25 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MakefileBuildTargetsManager">
<user-build-targets>
<build-target name="all">
<build-configurations>
<build-configuration>
<make-targets>
<make-target>all</make-target>
</make-targets>
</build-configuration>
</build-configurations>
</build-target>
<build-target name="clean">
<build-configurations>
<build-configuration>
<make-targets>
<make-target>clean</make-target>
</make-targets>
</build-configuration>
</build-configurations>
</build-target>
</user-build-targets>
</component>
</project>

View file

@ -1,20 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MakefileSettings">
<option name="linkedExternalProjectsSettings">
<MakefileProjectSettings>
<option name="externalProjectPath" value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
<option name="modules">
<set>
<option value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
</set>
</option>
<option name="version" value="2" />
</MakefileProjectSettings>
</option>
</component>
<component name="MakefileWorkspace" PROJECT_DIR="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax">
<contentRoot DIR="$PROJECT_DIR$" />
</component>
</project>

View file

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/unimore-hpc-1.iml" filepath="$PROJECT_DIR$/.idea/unimore-hpc-1.iml" />
</modules>
</component>
</project>

View file

@ -1,7 +0,0 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="atax_acc" type="CLionNativeAppRunConfigurationType" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" WORKING_DIR="file://$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" PASS_PARENT_ENVS_2="true" PROJECT_NAME="unimore-hpc-1" TARGET_NAME="all" CONFIG_NAME="all" version="1" RUN_PATH="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax/atax_acc">
<method v="2">
<option name="CLION.COMPOUND.BUILD" enabled="true" />
</method>
</configuration>
</component>

View file

@ -1,9 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View file

@ -1,2 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module classpath="External" type="CPP_MODULE" version="4" />

View file

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

106
.idea/workspace.xml Normal file
View file

@ -0,0 +1,106 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="CMakePresetLoader"><![CDATA[{
"useNewFormat": true
}]]></component>
<component name="CMakeRunConfigurationManager">
<generated>
<config projectName="unimore-hpc-assignments" targetName="atax.elf" />
</generated>
</component>
<component name="CMakeSettings">
<configurations />
</component>
<component name="ChangeListManager">
<list default="true" id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="">
<change beforePath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.vscode/settings.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/settings.json" afterDir="false" />
<change beforePath="$PROJECT_DIR$/atax/.bench.sh" beforeDir="false" afterPath="$PROJECT_DIR$/atax/.bench.sh" afterDir="false" />
<change beforePath="$PROJECT_DIR$/atax/Makefile" beforeDir="false" afterPath="$PROJECT_DIR$/atax/Makefile" afterDir="false" />
<change beforePath="$PROJECT_DIR$/atax/atax.cu" beforeDir="false" afterPath="$PROJECT_DIR$/atax/atax.cu" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="MakefileLocalSettings">
<option name="projectSyncType">
<map>
<entry key="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" value="RE_IMPORT" />
</map>
</option>
</component>
<component name="MarkdownSettingsMigration">
<option name="stateVersion" value="1" />
</component>
<component name="ProjectId" id="2IKa1Pp6YCa8Ycz7UwOe8DKGQjU" />
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"RunOnceActivity.OpenProjectViewOnStart": "true",
"RunOnceActivity.ShowReadmeOnStart": "true",
"RunOnceActivity.cidr.known.project.marker": "true",
"SHARE_PROJECT_CONFIGURATION_FILES": "true",
"WebServerToolWindowFactoryState": "false",
"cf.first.check.clang-format": "false",
"cidr.known.project.marker": "true",
"last_opened_file_path": "/home/steffo/Workspaces/Steffo99/unimore-hpc-assignments/atax",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "MakefileSettings"
},
"keyToStringList": {
"GitStage.ChangesTree.GroupingKeys": [
"directory",
"module",
"repository"
]
}
}]]></component>
<component name="RunManager">
<configuration default="true" type="CLionExternalRunConfiguration" factoryName="Application" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" PASS_PARENT_ENVS_2="true">
<method v="2">
<option name="CLION.EXTERNAL.BUILD" enabled="true" />
</method>
</configuration>
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="" />
<created>1669932513703</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1669932513703</updated>
<workItem from="1669932517554" duration="325000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
<component name="Vcs.Log.Tabs.Properties">
<option name="TAB_STATES">
<map>
<entry key="MAIN">
<value>
<State />
</value>
</entry>
</map>
</option>
</component>
<component name="XSLT-Support.FileAssociations.UIState">
<expand />
<select />
</component>
</project>

View file

@ -6,7 +6,7 @@
"${workspaceFolder}/**"
],
"defines": [],
"compilerPath": "/usr/local/cuda-10.0/bin/nvcc",
"compilerPath": "/opt/cuda/bin/nvcc",
"cStandard": "c11",
"cppStandard": "c++14",
"configurationProvider": "ms-vscode.makefile-tools"

1
.vscode/launch.json vendored
View file

@ -15,6 +15,7 @@
"environment": [],
"externalConsole": false,
"MIMode": "gdb",
"miDebuggerPath": "/usr/bin/gdb",
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",

36
.vscode/settings.json vendored
View file

@ -1,5 +1,39 @@
{
"files.associations": {
"*.hu": "cuda-cpp"
"*.hu": "cuda-cpp",
"array": "cpp",
"*.tcc": "cpp",
"cctype": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"concepts": "cpp",
"cstdarg": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"unordered_map": "cpp",
"vector": "cpp",
"exception": "cpp",
"functional": "cpp",
"initializer_list": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"stdexcept": "cpp",
"streambuf": "cpp",
"string": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"typeinfo": "cpp",
"utility": "cpp"
}
}

View file

@ -1,12 +1,12 @@
#!/bin/bash
run_benchmarks() {
runs=25
runs=3
totalt=0.0
for i in $(seq $runs)
do
exet=$(./atax.elf)
exet=$(./atax.elf 2> /dev/null)
totalt=$(awk "BEGIN{print $totalt+$exet}")
echo -n "."
# echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
@ -16,9 +16,9 @@ run_benchmarks() {
echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
}
for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET
do
for c in $(seq 0 7)
for c in $(seq 0 3)
do
cxxflags="-D$dataset"
@ -32,12 +32,8 @@ do
cxxflags="$cxxflags -DHPC_USE_CUDA"
fi
if (( $c & 2 ))
then
cxxflags="$cxxflags -DHPC_USE_STRIDE"
fi
echo "Flags: $cxxflags"
make --silent "clean"
make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
run_benchmarks

View file

@ -5,16 +5,25 @@ MAKEFLAGS+= -r
CXXFLAGS+= -DPOLYBENCH_TIME
# -O3 applies all compiler optimization, improving from 800ms to 300ms
CXXFLAGS+= -O3
# Enable this to view the contents of the arrays
CXXFLAGS+= -DHPC_DEBUG
# Enable this to use CUDA
CXXFLAGS+= -DHPC_USE_CUDA
# Extend CFLAGS with command line parameters
CXXFLAGS+= ${EXTRA_CXXFLAGS}
# Select the location of the local CUDA install
CUDA_HOME:=/usr/local/cuda-10.0
# CUDA_HOME:=/usr/local/cuda-10.0
CUDA_HOME:=/opt/cuda
# Specify the directory of the nvc compiler
NVCC:=$(CUDA_HOME)/bin/nvcc
# Specify the flags for the nvc compiler
NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
# Optimize for @Steffo's NVIDIA GTX 1070
NVCFLAGS+= -arch=compute_61
NVCFLAGS+= -code=sm_61
%.elf: %.cu.o polybench.cu.o
$(NVCC) $(NVCFLAGS) $^ -o $@ $(LDFLAGS)
@ -23,13 +32,12 @@ NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
$(NVCC) $(NVCFLAGS) -c $< -o $@
.PHONY: bench clean dev
all: atax.elf
dev: atax.elf
./atax.elf
.PHONY: bench clean
bench:
./.bench.sh
clean:
rm *.elf *.cu.o
rm *.elf

View file

@ -3,6 +3,7 @@
#include <string.h>
#include <math.h>
#include <iostream>
#include <string>
/* Include polybench common header. */
#include "polybench.hu"
@ -23,16 +24,34 @@
#define CUDA_NTHREADS 128
#endif
// Enable syntax highlighting for the CUDA mode
// TODO: Remove this, as it will be set by .bench.sh
#define HPC_USE_CUDA
// Enable syntax highlighting for the stride mode
// TODO: Remove this, as it will be set by .bench.sh
#define HPC_USE_STRIDE
/**
* Given a `x` and a `y`, compute the relative index of the element in the `A` matrix.
*/
__host__ __device__ inline static unsigned int a_index(unsigned int x, unsigned int y) {
return x * NY + y;
}
// Create macro for debug logging
#define debug(txt) std::cerr << txt << std::endl
/**
* Log a debug message.
*/
__host__ inline static void print_debug(std::string txt) {
#ifdef HPC_DEBUG
std::cerr << txt << std::endl;
#endif
}
/**
* Log an error message.
*/
#ifdef HPC_USE_CUDA
__host__ inline static void print_cudaError(cudaError_t err, std::string txt) {
#ifdef HPC_DEBUG
std::cerr << txt;
fprintf( stderr, ": error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(err) );
#endif
}
#endif
/**
@ -45,7 +64,7 @@
* To be called on the CPU (uses the `__host__` qualifier).
*/
#ifndef HPC_USE_CUDA
__host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
__host__ static void init_array(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
{
/* X = [ 3.14, 6.28, 9.42, ... ] */
for (unsigned int y = 0; y < NY; y++)
@ -72,7 +91,7 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
{
for (unsigned int y = 0; y < NY; y++)
{
A[x][y] = (DATA_TYPE)(x * (y + 1)) / NX;
A[a_index(x, y)] = (DATA_TYPE)(x * (y + 1)) / NX;
}
}
}
@ -87,20 +106,21 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
__device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
{
// Find how many iterations should be performed by each thread
unsigned int perThread = NY / threads;
unsigned int perThread = NY / threads + 1;
// Find the index of the current thread, even if threads span multiple blocks
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
// Have each thread perform the previously determined number of iterations
for(int stride = 0; stride < perThread; stride++) {
for(int stride = 0; stride < perThread; stride++)
{
// Find the index of the current iteration
// This is equal to `y` of the init_array function
int iterationIdx = blockThreadIdx * stride;
unsigned int iterationIdx = threads * stride + blockThreadIdx;
// Prevent the thread from accessing unallocated memory
if(iterationIdx < NY) {
if(iterationIdx < NY)
{
// Set the array element
X[iterationIdx] = iterationIdx * M_PI;
}
@ -117,20 +137,21 @@ __device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
__device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
{
// Find how many iterations should be performed by each thread
unsigned int perThread = NX / threads;
unsigned int perThread = NX / threads + 1;
// Find the index of the current thread, even if threads span multiple blocks
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
// Have each thread perform the previously determined number of iterations
for(int stride = 0; stride < perThread; stride++) {
for(int stride = 0; stride < perThread; stride++)
{
// Find the index of the current iteration
// This is equal to `y` of the init_array function
int iterationIdx = blockThreadIdx * stride;
unsigned int iterationIdx = threads * stride + blockThreadIdx;
// Prevent the thread from accessing unallocated memory
if(iterationIdx < NX) {
if(iterationIdx < NX)
{
// Set the array element
Y[iterationIdx] = 0;
}
@ -150,12 +171,29 @@ __device__ static void init_array_cuda_a(DATA_TYPE* A, unsigned int threads)
unsigned int elements = NX * NY;
// Find how many iterations should be performed by each thread
unsigned int perThread = elements / threads;
unsigned int perThread = elements / threads + 1;
// Find the index of the current thread, even if threads span multiple blocks
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
/* TODO */
// Have each thread perform the previously determined number of iterations
for(int stride = 0; stride < perThread; stride++)
{
// Find the index of the current iteration
// This is equal to `y` of the init_array function
unsigned int iterationIdx = threads * stride + blockThreadIdx;
// Determine current x and y
unsigned int y = iterationIdx % NY;
unsigned int x = iterationIdx / NY;
// Prevent the thread from accessing unallocated memory
if(iterationIdx < elements)
{
// Set the array element
A[iterationIdx] = (DATA_TYPE)(x * (y + 1)) / NX;
}
}
}
#endif
@ -188,11 +226,11 @@ __global__ static void init_array_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
*
* To be called on the CPU (uses the `__host__` qualifier).
*/
__host__ static void print_array(DATA_TYPE* Y)
__host__ static void print_array(DATA_TYPE* Z, unsigned int size)
{
for (unsigned int x = 0; x < NX; x++)
for (unsigned int z = 0; z < size; z++)
{
fprintf(stderr, DATA_PRINTF_MODIFIER, Y[x]);
fprintf(stderr, DATA_PRINTF_MODIFIER, Z[z]);
}
fprintf(stderr, "\n");
}
@ -212,25 +250,79 @@ __host__ static void print_array(DATA_TYPE* Y)
*
* Parallelizing this is the goal of the assignment.
*
* Currently to be called on the CPU (uses the `__host__` qualifier), but we may probably want to change that soon.
* To be called on the CPU (uses the `__host__` qualifier).
*/
__host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
#ifndef HPC_USE_CUDA
__host__ static void kernel_atax(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
{
for (unsigned int x = 0; x < NX; x++)
for (unsigned int x = 0; x < NY; x++)
{
DATA_TYPE tmp = 0;
for (unsigned int y = 0; y < NY; y++)
for (unsigned int y = 0; y < NX; y++)
{
tmp += A[x][y] * X[y];
tmp += A[a_index(x, y)] * X[y];
}
for (unsigned int y = 0; y < NY; y++)
for (unsigned int y = 0; y < NX; y++)
{
Y[y] += A[x][y] * tmp;
Y[x] += A[a_index(x, y)] * tmp;
}
}
}
#endif
/**
* Compute ATAX :
* - A is the input matrix
* - X is an input vector
* - Y is the result vector
*
* In particular:
* ```
* A * (A * X) = Y
* ```
* Wait, there's no transposition here?!?
*
* Parallelizing this is the goal of the assignment.
*
* To be called on the device as a kernel (uses the `__global__` qualifier).
*/
#ifdef HPC_USE_CUDA
__global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
{
// Find out how many threads there are
unsigned int threads = gridDim.x * blockDim.x;
// Find how many iterations should be performed by each thread
unsigned int perThread = NX / threads + 1;
// Find the index of the current thread, even if threads span multiple blocks
unsigned int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
// Have each thread perform the previously determined number of iterations
for(int stride = 0; stride < perThread; stride++)
{
unsigned int x = threads * stride + blockThreadIdx;
if(x < NX)
{
DATA_TYPE tmp = 0;
for (unsigned int y = 0; y < NX; y++)
{
tmp += A[a_index(x, y)] * X[y];
}
for (unsigned int y = 0; y < NX; y++)
{
atomicAdd(&Y[x], A[a_index(x, y)] * tmp);
}
}
}
}
#endif
/**
@ -240,143 +332,180 @@ __host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
*/
__host__ int main(int argc, char** argv)
{
debug("Starting main...");
print_debug("[Main] Starting...");
std::cerr << "[Main] NX is: " << NX << std::endl;
std::cerr << "[Main] NY is: " << NY << std::endl;
#ifndef HPC_USE_CUDA
debug("[Mode] Host-only");
print_debug("[Mode] Host-only");
debug("[Pointers] Allocating...");
print_debug("[Pointers] Allocating...");
// A[NX][NY]
DATA_TYPE** A = new DATA_TYPE*[NX] {};
for(unsigned int x = 0; x < NX; x++)
{
A[x] = new DATA_TYPE[NY] {};
}
DATA_TYPE* A = new DATA_TYPE[NX * NY];
DATA_TYPE* X = new DATA_TYPE[NY];
DATA_TYPE* Y = new DATA_TYPE[NX];
// X[NY]
DATA_TYPE* X = new DATA_TYPE[NY] {};
// Y[NX]
DATA_TYPE* Y = new DATA_TYPE[NX] {};
debug("[Pointers] Allocated!");
print_debug("[Pointers] Allocated!");
#ifdef HPC_INCLUDE_INIT
debug("[Benchmark] Starting...");
print_debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
debug("[Init] Initializing...");
print_debug("[Init] Initializing...");
init_array(A, X, Y);
debug("[Init] Initialized!");
print_debug("[Init] Initialized!");
#ifndef HPC_INCLUDE_INIT
debug("[Benchmark] Starting...");
print_debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
debug("[Kernel] Running...");
print_debug("[Kernel] Running...");
kernel_atax(A, X, Y);
debug("[Kernel] Completed!");
print_debug("[Kernel] Completed!");
debug("[Benchmark] Stopping...");
print_debug("[Benchmark] Stopping...");
polybench_stop_instruments;
polybench_print_instruments;
debug("[Benchmark] Complete!");
print_debug("[Benchmark] Complete!");
debug("[Verify] Printing...")
#ifdef HPC_DEBUG
print_debug("[Debug] Displaying A:");
print_array(A, NX * NY);
print_debug("[Debug] Displaying X:");
print_array(X, NY);
print_debug("[Debug] Displaying Y:");
print_array(Y, NX);
#endif
print_debug("[Verify] Printing...");
polybench_prevent_dce(
print_array(Y)
print_array(Y, NX)
);
debug("[Verify] Done!")
print_debug("[Verify] Done!");
#else
debug("[Mode] Host-and-device, CUDA");
print_debug("[Mode] Host-and-device, CUDA");
debug("[Pointers] Allocating...");
print_debug("[Pointers] Allocating...");
DATA_TYPE* A;
DATA_TYPE* X;
DATA_TYPE* Y;
DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
DATA_TYPE* host_X = new DATA_TYPE[NY];
DATA_TYPE* host_Y = new DATA_TYPE[NX];
debug("[CUDA] Allocating A...");
if(cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY))
print_debug("[CUDA] Allocating A...");
if(cudaError_t err = cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY))
{
debug("[CUDA] Could not allocate A!");
print_cudaError(err, "[CUDA] Could not allocate A!");
return 1;
}
debug("[CUDA] Allocated A!");
print_debug("[CUDA] Allocated A!");
debug("[CUDA] Allocating X...");
if(cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
print_debug("[CUDA] Allocating X...");
if(cudaError_t err = cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
{
debug("[CUDA] Could not allocate X!");
print_cudaError(err, "[CUDA] Could not allocate X!");
return 1;
}
debug("[CUDA] Allocated X!");
print_debug("[CUDA] Allocated X!");
debug("[CUDA] Allocating Y...");
if(cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
print_debug("[CUDA] Allocating Y...");
if(cudaError_t err = cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
{
debug("[CUDA] Could not allocate Y!");
print_cudaError(err, "[CUDA] Could not allocate Y!");
return 1;
}
debug("[CUDA] Allocated Y!");
print_debug("[CUDA] Allocated Y!");
#ifdef POLYBENCH_INCLUDE_INIT
debug("[Benchmark] Starting...");
print_debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
debug("[Init] Initializing...");
print_debug("[Init] Initializing...");
init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
if(cudaGetLastError())
if(cudaError_t err = cudaGetLastError())
{
debug("[Init] Failed to execute kernel!");
print_cudaError(err, "[Init] Failed to execute kernel!");
return 1;
}
debug("[Init] Initialized!");
print_debug("[Init] Complete!");
#ifndef POLYBENCH_INCLUDE_INIT
debug("[Benchmark] Starting...");
print_debug("[Benchmark] Starting...");
polybench_start_instruments;
#endif
// kernel_atax_cuda<<<1, 1>>>();
print_debug("[Kernel] Running...");
kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
print_debug("[Kernel] Complete!");
print_debug("[CUDA] Copying A back...");
if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
print_cudaError(err, "[CUDA] Could copy A back!");
return 1;
};
print_debug("[CUDA] Copied A back!");
print_debug("[CUDA] Copying X back...");
if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) {
print_cudaError(err, "[CUDA] Could copy X back!");
return 1;
};
print_debug("[CUDA] Copied X back!");
print_debug("[CUDA] Copying Y back...");
if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {
print_cudaError(err, "[CUDA] Could copy Y back!");
return 1;
};
print_debug("[CUDA] Copied Y back!");
print_debug("[Benchmark] Stopping...");
polybench_stop_instruments;
polybench_print_instruments;
print_debug("[Benchmark] Complete!");
// Y = cudaMemcpy();
debug("[CUDA] Freeing A...");
if(cudaFree(A)) {
debug("[CUDA] Could not free A!");
print_debug("[CUDA] Freeing A...");
if(cudaError_t err = cudaFree(A)) {
print_cudaError(err, "[CUDA] Could not free A!");
return 1;
}
debug("[CUDA] Freed A!");
print_debug("[CUDA] Freed A!");
debug("[CUDA] Freeing X...");
if(cudaFree(X)) {
debug("[CUDA] Could not free X!");
print_debug("[CUDA] Freeing X...");
if(cudaError_t err = cudaFree(X)) {
print_cudaError(err, "[CUDA] Could not free X!");
return 1;
}
debug("[CUDA] Freed X!");
print_debug("[CUDA] Freed X!");
debug("[CUDA] Freeing Y...");
if(cudaFree(Y)) {
debug("[CUDA] Could not free Y!");
print_debug("[CUDA] Freeing Y...");
if(cudaError_t err = cudaFree(Y)) {
print_cudaError(err, "[CUDA] Could not free Y!");
return 1;
}
debug("[CUDA] Freed Y!");
print_debug("[CUDA] Freed Y!");
/*
#ifdef HPC_DEBUG
print_debug("[Debug] Displaying A:");
print_array(host_A, NX * NY);
print_debug("[Debug] Displaying X:");
print_array(host_X, NY);
print_debug("[Debug] Displaying Y:");
print_array(host_Y, NX);
#endif
print_debug("[Verify] Printing...");
polybench_prevent_dce(
print_array(Y)
print_array(host_Y, NX)
);
*/
print_debug("[Verify] Done!");
#endif

View file

@ -3,8 +3,12 @@
#define ATAX_H
/* Default to STANDARD_DATASET. */
#if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
#define STANDARD_DATASET
#if !defined(NANO_DATASET) && !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
#ifdef HPC_DEBUG
#define NANO_DATASET
#else
#define EXTRALARGE_DATASET
#endif
#endif
/* Do not define anything if the user manually defines the size. */
@ -12,6 +16,11 @@
/* Define the possible dataset sizes. */
#ifdef NANO_DATASET
#define NX 3
#define NY 5
#endif
#ifdef MINI_DATASET
#define NX 32
#define NY 32