mirror of
https://github.com/Steffo99/unimore-hpc-assignments.git
synced 2024-11-25 09:34:23 +00:00
Get everything to work
This commit is contained in:
parent
ae73536d76
commit
ef30e88e01
22 changed files with 404 additions and 278 deletions
6
.editorconfig
Normal file
6
.editorconfig
Normal file
|
@ -0,0 +1,6 @@
|
|||
root = true
|
||||
|
||||
[*]
|
||||
end_of_line = lf
|
||||
insert_final_newline = true
|
||||
indent_style = tab
|
8
.idea/.gitignore
vendored
8
.idea/.gitignore
vendored
|
@ -1,8 +0,0 @@
|
|||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
|
@ -1,40 +0,0 @@
|
|||
<component name="ProjectCodeStyleConfiguration">
|
||||
<code_scheme name="Project" version="173">
|
||||
<Objective-C>
|
||||
<option name="INDENT_CLASS_MEMBERS" value="8" />
|
||||
<option name="INDENT_VISIBILITY_KEYWORDS" value="4" />
|
||||
<option name="INDENT_PREPROCESSOR_DIRECTIVE" value="4" />
|
||||
<option name="INDENT_DIRECTIVE_AS_CODE" value="true" />
|
||||
<option name="SPACE_BEFORE_INIT_LIST" value="true" />
|
||||
<option name="SPACE_BEFORE_POINTER_IN_DECLARATION" value="false" />
|
||||
<option name="SPACE_AFTER_POINTER_IN_DECLARATION" value="true" />
|
||||
<option name="SPACE_BEFORE_REFERENCE_IN_DECLARATION" value="false" />
|
||||
<option name="SPACE_AFTER_REFERENCE_IN_DECLARATION" value="true" />
|
||||
<option name="DISCHARGED_SHORT_TERNARY_OPERATOR" value="true" />
|
||||
</Objective-C>
|
||||
<clangFormatSettings>
|
||||
<option name="ENABLED" value="true" />
|
||||
</clangFormatSettings>
|
||||
<files>
|
||||
<extensions>
|
||||
<pair source="cpp" header="hpp" fileNamingConvention="NONE" />
|
||||
<pair source="c" header="h" fileNamingConvention="NONE" />
|
||||
<pair source="cu" header="cuh" fileNamingConvention="NONE" />
|
||||
</extensions>
|
||||
</files>
|
||||
<codeStyleSettings language="Markdown">
|
||||
<indentOptions>
|
||||
<option name="INDENT_SIZE" value="3" />
|
||||
<option name="TAB_SIZE" value="3" />
|
||||
</indentOptions>
|
||||
</codeStyleSettings>
|
||||
<codeStyleSettings language="ObjectiveC">
|
||||
<indentOptions>
|
||||
<option name="INDENT_SIZE" value="2" />
|
||||
<option name="TAB_SIZE" value="2" />
|
||||
<option name="SMART_TABS" value="true" />
|
||||
<option name="KEEP_INDENTS_ON_EMPTY_LINES" value="true" />
|
||||
</indentOptions>
|
||||
</codeStyleSettings>
|
||||
</code_scheme>
|
||||
</component>
|
|
@ -1,5 +0,0 @@
|
|||
<component name="ProjectCodeStyleConfiguration">
|
||||
<state>
|
||||
<option name="USE_PER_PROJECT_SETTINGS" value="true" />
|
||||
</state>
|
||||
</component>
|
|
@ -1,15 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="CLionExternalBuildManager">
|
||||
<target id="4d1b5109-3338-4779-bba7-8def1c68abbb" name="All" defaultType="MAKE">
|
||||
<configuration id="c01c91f7-5730-4713-b432-50125cbe22a7" name="All">
|
||||
<build type="MAKE">
|
||||
<make targetName="all" />
|
||||
</build>
|
||||
<clean type="MAKE">
|
||||
<make targetName="clean" />
|
||||
</clean>
|
||||
</configuration>
|
||||
</target>
|
||||
</component>
|
||||
</project>
|
|
@ -1,7 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="DiscordProjectSettings">
|
||||
<option name="show" value="PROJECT" />
|
||||
<option name="description" value="" />
|
||||
</component>
|
||||
</project>
|
|
@ -1,11 +0,0 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
|
||||
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
|
||||
<option name="processCode" value="true" />
|
||||
<option name="processLiterals" value="true" />
|
||||
<option name="processComments" value="true" />
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
|
@ -1,25 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="MakefileBuildTargetsManager">
|
||||
<user-build-targets>
|
||||
<build-target name="all">
|
||||
<build-configurations>
|
||||
<build-configuration>
|
||||
<make-targets>
|
||||
<make-target>all</make-target>
|
||||
</make-targets>
|
||||
</build-configuration>
|
||||
</build-configurations>
|
||||
</build-target>
|
||||
<build-target name="clean">
|
||||
<build-configurations>
|
||||
<build-configuration>
|
||||
<make-targets>
|
||||
<make-target>clean</make-target>
|
||||
</make-targets>
|
||||
</build-configuration>
|
||||
</build-configurations>
|
||||
</build-target>
|
||||
</user-build-targets>
|
||||
</component>
|
||||
</project>
|
|
@ -1,20 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ExternalStorageConfigurationManager" enabled="true" />
|
||||
<component name="MakefileSettings">
|
||||
<option name="linkedExternalProjectsSettings">
|
||||
<MakefileProjectSettings>
|
||||
<option name="externalProjectPath" value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
|
||||
<option name="modules">
|
||||
<set>
|
||||
<option value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
|
||||
</set>
|
||||
</option>
|
||||
<option name="version" value="2" />
|
||||
</MakefileProjectSettings>
|
||||
</option>
|
||||
</component>
|
||||
<component name="MakefileWorkspace" PROJECT_DIR="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax">
|
||||
<contentRoot DIR="$PROJECT_DIR$" />
|
||||
</component>
|
||||
</project>
|
|
@ -1,8 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/unimore-hpc-1.iml" filepath="$PROJECT_DIR$/.idea/unimore-hpc-1.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
|
@ -1,7 +0,0 @@
|
|||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="atax_acc" type="CLionNativeAppRunConfigurationType" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" WORKING_DIR="file://$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" PASS_PARENT_ENVS_2="true" PROJECT_NAME="unimore-hpc-1" TARGET_NAME="all" CONFIG_NAME="all" version="1" RUN_PATH="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax/atax_acc">
|
||||
<method v="2">
|
||||
<option name="CLION.COMPOUND.BUILD" enabled="true" />
|
||||
</method>
|
||||
</configuration>
|
||||
</component>
|
|
@ -1,9 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="JAVA_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
|
@ -1,2 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module classpath="External" type="CPP_MODULE" version="4" />
|
|
@ -1,6 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
106
.idea/workspace.xml
Normal file
106
.idea/workspace.xml
Normal file
|
@ -0,0 +1,106 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="AutoImportSettings">
|
||||
<option name="autoReloadType" value="SELECTIVE" />
|
||||
</component>
|
||||
<component name="CMakePresetLoader"><![CDATA[{
|
||||
"useNewFormat": true
|
||||
}]]></component>
|
||||
<component name="CMakeRunConfigurationManager">
|
||||
<generated>
|
||||
<config projectName="unimore-hpc-assignments" targetName="atax.elf" />
|
||||
</generated>
|
||||
</component>
|
||||
<component name="CMakeSettings">
|
||||
<configurations />
|
||||
</component>
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="">
|
||||
<change beforePath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/.vscode/settings.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/settings.json" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/atax/.bench.sh" beforeDir="false" afterPath="$PROJECT_DIR$/atax/.bench.sh" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/atax/Makefile" beforeDir="false" afterPath="$PROJECT_DIR$/atax/Makefile" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/atax/atax.cu" beforeDir="false" afterPath="$PROJECT_DIR$/atax/atax.cu" afterDir="false" />
|
||||
</list>
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||
</component>
|
||||
<component name="Git.Settings">
|
||||
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
||||
</component>
|
||||
<component name="MakefileLocalSettings">
|
||||
<option name="projectSyncType">
|
||||
<map>
|
||||
<entry key="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" value="RE_IMPORT" />
|
||||
</map>
|
||||
</option>
|
||||
</component>
|
||||
<component name="MarkdownSettingsMigration">
|
||||
<option name="stateVersion" value="1" />
|
||||
</component>
|
||||
<component name="ProjectId" id="2IKa1Pp6YCa8Ycz7UwOe8DKGQjU" />
|
||||
<component name="ProjectViewState">
|
||||
<option name="hideEmptyMiddlePackages" value="true" />
|
||||
<option name="showLibraryContents" value="true" />
|
||||
</component>
|
||||
<component name="PropertiesComponent"><![CDATA[{
|
||||
"keyToString": {
|
||||
"RunOnceActivity.OpenProjectViewOnStart": "true",
|
||||
"RunOnceActivity.ShowReadmeOnStart": "true",
|
||||
"RunOnceActivity.cidr.known.project.marker": "true",
|
||||
"SHARE_PROJECT_CONFIGURATION_FILES": "true",
|
||||
"WebServerToolWindowFactoryState": "false",
|
||||
"cf.first.check.clang-format": "false",
|
||||
"cidr.known.project.marker": "true",
|
||||
"last_opened_file_path": "/home/steffo/Workspaces/Steffo99/unimore-hpc-assignments/atax",
|
||||
"nodejs_package_manager_path": "npm",
|
||||
"settings.editor.selected.configurable": "MakefileSettings"
|
||||
},
|
||||
"keyToStringList": {
|
||||
"GitStage.ChangesTree.GroupingKeys": [
|
||||
"directory",
|
||||
"module",
|
||||
"repository"
|
||||
]
|
||||
}
|
||||
}]]></component>
|
||||
<component name="RunManager">
|
||||
<configuration default="true" type="CLionExternalRunConfiguration" factoryName="Application" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" PASS_PARENT_ENVS_2="true">
|
||||
<method v="2">
|
||||
<option name="CLION.EXTERNAL.BUILD" enabled="true" />
|
||||
</method>
|
||||
</configuration>
|
||||
</component>
|
||||
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
|
||||
<component name="TaskManager">
|
||||
<task active="true" id="Default" summary="Default task">
|
||||
<changelist id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="" />
|
||||
<created>1669932513703</created>
|
||||
<option name="number" value="Default" />
|
||||
<option name="presentableId" value="Default" />
|
||||
<updated>1669932513703</updated>
|
||||
<workItem from="1669932517554" duration="325000" />
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TypeScriptGeneratedFilesManager">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
<component name="Vcs.Log.Tabs.Properties">
|
||||
<option name="TAB_STATES">
|
||||
<map>
|
||||
<entry key="MAIN">
|
||||
<value>
|
||||
<State />
|
||||
</value>
|
||||
</entry>
|
||||
</map>
|
||||
</option>
|
||||
</component>
|
||||
<component name="XSLT-Support.FileAssociations.UIState">
|
||||
<expand />
|
||||
<select />
|
||||
</component>
|
||||
</project>
|
2
.vscode/c_cpp_properties.json
vendored
2
.vscode/c_cpp_properties.json
vendored
|
@ -6,7 +6,7 @@
|
|||
"${workspaceFolder}/**"
|
||||
],
|
||||
"defines": [],
|
||||
"compilerPath": "/usr/local/cuda-10.0/bin/nvcc",
|
||||
"compilerPath": "/opt/cuda/bin/nvcc",
|
||||
"cStandard": "c11",
|
||||
"cppStandard": "c++14",
|
||||
"configurationProvider": "ms-vscode.makefile-tools"
|
||||
|
|
1
.vscode/launch.json
vendored
1
.vscode/launch.json
vendored
|
@ -15,6 +15,7 @@
|
|||
"environment": [],
|
||||
"externalConsole": false,
|
||||
"MIMode": "gdb",
|
||||
"miDebuggerPath": "/usr/bin/gdb",
|
||||
"setupCommands": [
|
||||
{
|
||||
"description": "Enable pretty-printing for gdb",
|
||||
|
|
36
.vscode/settings.json
vendored
36
.vscode/settings.json
vendored
|
@ -1,5 +1,39 @@
|
|||
{
|
||||
"files.associations": {
|
||||
"*.hu": "cuda-cpp"
|
||||
"*.hu": "cuda-cpp",
|
||||
"array": "cpp",
|
||||
"*.tcc": "cpp",
|
||||
"cctype": "cpp",
|
||||
"clocale": "cpp",
|
||||
"cmath": "cpp",
|
||||
"compare": "cpp",
|
||||
"concepts": "cpp",
|
||||
"cstdarg": "cpp",
|
||||
"cstdint": "cpp",
|
||||
"cstdio": "cpp",
|
||||
"cstdlib": "cpp",
|
||||
"cwchar": "cpp",
|
||||
"cwctype": "cpp",
|
||||
"unordered_map": "cpp",
|
||||
"vector": "cpp",
|
||||
"exception": "cpp",
|
||||
"functional": "cpp",
|
||||
"initializer_list": "cpp",
|
||||
"iosfwd": "cpp",
|
||||
"iostream": "cpp",
|
||||
"istream": "cpp",
|
||||
"limits": "cpp",
|
||||
"new": "cpp",
|
||||
"numbers": "cpp",
|
||||
"ostream": "cpp",
|
||||
"stdexcept": "cpp",
|
||||
"streambuf": "cpp",
|
||||
"string": "cpp",
|
||||
"string_view": "cpp",
|
||||
"system_error": "cpp",
|
||||
"tuple": "cpp",
|
||||
"type_traits": "cpp",
|
||||
"typeinfo": "cpp",
|
||||
"utility": "cpp"
|
||||
}
|
||||
}
|
|
@ -1,12 +1,12 @@
|
|||
#!/bin/bash
|
||||
|
||||
run_benchmarks() {
|
||||
runs=25
|
||||
runs=3
|
||||
totalt=0.0
|
||||
|
||||
for i in $(seq $runs)
|
||||
do
|
||||
exet=$(./atax.elf)
|
||||
exet=$(./atax.elf 2> /dev/null)
|
||||
totalt=$(awk "BEGIN{print $totalt+$exet}")
|
||||
echo -n "."
|
||||
# echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
|
||||
|
@ -16,9 +16,9 @@ run_benchmarks() {
|
|||
echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
|
||||
}
|
||||
|
||||
for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
|
||||
for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET
|
||||
do
|
||||
for c in $(seq 0 7)
|
||||
for c in $(seq 0 3)
|
||||
do
|
||||
cxxflags="-D$dataset"
|
||||
|
||||
|
@ -32,12 +32,8 @@ do
|
|||
cxxflags="$cxxflags -DHPC_USE_CUDA"
|
||||
fi
|
||||
|
||||
if (( $c & 2 ))
|
||||
then
|
||||
cxxflags="$cxxflags -DHPC_USE_STRIDE"
|
||||
fi
|
||||
|
||||
echo "Flags: $cxxflags"
|
||||
make --silent "clean"
|
||||
make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
|
||||
|
||||
run_benchmarks
|
||||
|
|
|
@ -5,16 +5,25 @@ MAKEFLAGS+= -r
|
|||
CXXFLAGS+= -DPOLYBENCH_TIME
|
||||
# -O3 applies all compiler optimization, improving from 800ms to 300ms
|
||||
CXXFLAGS+= -O3
|
||||
# Enable this to view the contents of the arrays
|
||||
CXXFLAGS+= -DHPC_DEBUG
|
||||
# Enable this to use CUDA
|
||||
CXXFLAGS+= -DHPC_USE_CUDA
|
||||
# Extend CFLAGS with command line parameters
|
||||
CXXFLAGS+= ${EXTRA_CXXFLAGS}
|
||||
|
||||
# Select the location of the local CUDA install
|
||||
CUDA_HOME:=/usr/local/cuda-10.0
|
||||
# CUDA_HOME:=/usr/local/cuda-10.0
|
||||
CUDA_HOME:=/opt/cuda
|
||||
# Specify the directory of the nvc compiler
|
||||
NVCC:=$(CUDA_HOME)/bin/nvcc
|
||||
# Specify the flags for the nvc compiler
|
||||
NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
|
||||
|
||||
# Optimize for @Steffo's NVIDIA GTX 1070
|
||||
NVCFLAGS+= -arch=compute_61
|
||||
NVCFLAGS+= -code=sm_61
|
||||
|
||||
|
||||
%.elf: %.cu.o polybench.cu.o
|
||||
$(NVCC) $(NVCFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
@ -23,13 +32,12 @@ NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
|
|||
$(NVCC) $(NVCFLAGS) -c $< -o $@
|
||||
|
||||
|
||||
.PHONY: bench clean dev
|
||||
all: atax.elf
|
||||
|
||||
dev: atax.elf
|
||||
./atax.elf
|
||||
.PHONY: bench clean
|
||||
|
||||
bench:
|
||||
./.bench.sh
|
||||
|
||||
clean:
|
||||
rm *.elf *.cu.o
|
||||
rm *.elf
|
||||
|
|
323
atax/atax.cu
323
atax/atax.cu
|
@ -3,6 +3,7 @@
|
|||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
/* Include polybench common header. */
|
||||
#include "polybench.hu"
|
||||
|
@ -23,16 +24,34 @@
|
|||
#define CUDA_NTHREADS 128
|
||||
#endif
|
||||
|
||||
// Enable syntax highlighting for the CUDA mode
|
||||
// TODO: Remove this, as it will be set by .bench.sh
|
||||
#define HPC_USE_CUDA
|
||||
|
||||
// Enable syntax highlighting for the stride mode
|
||||
// TODO: Remove this, as it will be set by .bench.sh
|
||||
#define HPC_USE_STRIDE
|
||||
/**
|
||||
* Given a `x` and a `y`, compute the relative index of the element in the `A` matrix.
|
||||
*/
|
||||
__host__ __device__ inline static unsigned int a_index(unsigned int x, unsigned int y) {
|
||||
return x * NY + y;
|
||||
}
|
||||
|
||||
// Create macro for debug logging
|
||||
#define debug(txt) std::cerr << txt << std::endl
|
||||
/**
|
||||
* Log a debug message.
|
||||
*/
|
||||
__host__ inline static void print_debug(std::string txt) {
|
||||
#ifdef HPC_DEBUG
|
||||
std::cerr << txt << std::endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Log an error message.
|
||||
*/
|
||||
#ifdef HPC_USE_CUDA
|
||||
__host__ inline static void print_cudaError(cudaError_t err, std::string txt) {
|
||||
#ifdef HPC_DEBUG
|
||||
std::cerr << txt;
|
||||
fprintf( stderr, ": error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(err) );
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
|
@ -45,7 +64,7 @@
|
|||
* To be called on the CPU (uses the `__host__` qualifier).
|
||||
*/
|
||||
#ifndef HPC_USE_CUDA
|
||||
__host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||
__host__ static void init_array(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||
{
|
||||
/* X = [ 3.14, 6.28, 9.42, ... ] */
|
||||
for (unsigned int y = 0; y < NY; y++)
|
||||
|
@ -72,7 +91,7 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
|||
{
|
||||
for (unsigned int y = 0; y < NY; y++)
|
||||
{
|
||||
A[x][y] = (DATA_TYPE)(x * (y + 1)) / NX;
|
||||
A[a_index(x, y)] = (DATA_TYPE)(x * (y + 1)) / NX;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -87,20 +106,21 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
|||
__device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
|
||||
{
|
||||
// Find how many iterations should be performed by each thread
|
||||
unsigned int perThread = NY / threads;
|
||||
unsigned int perThread = NY / threads + 1;
|
||||
|
||||
// Find the index of the current thread, even if threads span multiple blocks
|
||||
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Have each thread perform the previously determined number of iterations
|
||||
for(int stride = 0; stride < perThread; stride++) {
|
||||
for(int stride = 0; stride < perThread; stride++)
|
||||
{
|
||||
// Find the index of the current iteration
|
||||
// This is equal to `y` of the init_array function
|
||||
int iterationIdx = blockThreadIdx * stride;
|
||||
unsigned int iterationIdx = threads * stride + blockThreadIdx;
|
||||
|
||||
// Prevent the thread from accessing unallocated memory
|
||||
if(iterationIdx < NY) {
|
||||
|
||||
if(iterationIdx < NY)
|
||||
{
|
||||
// Set the array element
|
||||
X[iterationIdx] = iterationIdx * M_PI;
|
||||
}
|
||||
|
@ -117,20 +137,21 @@ __device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
|
|||
__device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
|
||||
{
|
||||
// Find how many iterations should be performed by each thread
|
||||
unsigned int perThread = NX / threads;
|
||||
unsigned int perThread = NX / threads + 1;
|
||||
|
||||
// Find the index of the current thread, even if threads span multiple blocks
|
||||
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Have each thread perform the previously determined number of iterations
|
||||
for(int stride = 0; stride < perThread; stride++) {
|
||||
for(int stride = 0; stride < perThread; stride++)
|
||||
{
|
||||
// Find the index of the current iteration
|
||||
// This is equal to `y` of the init_array function
|
||||
int iterationIdx = blockThreadIdx * stride;
|
||||
unsigned int iterationIdx = threads * stride + blockThreadIdx;
|
||||
|
||||
// Prevent the thread from accessing unallocated memory
|
||||
if(iterationIdx < NX) {
|
||||
|
||||
if(iterationIdx < NX)
|
||||
{
|
||||
// Set the array element
|
||||
Y[iterationIdx] = 0;
|
||||
}
|
||||
|
@ -150,12 +171,29 @@ __device__ static void init_array_cuda_a(DATA_TYPE* A, unsigned int threads)
|
|||
unsigned int elements = NX * NY;
|
||||
|
||||
// Find how many iterations should be performed by each thread
|
||||
unsigned int perThread = elements / threads;
|
||||
unsigned int perThread = elements / threads + 1;
|
||||
|
||||
// Find the index of the current thread, even if threads span multiple blocks
|
||||
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
/* TODO */
|
||||
// Have each thread perform the previously determined number of iterations
|
||||
for(int stride = 0; stride < perThread; stride++)
|
||||
{
|
||||
// Find the index of the current iteration
|
||||
// This is equal to `y` of the init_array function
|
||||
unsigned int iterationIdx = threads * stride + blockThreadIdx;
|
||||
|
||||
// Determine current x and y
|
||||
unsigned int y = iterationIdx % NY;
|
||||
unsigned int x = iterationIdx / NY;
|
||||
|
||||
// Prevent the thread from accessing unallocated memory
|
||||
if(iterationIdx < elements)
|
||||
{
|
||||
// Set the array element
|
||||
A[iterationIdx] = (DATA_TYPE)(x * (y + 1)) / NX;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -188,11 +226,11 @@ __global__ static void init_array_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
|
|||
*
|
||||
* To be called on the CPU (uses the `__host__` qualifier).
|
||||
*/
|
||||
__host__ static void print_array(DATA_TYPE* Y)
|
||||
__host__ static void print_array(DATA_TYPE* Z, unsigned int size)
|
||||
{
|
||||
for (unsigned int x = 0; x < NX; x++)
|
||||
for (unsigned int z = 0; z < size; z++)
|
||||
{
|
||||
fprintf(stderr, DATA_PRINTF_MODIFIER, Y[x]);
|
||||
fprintf(stderr, DATA_PRINTF_MODIFIER, Z[z]);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
@ -212,25 +250,79 @@ __host__ static void print_array(DATA_TYPE* Y)
|
|||
*
|
||||
* Parallelizing this is the goal of the assignment.
|
||||
*
|
||||
* Currently to be called on the CPU (uses the `__host__` qualifier), but we may probably want to change that soon.
|
||||
* To be called on the CPU (uses the `__host__` qualifier).
|
||||
*/
|
||||
__host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||
#ifndef HPC_USE_CUDA
|
||||
__host__ static void kernel_atax(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||
{
|
||||
for (unsigned int x = 0; x < NX; x++)
|
||||
for (unsigned int x = 0; x < NY; x++)
|
||||
{
|
||||
DATA_TYPE tmp = 0;
|
||||
|
||||
for (unsigned int y = 0; y < NY; y++)
|
||||
for (unsigned int y = 0; y < NX; y++)
|
||||
{
|
||||
tmp += A[x][y] * X[y];
|
||||
tmp += A[a_index(x, y)] * X[y];
|
||||
}
|
||||
|
||||
for (unsigned int y = 0; y < NY; y++)
|
||||
for (unsigned int y = 0; y < NX; y++)
|
||||
{
|
||||
Y[y] += A[x][y] * tmp;
|
||||
Y[x] += A[a_index(x, y)] * tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* Compute ATAX :
|
||||
* - A is the input matrix
|
||||
* - X is an input vector
|
||||
* - Y is the result vector
|
||||
*
|
||||
* In particular:
|
||||
* ```
|
||||
* A * (A * X) = Y
|
||||
* ```
|
||||
* Wait, there's no transposition here?!?
|
||||
*
|
||||
* Parallelizing this is the goal of the assignment.
|
||||
*
|
||||
* To be called on the device as a kernel (uses the `__global__` qualifier).
|
||||
*/
|
||||
#ifdef HPC_USE_CUDA
|
||||
__global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||
{
|
||||
// Find out how many threads there are
|
||||
unsigned int threads = gridDim.x * blockDim.x;
|
||||
|
||||
// Find how many iterations should be performed by each thread
|
||||
unsigned int perThread = NX / threads + 1;
|
||||
|
||||
// Find the index of the current thread, even if threads span multiple blocks
|
||||
unsigned int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Have each thread perform the previously determined number of iterations
|
||||
for(int stride = 0; stride < perThread; stride++)
|
||||
{
|
||||
unsigned int x = threads * stride + blockThreadIdx;
|
||||
|
||||
if(x < NX)
|
||||
{
|
||||
DATA_TYPE tmp = 0;
|
||||
|
||||
for (unsigned int y = 0; y < NX; y++)
|
||||
{
|
||||
tmp += A[a_index(x, y)] * X[y];
|
||||
}
|
||||
|
||||
for (unsigned int y = 0; y < NX; y++)
|
||||
{
|
||||
atomicAdd(&Y[x], A[a_index(x, y)] * tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
|
@ -240,143 +332,180 @@ __host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
|||
*/
|
||||
__host__ int main(int argc, char** argv)
|
||||
{
|
||||
debug("Starting main...");
|
||||
print_debug("[Main] Starting...");
|
||||
std::cerr << "[Main] NX is: " << NX << std::endl;
|
||||
std::cerr << "[Main] NY is: " << NY << std::endl;
|
||||
|
||||
#ifndef HPC_USE_CUDA
|
||||
|
||||
debug("[Mode] Host-only");
|
||||
print_debug("[Mode] Host-only");
|
||||
|
||||
debug("[Pointers] Allocating...");
|
||||
print_debug("[Pointers] Allocating...");
|
||||
|
||||
// A[NX][NY]
|
||||
DATA_TYPE** A = new DATA_TYPE*[NX] {};
|
||||
for(unsigned int x = 0; x < NX; x++)
|
||||
{
|
||||
A[x] = new DATA_TYPE[NY] {};
|
||||
}
|
||||
DATA_TYPE* A = new DATA_TYPE[NX * NY];
|
||||
DATA_TYPE* X = new DATA_TYPE[NY];
|
||||
DATA_TYPE* Y = new DATA_TYPE[NX];
|
||||
|
||||
// X[NY]
|
||||
DATA_TYPE* X = new DATA_TYPE[NY] {};
|
||||
|
||||
// Y[NX]
|
||||
DATA_TYPE* Y = new DATA_TYPE[NX] {};
|
||||
|
||||
debug("[Pointers] Allocated!");
|
||||
print_debug("[Pointers] Allocated!");
|
||||
|
||||
#ifdef HPC_INCLUDE_INIT
|
||||
debug("[Benchmark] Starting...");
|
||||
print_debug("[Benchmark] Starting...");
|
||||
polybench_start_instruments;
|
||||
#endif
|
||||
|
||||
debug("[Init] Initializing...");
|
||||
print_debug("[Init] Initializing...");
|
||||
init_array(A, X, Y);
|
||||
debug("[Init] Initialized!");
|
||||
print_debug("[Init] Initialized!");
|
||||
|
||||
#ifndef HPC_INCLUDE_INIT
|
||||
debug("[Benchmark] Starting...");
|
||||
print_debug("[Benchmark] Starting...");
|
||||
polybench_start_instruments;
|
||||
#endif
|
||||
|
||||
debug("[Kernel] Running...");
|
||||
print_debug("[Kernel] Running...");
|
||||
kernel_atax(A, X, Y);
|
||||
debug("[Kernel] Completed!");
|
||||
print_debug("[Kernel] Completed!");
|
||||
|
||||
debug("[Benchmark] Stopping...");
|
||||
print_debug("[Benchmark] Stopping...");
|
||||
polybench_stop_instruments;
|
||||
polybench_print_instruments;
|
||||
debug("[Benchmark] Complete!");
|
||||
print_debug("[Benchmark] Complete!");
|
||||
|
||||
debug("[Verify] Printing...")
|
||||
#ifdef HPC_DEBUG
|
||||
print_debug("[Debug] Displaying A:");
|
||||
print_array(A, NX * NY);
|
||||
print_debug("[Debug] Displaying X:");
|
||||
print_array(X, NY);
|
||||
print_debug("[Debug] Displaying Y:");
|
||||
print_array(Y, NX);
|
||||
#endif
|
||||
|
||||
print_debug("[Verify] Printing...");
|
||||
polybench_prevent_dce(
|
||||
print_array(Y)
|
||||
print_array(Y, NX)
|
||||
);
|
||||
debug("[Verify] Done!")
|
||||
print_debug("[Verify] Done!");
|
||||
|
||||
#else
|
||||
|
||||
debug("[Mode] Host-and-device, CUDA");
|
||||
print_debug("[Mode] Host-and-device, CUDA");
|
||||
|
||||
debug("[Pointers] Allocating...");
|
||||
print_debug("[Pointers] Allocating...");
|
||||
DATA_TYPE* A;
|
||||
DATA_TYPE* X;
|
||||
DATA_TYPE* Y;
|
||||
DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
|
||||
DATA_TYPE* host_X = new DATA_TYPE[NY];
|
||||
DATA_TYPE* host_Y = new DATA_TYPE[NX];
|
||||
|
||||
debug("[CUDA] Allocating A...");
|
||||
if(cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY))
|
||||
print_debug("[CUDA] Allocating A...");
|
||||
if(cudaError_t err = cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY))
|
||||
{
|
||||
debug("[CUDA] Could not allocate A!");
|
||||
print_cudaError(err, "[CUDA] Could not allocate A!");
|
||||
return 1;
|
||||
}
|
||||
debug("[CUDA] Allocated A!");
|
||||
print_debug("[CUDA] Allocated A!");
|
||||
|
||||
debug("[CUDA] Allocating X...");
|
||||
if(cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
|
||||
print_debug("[CUDA] Allocating X...");
|
||||
if(cudaError_t err = cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
|
||||
{
|
||||
debug("[CUDA] Could not allocate X!");
|
||||
print_cudaError(err, "[CUDA] Could not allocate X!");
|
||||
return 1;
|
||||
}
|
||||
debug("[CUDA] Allocated X!");
|
||||
print_debug("[CUDA] Allocated X!");
|
||||
|
||||
debug("[CUDA] Allocating Y...");
|
||||
if(cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
|
||||
print_debug("[CUDA] Allocating Y...");
|
||||
if(cudaError_t err = cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
|
||||
{
|
||||
debug("[CUDA] Could not allocate Y!");
|
||||
print_cudaError(err, "[CUDA] Could not allocate Y!");
|
||||
return 1;
|
||||
}
|
||||
debug("[CUDA] Allocated Y!");
|
||||
print_debug("[CUDA] Allocated Y!");
|
||||
|
||||
#ifdef POLYBENCH_INCLUDE_INIT
|
||||
debug("[Benchmark] Starting...");
|
||||
print_debug("[Benchmark] Starting...");
|
||||
polybench_start_instruments;
|
||||
#endif
|
||||
|
||||
debug("[Init] Initializing...");
|
||||
print_debug("[Init] Initializing...");
|
||||
init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
|
||||
if(cudaGetLastError())
|
||||
if(cudaError_t err = cudaGetLastError())
|
||||
{
|
||||
debug("[Init] Failed to execute kernel!");
|
||||
print_cudaError(err, "[Init] Failed to execute kernel!");
|
||||
return 1;
|
||||
}
|
||||
debug("[Init] Initialized!");
|
||||
print_debug("[Init] Complete!");
|
||||
|
||||
#ifndef POLYBENCH_INCLUDE_INIT
|
||||
debug("[Benchmark] Starting...");
|
||||
print_debug("[Benchmark] Starting...");
|
||||
polybench_start_instruments;
|
||||
#endif
|
||||
|
||||
// kernel_atax_cuda<<<1, 1>>>();
|
||||
print_debug("[Kernel] Running...");
|
||||
kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
|
||||
print_debug("[Kernel] Complete!");
|
||||
|
||||
print_debug("[CUDA] Copying A back...");
|
||||
if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
|
||||
print_cudaError(err, "[CUDA] Could copy A back!");
|
||||
return 1;
|
||||
};
|
||||
print_debug("[CUDA] Copied A back!");
|
||||
|
||||
print_debug("[CUDA] Copying X back...");
|
||||
if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) {
|
||||
print_cudaError(err, "[CUDA] Could copy X back!");
|
||||
return 1;
|
||||
};
|
||||
print_debug("[CUDA] Copied X back!");
|
||||
|
||||
print_debug("[CUDA] Copying Y back...");
|
||||
if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {
|
||||
print_cudaError(err, "[CUDA] Could copy Y back!");
|
||||
return 1;
|
||||
};
|
||||
print_debug("[CUDA] Copied Y back!");
|
||||
|
||||
print_debug("[Benchmark] Stopping...");
|
||||
polybench_stop_instruments;
|
||||
polybench_print_instruments;
|
||||
print_debug("[Benchmark] Complete!");
|
||||
|
||||
// Y = cudaMemcpy();
|
||||
|
||||
debug("[CUDA] Freeing A...");
|
||||
if(cudaFree(A)) {
|
||||
debug("[CUDA] Could not free A!");
|
||||
print_debug("[CUDA] Freeing A...");
|
||||
if(cudaError_t err = cudaFree(A)) {
|
||||
print_cudaError(err, "[CUDA] Could not free A!");
|
||||
return 1;
|
||||
}
|
||||
debug("[CUDA] Freed A!");
|
||||
print_debug("[CUDA] Freed A!");
|
||||
|
||||
debug("[CUDA] Freeing X...");
|
||||
if(cudaFree(X)) {
|
||||
debug("[CUDA] Could not free X!");
|
||||
print_debug("[CUDA] Freeing X...");
|
||||
if(cudaError_t err = cudaFree(X)) {
|
||||
print_cudaError(err, "[CUDA] Could not free X!");
|
||||
return 1;
|
||||
}
|
||||
debug("[CUDA] Freed X!");
|
||||
print_debug("[CUDA] Freed X!");
|
||||
|
||||
debug("[CUDA] Freeing Y...");
|
||||
if(cudaFree(Y)) {
|
||||
debug("[CUDA] Could not free Y!");
|
||||
print_debug("[CUDA] Freeing Y...");
|
||||
if(cudaError_t err = cudaFree(Y)) {
|
||||
print_cudaError(err, "[CUDA] Could not free Y!");
|
||||
return 1;
|
||||
}
|
||||
debug("[CUDA] Freed Y!");
|
||||
print_debug("[CUDA] Freed Y!");
|
||||
|
||||
/*
|
||||
#ifdef HPC_DEBUG
|
||||
print_debug("[Debug] Displaying A:");
|
||||
print_array(host_A, NX * NY);
|
||||
print_debug("[Debug] Displaying X:");
|
||||
print_array(host_X, NY);
|
||||
print_debug("[Debug] Displaying Y:");
|
||||
print_array(host_Y, NX);
|
||||
#endif
|
||||
|
||||
print_debug("[Verify] Printing...");
|
||||
polybench_prevent_dce(
|
||||
print_array(Y)
|
||||
print_array(host_Y, NX)
|
||||
);
|
||||
*/
|
||||
print_debug("[Verify] Done!");
|
||||
|
||||
#endif
|
||||
|
||||
|
|
13
atax/atax.hu
13
atax/atax.hu
|
@ -3,8 +3,12 @@
|
|||
#define ATAX_H
|
||||
|
||||
/* Default to STANDARD_DATASET. */
|
||||
#if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
|
||||
#define STANDARD_DATASET
|
||||
#if !defined(NANO_DATASET) && !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
|
||||
#ifdef HPC_DEBUG
|
||||
#define NANO_DATASET
|
||||
#else
|
||||
#define EXTRALARGE_DATASET
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Do not define anything if the user manually defines the size. */
|
||||
|
@ -12,6 +16,11 @@
|
|||
|
||||
/* Define the possible dataset sizes. */
|
||||
|
||||
#ifdef NANO_DATASET
|
||||
#define NX 3
|
||||
#define NY 5
|
||||
#endif
|
||||
|
||||
#ifdef MINI_DATASET
|
||||
#define NX 32
|
||||
#define NY 32
|
||||
|
|
Loading…
Reference in a new issue