mirror of
https://github.com/Steffo99/unimore-hpc-assignments.git
synced 2024-11-29 03:24:22 +00:00
Get everything to work
This commit is contained in:
parent
ae73536d76
commit
ef30e88e01
22 changed files with 404 additions and 278 deletions
6
.editorconfig
Normal file
6
.editorconfig
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
root = true
|
||||||
|
|
||||||
|
[*]
|
||||||
|
end_of_line = lf
|
||||||
|
insert_final_newline = true
|
||||||
|
indent_style = tab
|
8
.idea/.gitignore
vendored
8
.idea/.gitignore
vendored
|
@ -1,8 +0,0 @@
|
||||||
# Default ignored files
|
|
||||||
/shelf/
|
|
||||||
/workspace.xml
|
|
||||||
# Editor-based HTTP Client requests
|
|
||||||
/httpRequests/
|
|
||||||
# Datasource local storage ignored files
|
|
||||||
/dataSources/
|
|
||||||
/dataSources.local.xml
|
|
|
@ -1,40 +0,0 @@
|
||||||
<component name="ProjectCodeStyleConfiguration">
|
|
||||||
<code_scheme name="Project" version="173">
|
|
||||||
<Objective-C>
|
|
||||||
<option name="INDENT_CLASS_MEMBERS" value="8" />
|
|
||||||
<option name="INDENT_VISIBILITY_KEYWORDS" value="4" />
|
|
||||||
<option name="INDENT_PREPROCESSOR_DIRECTIVE" value="4" />
|
|
||||||
<option name="INDENT_DIRECTIVE_AS_CODE" value="true" />
|
|
||||||
<option name="SPACE_BEFORE_INIT_LIST" value="true" />
|
|
||||||
<option name="SPACE_BEFORE_POINTER_IN_DECLARATION" value="false" />
|
|
||||||
<option name="SPACE_AFTER_POINTER_IN_DECLARATION" value="true" />
|
|
||||||
<option name="SPACE_BEFORE_REFERENCE_IN_DECLARATION" value="false" />
|
|
||||||
<option name="SPACE_AFTER_REFERENCE_IN_DECLARATION" value="true" />
|
|
||||||
<option name="DISCHARGED_SHORT_TERNARY_OPERATOR" value="true" />
|
|
||||||
</Objective-C>
|
|
||||||
<clangFormatSettings>
|
|
||||||
<option name="ENABLED" value="true" />
|
|
||||||
</clangFormatSettings>
|
|
||||||
<files>
|
|
||||||
<extensions>
|
|
||||||
<pair source="cpp" header="hpp" fileNamingConvention="NONE" />
|
|
||||||
<pair source="c" header="h" fileNamingConvention="NONE" />
|
|
||||||
<pair source="cu" header="cuh" fileNamingConvention="NONE" />
|
|
||||||
</extensions>
|
|
||||||
</files>
|
|
||||||
<codeStyleSettings language="Markdown">
|
|
||||||
<indentOptions>
|
|
||||||
<option name="INDENT_SIZE" value="3" />
|
|
||||||
<option name="TAB_SIZE" value="3" />
|
|
||||||
</indentOptions>
|
|
||||||
</codeStyleSettings>
|
|
||||||
<codeStyleSettings language="ObjectiveC">
|
|
||||||
<indentOptions>
|
|
||||||
<option name="INDENT_SIZE" value="2" />
|
|
||||||
<option name="TAB_SIZE" value="2" />
|
|
||||||
<option name="SMART_TABS" value="true" />
|
|
||||||
<option name="KEEP_INDENTS_ON_EMPTY_LINES" value="true" />
|
|
||||||
</indentOptions>
|
|
||||||
</codeStyleSettings>
|
|
||||||
</code_scheme>
|
|
||||||
</component>
|
|
|
@ -1,5 +0,0 @@
|
||||||
<component name="ProjectCodeStyleConfiguration">
|
|
||||||
<state>
|
|
||||||
<option name="USE_PER_PROJECT_SETTINGS" value="true" />
|
|
||||||
</state>
|
|
||||||
</component>
|
|
|
@ -1,15 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="CLionExternalBuildManager">
|
|
||||||
<target id="4d1b5109-3338-4779-bba7-8def1c68abbb" name="All" defaultType="MAKE">
|
|
||||||
<configuration id="c01c91f7-5730-4713-b432-50125cbe22a7" name="All">
|
|
||||||
<build type="MAKE">
|
|
||||||
<make targetName="all" />
|
|
||||||
</build>
|
|
||||||
<clean type="MAKE">
|
|
||||||
<make targetName="clean" />
|
|
||||||
</clean>
|
|
||||||
</configuration>
|
|
||||||
</target>
|
|
||||||
</component>
|
|
||||||
</project>
|
|
|
@ -1,7 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="DiscordProjectSettings">
|
|
||||||
<option name="show" value="PROJECT" />
|
|
||||||
<option name="description" value="" />
|
|
||||||
</component>
|
|
||||||
</project>
|
|
|
@ -1,11 +0,0 @@
|
||||||
<component name="InspectionProjectProfileManager">
|
|
||||||
<profile version="1.0">
|
|
||||||
<option name="myName" value="Project Default" />
|
|
||||||
<inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
|
|
||||||
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
|
|
||||||
<option name="processCode" value="true" />
|
|
||||||
<option name="processLiterals" value="true" />
|
|
||||||
<option name="processComments" value="true" />
|
|
||||||
</inspection_tool>
|
|
||||||
</profile>
|
|
||||||
</component>
|
|
|
@ -1,25 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="MakefileBuildTargetsManager">
|
|
||||||
<user-build-targets>
|
|
||||||
<build-target name="all">
|
|
||||||
<build-configurations>
|
|
||||||
<build-configuration>
|
|
||||||
<make-targets>
|
|
||||||
<make-target>all</make-target>
|
|
||||||
</make-targets>
|
|
||||||
</build-configuration>
|
|
||||||
</build-configurations>
|
|
||||||
</build-target>
|
|
||||||
<build-target name="clean">
|
|
||||||
<build-configurations>
|
|
||||||
<build-configuration>
|
|
||||||
<make-targets>
|
|
||||||
<make-target>clean</make-target>
|
|
||||||
</make-targets>
|
|
||||||
</build-configuration>
|
|
||||||
</build-configurations>
|
|
||||||
</build-target>
|
|
||||||
</user-build-targets>
|
|
||||||
</component>
|
|
||||||
</project>
|
|
|
@ -1,20 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="ExternalStorageConfigurationManager" enabled="true" />
|
|
||||||
<component name="MakefileSettings">
|
|
||||||
<option name="linkedExternalProjectsSettings">
|
|
||||||
<MakefileProjectSettings>
|
|
||||||
<option name="externalProjectPath" value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
|
|
||||||
<option name="modules">
|
|
||||||
<set>
|
|
||||||
<option value="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" />
|
|
||||||
</set>
|
|
||||||
</option>
|
|
||||||
<option name="version" value="2" />
|
|
||||||
</MakefileProjectSettings>
|
|
||||||
</option>
|
|
||||||
</component>
|
|
||||||
<component name="MakefileWorkspace" PROJECT_DIR="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax">
|
|
||||||
<contentRoot DIR="$PROJECT_DIR$" />
|
|
||||||
</component>
|
|
||||||
</project>
|
|
|
@ -1,8 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="ProjectModuleManager">
|
|
||||||
<modules>
|
|
||||||
<module fileurl="file://$PROJECT_DIR$/.idea/unimore-hpc-1.iml" filepath="$PROJECT_DIR$/.idea/unimore-hpc-1.iml" />
|
|
||||||
</modules>
|
|
||||||
</component>
|
|
||||||
</project>
|
|
|
@ -1,7 +0,0 @@
|
||||||
<component name="ProjectRunConfigurationManager">
|
|
||||||
<configuration default="false" name="atax_acc" type="CLionNativeAppRunConfigurationType" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" WORKING_DIR="file://$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" PASS_PARENT_ENVS_2="true" PROJECT_NAME="unimore-hpc-1" TARGET_NAME="all" CONFIG_NAME="all" version="1" RUN_PATH="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax/atax_acc">
|
|
||||||
<method v="2">
|
|
||||||
<option name="CLION.COMPOUND.BUILD" enabled="true" />
|
|
||||||
</method>
|
|
||||||
</configuration>
|
|
||||||
</component>
|
|
|
@ -1,9 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<module type="JAVA_MODULE" version="4">
|
|
||||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
|
||||||
<exclude-output />
|
|
||||||
<content url="file://$MODULE_DIR$" />
|
|
||||||
<orderEntry type="inheritedJdk" />
|
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
|
||||||
</component>
|
|
||||||
</module>
|
|
|
@ -1,2 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<module classpath="External" type="CPP_MODULE" version="4" />
|
|
|
@ -1,6 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project version="4">
|
|
||||||
<component name="VcsDirectoryMappings">
|
|
||||||
<mapping directory="" vcs="Git" />
|
|
||||||
</component>
|
|
||||||
</project>
|
|
106
.idea/workspace.xml
Normal file
106
.idea/workspace.xml
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="AutoImportSettings">
|
||||||
|
<option name="autoReloadType" value="SELECTIVE" />
|
||||||
|
</component>
|
||||||
|
<component name="CMakePresetLoader"><![CDATA[{
|
||||||
|
"useNewFormat": true
|
||||||
|
}]]></component>
|
||||||
|
<component name="CMakeRunConfigurationManager">
|
||||||
|
<generated>
|
||||||
|
<config projectName="unimore-hpc-assignments" targetName="atax.elf" />
|
||||||
|
</generated>
|
||||||
|
</component>
|
||||||
|
<component name="CMakeSettings">
|
||||||
|
<configurations />
|
||||||
|
</component>
|
||||||
|
<component name="ChangeListManager">
|
||||||
|
<list default="true" id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="">
|
||||||
|
<change beforePath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/c_cpp_properties.json" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/.vscode/settings.json" beforeDir="false" afterPath="$PROJECT_DIR$/.vscode/settings.json" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/atax/.bench.sh" beforeDir="false" afterPath="$PROJECT_DIR$/atax/.bench.sh" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/atax/Makefile" beforeDir="false" afterPath="$PROJECT_DIR$/atax/Makefile" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/atax/atax.cu" beforeDir="false" afterPath="$PROJECT_DIR$/atax/atax.cu" afterDir="false" />
|
||||||
|
</list>
|
||||||
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
|
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||||
|
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||||
|
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||||
|
</component>
|
||||||
|
<component name="Git.Settings">
|
||||||
|
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
||||||
|
</component>
|
||||||
|
<component name="MakefileLocalSettings">
|
||||||
|
<option name="projectSyncType">
|
||||||
|
<map>
|
||||||
|
<entry key="$PROJECT_DIR$/OpenMP/linear-algebra/kernels/atax" value="RE_IMPORT" />
|
||||||
|
</map>
|
||||||
|
</option>
|
||||||
|
</component>
|
||||||
|
<component name="MarkdownSettingsMigration">
|
||||||
|
<option name="stateVersion" value="1" />
|
||||||
|
</component>
|
||||||
|
<component name="ProjectId" id="2IKa1Pp6YCa8Ycz7UwOe8DKGQjU" />
|
||||||
|
<component name="ProjectViewState">
|
||||||
|
<option name="hideEmptyMiddlePackages" value="true" />
|
||||||
|
<option name="showLibraryContents" value="true" />
|
||||||
|
</component>
|
||||||
|
<component name="PropertiesComponent"><![CDATA[{
|
||||||
|
"keyToString": {
|
||||||
|
"RunOnceActivity.OpenProjectViewOnStart": "true",
|
||||||
|
"RunOnceActivity.ShowReadmeOnStart": "true",
|
||||||
|
"RunOnceActivity.cidr.known.project.marker": "true",
|
||||||
|
"SHARE_PROJECT_CONFIGURATION_FILES": "true",
|
||||||
|
"WebServerToolWindowFactoryState": "false",
|
||||||
|
"cf.first.check.clang-format": "false",
|
||||||
|
"cidr.known.project.marker": "true",
|
||||||
|
"last_opened_file_path": "/home/steffo/Workspaces/Steffo99/unimore-hpc-assignments/atax",
|
||||||
|
"nodejs_package_manager_path": "npm",
|
||||||
|
"settings.editor.selected.configurable": "MakefileSettings"
|
||||||
|
},
|
||||||
|
"keyToStringList": {
|
||||||
|
"GitStage.ChangesTree.GroupingKeys": [
|
||||||
|
"directory",
|
||||||
|
"module",
|
||||||
|
"repository"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}]]></component>
|
||||||
|
<component name="RunManager">
|
||||||
|
<configuration default="true" type="CLionExternalRunConfiguration" factoryName="Application" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" PASS_PARENT_ENVS_2="true">
|
||||||
|
<method v="2">
|
||||||
|
<option name="CLION.EXTERNAL.BUILD" enabled="true" />
|
||||||
|
</method>
|
||||||
|
</configuration>
|
||||||
|
</component>
|
||||||
|
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
|
||||||
|
<component name="TaskManager">
|
||||||
|
<task active="true" id="Default" summary="Default task">
|
||||||
|
<changelist id="bf144d77-4aec-4d84-b6cb-b699a8ba6326" name="Changes" comment="" />
|
||||||
|
<created>1669932513703</created>
|
||||||
|
<option name="number" value="Default" />
|
||||||
|
<option name="presentableId" value="Default" />
|
||||||
|
<updated>1669932513703</updated>
|
||||||
|
<workItem from="1669932517554" duration="325000" />
|
||||||
|
</task>
|
||||||
|
<servers />
|
||||||
|
</component>
|
||||||
|
<component name="TypeScriptGeneratedFilesManager">
|
||||||
|
<option name="version" value="3" />
|
||||||
|
</component>
|
||||||
|
<component name="Vcs.Log.Tabs.Properties">
|
||||||
|
<option name="TAB_STATES">
|
||||||
|
<map>
|
||||||
|
<entry key="MAIN">
|
||||||
|
<value>
|
||||||
|
<State />
|
||||||
|
</value>
|
||||||
|
</entry>
|
||||||
|
</map>
|
||||||
|
</option>
|
||||||
|
</component>
|
||||||
|
<component name="XSLT-Support.FileAssociations.UIState">
|
||||||
|
<expand />
|
||||||
|
<select />
|
||||||
|
</component>
|
||||||
|
</project>
|
2
.vscode/c_cpp_properties.json
vendored
2
.vscode/c_cpp_properties.json
vendored
|
@ -6,7 +6,7 @@
|
||||||
"${workspaceFolder}/**"
|
"${workspaceFolder}/**"
|
||||||
],
|
],
|
||||||
"defines": [],
|
"defines": [],
|
||||||
"compilerPath": "/usr/local/cuda-10.0/bin/nvcc",
|
"compilerPath": "/opt/cuda/bin/nvcc",
|
||||||
"cStandard": "c11",
|
"cStandard": "c11",
|
||||||
"cppStandard": "c++14",
|
"cppStandard": "c++14",
|
||||||
"configurationProvider": "ms-vscode.makefile-tools"
|
"configurationProvider": "ms-vscode.makefile-tools"
|
||||||
|
|
1
.vscode/launch.json
vendored
1
.vscode/launch.json
vendored
|
@ -15,6 +15,7 @@
|
||||||
"environment": [],
|
"environment": [],
|
||||||
"externalConsole": false,
|
"externalConsole": false,
|
||||||
"MIMode": "gdb",
|
"MIMode": "gdb",
|
||||||
|
"miDebuggerPath": "/usr/bin/gdb",
|
||||||
"setupCommands": [
|
"setupCommands": [
|
||||||
{
|
{
|
||||||
"description": "Enable pretty-printing for gdb",
|
"description": "Enable pretty-printing for gdb",
|
||||||
|
|
36
.vscode/settings.json
vendored
36
.vscode/settings.json
vendored
|
@ -1,5 +1,39 @@
|
||||||
{
|
{
|
||||||
"files.associations": {
|
"files.associations": {
|
||||||
"*.hu": "cuda-cpp"
|
"*.hu": "cuda-cpp",
|
||||||
|
"array": "cpp",
|
||||||
|
"*.tcc": "cpp",
|
||||||
|
"cctype": "cpp",
|
||||||
|
"clocale": "cpp",
|
||||||
|
"cmath": "cpp",
|
||||||
|
"compare": "cpp",
|
||||||
|
"concepts": "cpp",
|
||||||
|
"cstdarg": "cpp",
|
||||||
|
"cstdint": "cpp",
|
||||||
|
"cstdio": "cpp",
|
||||||
|
"cstdlib": "cpp",
|
||||||
|
"cwchar": "cpp",
|
||||||
|
"cwctype": "cpp",
|
||||||
|
"unordered_map": "cpp",
|
||||||
|
"vector": "cpp",
|
||||||
|
"exception": "cpp",
|
||||||
|
"functional": "cpp",
|
||||||
|
"initializer_list": "cpp",
|
||||||
|
"iosfwd": "cpp",
|
||||||
|
"iostream": "cpp",
|
||||||
|
"istream": "cpp",
|
||||||
|
"limits": "cpp",
|
||||||
|
"new": "cpp",
|
||||||
|
"numbers": "cpp",
|
||||||
|
"ostream": "cpp",
|
||||||
|
"stdexcept": "cpp",
|
||||||
|
"streambuf": "cpp",
|
||||||
|
"string": "cpp",
|
||||||
|
"string_view": "cpp",
|
||||||
|
"system_error": "cpp",
|
||||||
|
"tuple": "cpp",
|
||||||
|
"type_traits": "cpp",
|
||||||
|
"typeinfo": "cpp",
|
||||||
|
"utility": "cpp"
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -1,12 +1,12 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
run_benchmarks() {
|
run_benchmarks() {
|
||||||
runs=25
|
runs=3
|
||||||
totalt=0.0
|
totalt=0.0
|
||||||
|
|
||||||
for i in $(seq $runs)
|
for i in $(seq $runs)
|
||||||
do
|
do
|
||||||
exet=$(./atax.elf)
|
exet=$(./atax.elf 2> /dev/null)
|
||||||
totalt=$(awk "BEGIN{print $totalt+$exet}")
|
totalt=$(awk "BEGIN{print $totalt+$exet}")
|
||||||
echo -n "."
|
echo -n "."
|
||||||
# echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
|
# echo "Run #$i: " $(awk "BEGIN{printf(\"%.3g\", $exet)}") "seconds"
|
||||||
|
@ -16,9 +16,9 @@ run_benchmarks() {
|
||||||
echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
|
echo " Average of $runs runs: " $(awk "BEGIN{printf(\"%.3g\", $avgt)}") "seconds"
|
||||||
}
|
}
|
||||||
|
|
||||||
for dataset in MINI_DATASET SMALL_DATASET STANDARD_DATASET LARGE_DATASET EXTRALARGE_DATASET
|
for dataset in EXTRALARGE_DATASET LARGE_DATASET STANDARD_DATASET SMALL_DATASET MINI_DATASET
|
||||||
do
|
do
|
||||||
for c in $(seq 0 7)
|
for c in $(seq 0 3)
|
||||||
do
|
do
|
||||||
cxxflags="-D$dataset"
|
cxxflags="-D$dataset"
|
||||||
|
|
||||||
|
@ -32,12 +32,8 @@ do
|
||||||
cxxflags="$cxxflags -DHPC_USE_CUDA"
|
cxxflags="$cxxflags -DHPC_USE_CUDA"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if (( $c & 2 ))
|
|
||||||
then
|
|
||||||
cxxflags="$cxxflags -DHPC_USE_STRIDE"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Flags: $cxxflags"
|
echo "Flags: $cxxflags"
|
||||||
|
make --silent "clean"
|
||||||
make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
|
make --silent "EXTRA_CXXFLAGS=$cxxflags" "atax.elf"
|
||||||
|
|
||||||
run_benchmarks
|
run_benchmarks
|
||||||
|
|
|
@ -5,16 +5,25 @@ MAKEFLAGS+= -r
|
||||||
CXXFLAGS+= -DPOLYBENCH_TIME
|
CXXFLAGS+= -DPOLYBENCH_TIME
|
||||||
# -O3 applies all compiler optimization, improving from 800ms to 300ms
|
# -O3 applies all compiler optimization, improving from 800ms to 300ms
|
||||||
CXXFLAGS+= -O3
|
CXXFLAGS+= -O3
|
||||||
|
# Enable this to view the contents of the arrays
|
||||||
|
CXXFLAGS+= -DHPC_DEBUG
|
||||||
|
# Enable this to use CUDA
|
||||||
|
CXXFLAGS+= -DHPC_USE_CUDA
|
||||||
# Extend CFLAGS with command line parameters
|
# Extend CFLAGS with command line parameters
|
||||||
CXXFLAGS+= ${EXTRA_CXXFLAGS}
|
CXXFLAGS+= ${EXTRA_CXXFLAGS}
|
||||||
|
|
||||||
# Select the location of the local CUDA install
|
# Select the location of the local CUDA install
|
||||||
CUDA_HOME:=/usr/local/cuda-10.0
|
# CUDA_HOME:=/usr/local/cuda-10.0
|
||||||
|
CUDA_HOME:=/opt/cuda
|
||||||
# Specify the directory of the nvc compiler
|
# Specify the directory of the nvc compiler
|
||||||
NVCC:=$(CUDA_HOME)/bin/nvcc
|
NVCC:=$(CUDA_HOME)/bin/nvcc
|
||||||
# Specify the flags for the nvc compiler
|
# Specify the flags for the nvc compiler
|
||||||
NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
|
NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
|
||||||
|
|
||||||
|
# Optimize for @Steffo's NVIDIA GTX 1070
|
||||||
|
NVCFLAGS+= -arch=compute_61
|
||||||
|
NVCFLAGS+= -code=sm_61
|
||||||
|
|
||||||
|
|
||||||
%.elf: %.cu.o polybench.cu.o
|
%.elf: %.cu.o polybench.cu.o
|
||||||
$(NVCC) $(NVCFLAGS) $^ -o $@ $(LDFLAGS)
|
$(NVCC) $(NVCFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
@ -23,13 +32,12 @@ NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
|
||||||
$(NVCC) $(NVCFLAGS) -c $< -o $@
|
$(NVCC) $(NVCFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
|
||||||
.PHONY: bench clean dev
|
all: atax.elf
|
||||||
|
|
||||||
dev: atax.elf
|
.PHONY: bench clean
|
||||||
./atax.elf
|
|
||||||
|
|
||||||
bench:
|
bench:
|
||||||
./.bench.sh
|
./.bench.sh
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm *.elf *.cu.o
|
rm *.elf
|
||||||
|
|
323
atax/atax.cu
323
atax/atax.cu
|
@ -3,6 +3,7 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
/* Include polybench common header. */
|
/* Include polybench common header. */
|
||||||
#include "polybench.hu"
|
#include "polybench.hu"
|
||||||
|
@ -23,16 +24,34 @@
|
||||||
#define CUDA_NTHREADS 128
|
#define CUDA_NTHREADS 128
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Enable syntax highlighting for the CUDA mode
|
|
||||||
// TODO: Remove this, as it will be set by .bench.sh
|
|
||||||
#define HPC_USE_CUDA
|
|
||||||
|
|
||||||
// Enable syntax highlighting for the stride mode
|
/**
|
||||||
// TODO: Remove this, as it will be set by .bench.sh
|
* Given a `x` and a `y`, compute the relative index of the element in the `A` matrix.
|
||||||
#define HPC_USE_STRIDE
|
*/
|
||||||
|
__host__ __device__ inline static unsigned int a_index(unsigned int x, unsigned int y) {
|
||||||
|
return x * NY + y;
|
||||||
|
}
|
||||||
|
|
||||||
// Create macro for debug logging
|
/**
|
||||||
#define debug(txt) std::cerr << txt << std::endl
|
* Log a debug message.
|
||||||
|
*/
|
||||||
|
__host__ inline static void print_debug(std::string txt) {
|
||||||
|
#ifdef HPC_DEBUG
|
||||||
|
std::cerr << txt << std::endl;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Log an error message.
|
||||||
|
*/
|
||||||
|
#ifdef HPC_USE_CUDA
|
||||||
|
__host__ inline static void print_cudaError(cudaError_t err, std::string txt) {
|
||||||
|
#ifdef HPC_DEBUG
|
||||||
|
std::cerr << txt;
|
||||||
|
fprintf( stderr, ": error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(err) );
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -45,7 +64,7 @@
|
||||||
* To be called on the CPU (uses the `__host__` qualifier).
|
* To be called on the CPU (uses the `__host__` qualifier).
|
||||||
*/
|
*/
|
||||||
#ifndef HPC_USE_CUDA
|
#ifndef HPC_USE_CUDA
|
||||||
__host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
__host__ static void init_array(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||||
{
|
{
|
||||||
/* X = [ 3.14, 6.28, 9.42, ... ] */
|
/* X = [ 3.14, 6.28, 9.42, ... ] */
|
||||||
for (unsigned int y = 0; y < NY; y++)
|
for (unsigned int y = 0; y < NY; y++)
|
||||||
|
@ -72,7 +91,7 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||||
{
|
{
|
||||||
for (unsigned int y = 0; y < NY; y++)
|
for (unsigned int y = 0; y < NY; y++)
|
||||||
{
|
{
|
||||||
A[x][y] = (DATA_TYPE)(x * (y + 1)) / NX;
|
A[a_index(x, y)] = (DATA_TYPE)(x * (y + 1)) / NX;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -87,20 +106,21 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||||
__device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
|
__device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
|
||||||
{
|
{
|
||||||
// Find how many iterations should be performed by each thread
|
// Find how many iterations should be performed by each thread
|
||||||
unsigned int perThread = NY / threads;
|
unsigned int perThread = NY / threads + 1;
|
||||||
|
|
||||||
// Find the index of the current thread, even if threads span multiple blocks
|
// Find the index of the current thread, even if threads span multiple blocks
|
||||||
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
// Have each thread perform the previously determined number of iterations
|
// Have each thread perform the previously determined number of iterations
|
||||||
for(int stride = 0; stride < perThread; stride++) {
|
for(int stride = 0; stride < perThread; stride++)
|
||||||
|
{
|
||||||
// Find the index of the current iteration
|
// Find the index of the current iteration
|
||||||
// This is equal to `y` of the init_array function
|
// This is equal to `y` of the init_array function
|
||||||
int iterationIdx = blockThreadIdx * stride;
|
unsigned int iterationIdx = threads * stride + blockThreadIdx;
|
||||||
|
|
||||||
// Prevent the thread from accessing unallocated memory
|
// Prevent the thread from accessing unallocated memory
|
||||||
if(iterationIdx < NY) {
|
if(iterationIdx < NY)
|
||||||
|
{
|
||||||
// Set the array element
|
// Set the array element
|
||||||
X[iterationIdx] = iterationIdx * M_PI;
|
X[iterationIdx] = iterationIdx * M_PI;
|
||||||
}
|
}
|
||||||
|
@ -117,20 +137,21 @@ __device__ static void init_array_cuda_x(DATA_TYPE* X, unsigned int threads)
|
||||||
__device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
|
__device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
|
||||||
{
|
{
|
||||||
// Find how many iterations should be performed by each thread
|
// Find how many iterations should be performed by each thread
|
||||||
unsigned int perThread = NX / threads;
|
unsigned int perThread = NX / threads + 1;
|
||||||
|
|
||||||
// Find the index of the current thread, even if threads span multiple blocks
|
// Find the index of the current thread, even if threads span multiple blocks
|
||||||
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
// Have each thread perform the previously determined number of iterations
|
// Have each thread perform the previously determined number of iterations
|
||||||
for(int stride = 0; stride < perThread; stride++) {
|
for(int stride = 0; stride < perThread; stride++)
|
||||||
|
{
|
||||||
// Find the index of the current iteration
|
// Find the index of the current iteration
|
||||||
// This is equal to `y` of the init_array function
|
// This is equal to `y` of the init_array function
|
||||||
int iterationIdx = blockThreadIdx * stride;
|
unsigned int iterationIdx = threads * stride + blockThreadIdx;
|
||||||
|
|
||||||
// Prevent the thread from accessing unallocated memory
|
// Prevent the thread from accessing unallocated memory
|
||||||
if(iterationIdx < NX) {
|
if(iterationIdx < NX)
|
||||||
|
{
|
||||||
// Set the array element
|
// Set the array element
|
||||||
Y[iterationIdx] = 0;
|
Y[iterationIdx] = 0;
|
||||||
}
|
}
|
||||||
|
@ -150,12 +171,29 @@ __device__ static void init_array_cuda_a(DATA_TYPE* A, unsigned int threads)
|
||||||
unsigned int elements = NX * NY;
|
unsigned int elements = NX * NY;
|
||||||
|
|
||||||
// Find how many iterations should be performed by each thread
|
// Find how many iterations should be performed by each thread
|
||||||
unsigned int perThread = elements / threads;
|
unsigned int perThread = elements / threads + 1;
|
||||||
|
|
||||||
// Find the index of the current thread, even if threads span multiple blocks
|
// Find the index of the current thread, even if threads span multiple blocks
|
||||||
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
/* TODO */
|
// Have each thread perform the previously determined number of iterations
|
||||||
|
for(int stride = 0; stride < perThread; stride++)
|
||||||
|
{
|
||||||
|
// Find the index of the current iteration
|
||||||
|
// This is equal to `y` of the init_array function
|
||||||
|
unsigned int iterationIdx = threads * stride + blockThreadIdx;
|
||||||
|
|
||||||
|
// Determine current x and y
|
||||||
|
unsigned int y = iterationIdx % NY;
|
||||||
|
unsigned int x = iterationIdx / NY;
|
||||||
|
|
||||||
|
// Prevent the thread from accessing unallocated memory
|
||||||
|
if(iterationIdx < elements)
|
||||||
|
{
|
||||||
|
// Set the array element
|
||||||
|
A[iterationIdx] = (DATA_TYPE)(x * (y + 1)) / NX;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -188,11 +226,11 @@ __global__ static void init_array_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||||
*
|
*
|
||||||
* To be called on the CPU (uses the `__host__` qualifier).
|
* To be called on the CPU (uses the `__host__` qualifier).
|
||||||
*/
|
*/
|
||||||
__host__ static void print_array(DATA_TYPE* Y)
|
__host__ static void print_array(DATA_TYPE* Z, unsigned int size)
|
||||||
{
|
{
|
||||||
for (unsigned int x = 0; x < NX; x++)
|
for (unsigned int z = 0; z < size; z++)
|
||||||
{
|
{
|
||||||
fprintf(stderr, DATA_PRINTF_MODIFIER, Y[x]);
|
fprintf(stderr, DATA_PRINTF_MODIFIER, Z[z]);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
@ -212,25 +250,79 @@ __host__ static void print_array(DATA_TYPE* Y)
|
||||||
*
|
*
|
||||||
* Parallelizing this is the goal of the assignment.
|
* Parallelizing this is the goal of the assignment.
|
||||||
*
|
*
|
||||||
* Currently to be called on the CPU (uses the `__host__` qualifier), but we may probably want to change that soon.
|
* To be called on the CPU (uses the `__host__` qualifier).
|
||||||
*/
|
*/
|
||||||
__host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
#ifndef HPC_USE_CUDA
|
||||||
|
__host__ static void kernel_atax(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||||
{
|
{
|
||||||
for (unsigned int x = 0; x < NX; x++)
|
for (unsigned int x = 0; x < NY; x++)
|
||||||
{
|
{
|
||||||
DATA_TYPE tmp = 0;
|
DATA_TYPE tmp = 0;
|
||||||
|
|
||||||
for (unsigned int y = 0; y < NY; y++)
|
for (unsigned int y = 0; y < NX; y++)
|
||||||
{
|
{
|
||||||
tmp += A[x][y] * X[y];
|
tmp += A[a_index(x, y)] * X[y];
|
||||||
}
|
}
|
||||||
|
|
||||||
for (unsigned int y = 0; y < NY; y++)
|
for (unsigned int y = 0; y < NX; y++)
|
||||||
{
|
{
|
||||||
Y[y] += A[x][y] * tmp;
|
Y[x] += A[a_index(x, y)] * tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compute ATAX :
|
||||||
|
* - A is the input matrix
|
||||||
|
* - X is an input vector
|
||||||
|
* - Y is the result vector
|
||||||
|
*
|
||||||
|
* In particular:
|
||||||
|
* ```
|
||||||
|
* A * (A * X) = Y
|
||||||
|
* ```
|
||||||
|
* Wait, there's no transposition here?!?
|
||||||
|
*
|
||||||
|
* Parallelizing this is the goal of the assignment.
|
||||||
|
*
|
||||||
|
* To be called on the device as a kernel (uses the `__global__` qualifier).
|
||||||
|
*/
|
||||||
|
#ifdef HPC_USE_CUDA
|
||||||
|
__global__ static void kernel_atax_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||||
|
{
|
||||||
|
// Find out how many threads there are
|
||||||
|
unsigned int threads = gridDim.x * blockDim.x;
|
||||||
|
|
||||||
|
// Find how many iterations should be performed by each thread
|
||||||
|
unsigned int perThread = NX / threads + 1;
|
||||||
|
|
||||||
|
// Find the index of the current thread, even if threads span multiple blocks
|
||||||
|
unsigned int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
|
// Have each thread perform the previously determined number of iterations
|
||||||
|
for(int stride = 0; stride < perThread; stride++)
|
||||||
|
{
|
||||||
|
unsigned int x = threads * stride + blockThreadIdx;
|
||||||
|
|
||||||
|
if(x < NX)
|
||||||
|
{
|
||||||
|
DATA_TYPE tmp = 0;
|
||||||
|
|
||||||
|
for (unsigned int y = 0; y < NX; y++)
|
||||||
|
{
|
||||||
|
tmp += A[a_index(x, y)] * X[y];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (unsigned int y = 0; y < NX; y++)
|
||||||
|
{
|
||||||
|
atomicAdd(&Y[x], A[a_index(x, y)] * tmp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -240,143 +332,180 @@ __host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
|
||||||
*/
|
*/
|
||||||
__host__ int main(int argc, char** argv)
|
__host__ int main(int argc, char** argv)
|
||||||
{
|
{
|
||||||
debug("Starting main...");
|
print_debug("[Main] Starting...");
|
||||||
|
std::cerr << "[Main] NX is: " << NX << std::endl;
|
||||||
|
std::cerr << "[Main] NY is: " << NY << std::endl;
|
||||||
|
|
||||||
#ifndef HPC_USE_CUDA
|
#ifndef HPC_USE_CUDA
|
||||||
|
|
||||||
debug("[Mode] Host-only");
|
print_debug("[Mode] Host-only");
|
||||||
|
|
||||||
debug("[Pointers] Allocating...");
|
print_debug("[Pointers] Allocating...");
|
||||||
|
|
||||||
// A[NX][NY]
|
DATA_TYPE* A = new DATA_TYPE[NX * NY];
|
||||||
DATA_TYPE** A = new DATA_TYPE*[NX] {};
|
DATA_TYPE* X = new DATA_TYPE[NY];
|
||||||
for(unsigned int x = 0; x < NX; x++)
|
DATA_TYPE* Y = new DATA_TYPE[NX];
|
||||||
{
|
|
||||||
A[x] = new DATA_TYPE[NY] {};
|
|
||||||
}
|
|
||||||
|
|
||||||
// X[NY]
|
print_debug("[Pointers] Allocated!");
|
||||||
DATA_TYPE* X = new DATA_TYPE[NY] {};
|
|
||||||
|
|
||||||
// Y[NX]
|
|
||||||
DATA_TYPE* Y = new DATA_TYPE[NX] {};
|
|
||||||
|
|
||||||
debug("[Pointers] Allocated!");
|
|
||||||
|
|
||||||
#ifdef HPC_INCLUDE_INIT
|
#ifdef HPC_INCLUDE_INIT
|
||||||
debug("[Benchmark] Starting...");
|
print_debug("[Benchmark] Starting...");
|
||||||
polybench_start_instruments;
|
polybench_start_instruments;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
debug("[Init] Initializing...");
|
print_debug("[Init] Initializing...");
|
||||||
init_array(A, X, Y);
|
init_array(A, X, Y);
|
||||||
debug("[Init] Initialized!");
|
print_debug("[Init] Initialized!");
|
||||||
|
|
||||||
#ifndef HPC_INCLUDE_INIT
|
#ifndef HPC_INCLUDE_INIT
|
||||||
debug("[Benchmark] Starting...");
|
print_debug("[Benchmark] Starting...");
|
||||||
polybench_start_instruments;
|
polybench_start_instruments;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
debug("[Kernel] Running...");
|
print_debug("[Kernel] Running...");
|
||||||
kernel_atax(A, X, Y);
|
kernel_atax(A, X, Y);
|
||||||
debug("[Kernel] Completed!");
|
print_debug("[Kernel] Completed!");
|
||||||
|
|
||||||
debug("[Benchmark] Stopping...");
|
print_debug("[Benchmark] Stopping...");
|
||||||
polybench_stop_instruments;
|
polybench_stop_instruments;
|
||||||
polybench_print_instruments;
|
polybench_print_instruments;
|
||||||
debug("[Benchmark] Complete!");
|
print_debug("[Benchmark] Complete!");
|
||||||
|
|
||||||
debug("[Verify] Printing...")
|
#ifdef HPC_DEBUG
|
||||||
|
print_debug("[Debug] Displaying A:");
|
||||||
|
print_array(A, NX * NY);
|
||||||
|
print_debug("[Debug] Displaying X:");
|
||||||
|
print_array(X, NY);
|
||||||
|
print_debug("[Debug] Displaying Y:");
|
||||||
|
print_array(Y, NX);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
print_debug("[Verify] Printing...");
|
||||||
polybench_prevent_dce(
|
polybench_prevent_dce(
|
||||||
print_array(Y)
|
print_array(Y, NX)
|
||||||
);
|
);
|
||||||
debug("[Verify] Done!")
|
print_debug("[Verify] Done!");
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
debug("[Mode] Host-and-device, CUDA");
|
print_debug("[Mode] Host-and-device, CUDA");
|
||||||
|
|
||||||
debug("[Pointers] Allocating...");
|
print_debug("[Pointers] Allocating...");
|
||||||
DATA_TYPE* A;
|
DATA_TYPE* A;
|
||||||
DATA_TYPE* X;
|
DATA_TYPE* X;
|
||||||
DATA_TYPE* Y;
|
DATA_TYPE* Y;
|
||||||
|
DATA_TYPE* host_A = new DATA_TYPE[NX * NY];
|
||||||
|
DATA_TYPE* host_X = new DATA_TYPE[NY];
|
||||||
|
DATA_TYPE* host_Y = new DATA_TYPE[NX];
|
||||||
|
|
||||||
debug("[CUDA] Allocating A...");
|
print_debug("[CUDA] Allocating A...");
|
||||||
if(cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY))
|
if(cudaError_t err = cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY))
|
||||||
{
|
{
|
||||||
debug("[CUDA] Could not allocate A!");
|
print_cudaError(err, "[CUDA] Could not allocate A!");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
debug("[CUDA] Allocated A!");
|
print_debug("[CUDA] Allocated A!");
|
||||||
|
|
||||||
debug("[CUDA] Allocating X...");
|
print_debug("[CUDA] Allocating X...");
|
||||||
if(cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
|
if(cudaError_t err = cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
|
||||||
{
|
{
|
||||||
debug("[CUDA] Could not allocate X!");
|
print_cudaError(err, "[CUDA] Could not allocate X!");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
debug("[CUDA] Allocated X!");
|
print_debug("[CUDA] Allocated X!");
|
||||||
|
|
||||||
debug("[CUDA] Allocating Y...");
|
print_debug("[CUDA] Allocating Y...");
|
||||||
if(cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
|
if(cudaError_t err = cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
|
||||||
{
|
{
|
||||||
debug("[CUDA] Could not allocate Y!");
|
print_cudaError(err, "[CUDA] Could not allocate Y!");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
debug("[CUDA] Allocated Y!");
|
print_debug("[CUDA] Allocated Y!");
|
||||||
|
|
||||||
#ifdef POLYBENCH_INCLUDE_INIT
|
#ifdef POLYBENCH_INCLUDE_INIT
|
||||||
debug("[Benchmark] Starting...");
|
print_debug("[Benchmark] Starting...");
|
||||||
polybench_start_instruments;
|
polybench_start_instruments;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
debug("[Init] Initializing...");
|
print_debug("[Init] Initializing...");
|
||||||
init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
|
init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
|
||||||
if(cudaGetLastError())
|
if(cudaError_t err = cudaGetLastError())
|
||||||
{
|
{
|
||||||
debug("[Init] Failed to execute kernel!");
|
print_cudaError(err, "[Init] Failed to execute kernel!");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
debug("[Init] Initialized!");
|
print_debug("[Init] Complete!");
|
||||||
|
|
||||||
#ifndef POLYBENCH_INCLUDE_INIT
|
#ifndef POLYBENCH_INCLUDE_INIT
|
||||||
debug("[Benchmark] Starting...");
|
print_debug("[Benchmark] Starting...");
|
||||||
polybench_start_instruments;
|
polybench_start_instruments;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// kernel_atax_cuda<<<1, 1>>>();
|
print_debug("[Kernel] Running...");
|
||||||
|
kernel_atax_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
|
||||||
|
print_debug("[Kernel] Complete!");
|
||||||
|
|
||||||
|
print_debug("[CUDA] Copying A back...");
|
||||||
|
if(cudaError_t err = cudaMemcpy(host_A, A, sizeof(DATA_TYPE) * NX * NY, cudaMemcpyDeviceToHost)) {
|
||||||
|
print_cudaError(err, "[CUDA] Could copy A back!");
|
||||||
|
return 1;
|
||||||
|
};
|
||||||
|
print_debug("[CUDA] Copied A back!");
|
||||||
|
|
||||||
|
print_debug("[CUDA] Copying X back...");
|
||||||
|
if(cudaError_t err = cudaMemcpy(host_X, X, sizeof(DATA_TYPE) * NY, cudaMemcpyDeviceToHost)) {
|
||||||
|
print_cudaError(err, "[CUDA] Could copy X back!");
|
||||||
|
return 1;
|
||||||
|
};
|
||||||
|
print_debug("[CUDA] Copied X back!");
|
||||||
|
|
||||||
|
print_debug("[CUDA] Copying Y back...");
|
||||||
|
if(cudaError_t err = cudaMemcpy(host_Y, Y, sizeof(DATA_TYPE) * NX, cudaMemcpyDeviceToHost)) {
|
||||||
|
print_cudaError(err, "[CUDA] Could copy Y back!");
|
||||||
|
return 1;
|
||||||
|
};
|
||||||
|
print_debug("[CUDA] Copied Y back!");
|
||||||
|
|
||||||
|
print_debug("[Benchmark] Stopping...");
|
||||||
polybench_stop_instruments;
|
polybench_stop_instruments;
|
||||||
polybench_print_instruments;
|
polybench_print_instruments;
|
||||||
|
print_debug("[Benchmark] Complete!");
|
||||||
|
|
||||||
// Y = cudaMemcpy();
|
print_debug("[CUDA] Freeing A...");
|
||||||
|
if(cudaError_t err = cudaFree(A)) {
|
||||||
debug("[CUDA] Freeing A...");
|
print_cudaError(err, "[CUDA] Could not free A!");
|
||||||
if(cudaFree(A)) {
|
|
||||||
debug("[CUDA] Could not free A!");
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
debug("[CUDA] Freed A!");
|
print_debug("[CUDA] Freed A!");
|
||||||
|
|
||||||
debug("[CUDA] Freeing X...");
|
print_debug("[CUDA] Freeing X...");
|
||||||
if(cudaFree(X)) {
|
if(cudaError_t err = cudaFree(X)) {
|
||||||
debug("[CUDA] Could not free X!");
|
print_cudaError(err, "[CUDA] Could not free X!");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
debug("[CUDA] Freed X!");
|
print_debug("[CUDA] Freed X!");
|
||||||
|
|
||||||
debug("[CUDA] Freeing Y...");
|
print_debug("[CUDA] Freeing Y...");
|
||||||
if(cudaFree(Y)) {
|
if(cudaError_t err = cudaFree(Y)) {
|
||||||
debug("[CUDA] Could not free Y!");
|
print_cudaError(err, "[CUDA] Could not free Y!");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
debug("[CUDA] Freed Y!");
|
print_debug("[CUDA] Freed Y!");
|
||||||
|
|
||||||
/*
|
#ifdef HPC_DEBUG
|
||||||
|
print_debug("[Debug] Displaying A:");
|
||||||
|
print_array(host_A, NX * NY);
|
||||||
|
print_debug("[Debug] Displaying X:");
|
||||||
|
print_array(host_X, NY);
|
||||||
|
print_debug("[Debug] Displaying Y:");
|
||||||
|
print_array(host_Y, NX);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
print_debug("[Verify] Printing...");
|
||||||
polybench_prevent_dce(
|
polybench_prevent_dce(
|
||||||
print_array(Y)
|
print_array(host_Y, NX)
|
||||||
);
|
);
|
||||||
*/
|
print_debug("[Verify] Done!");
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
13
atax/atax.hu
13
atax/atax.hu
|
@ -3,8 +3,12 @@
|
||||||
#define ATAX_H
|
#define ATAX_H
|
||||||
|
|
||||||
/* Default to STANDARD_DATASET. */
|
/* Default to STANDARD_DATASET. */
|
||||||
#if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
|
#if !defined(NANO_DATASET) && !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(STANDARD_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
|
||||||
#define STANDARD_DATASET
|
#ifdef HPC_DEBUG
|
||||||
|
#define NANO_DATASET
|
||||||
|
#else
|
||||||
|
#define EXTRALARGE_DATASET
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Do not define anything if the user manually defines the size. */
|
/* Do not define anything if the user manually defines the size. */
|
||||||
|
@ -12,6 +16,11 @@
|
||||||
|
|
||||||
/* Define the possible dataset sizes. */
|
/* Define the possible dataset sizes. */
|
||||||
|
|
||||||
|
#ifdef NANO_DATASET
|
||||||
|
#define NX 3
|
||||||
|
#define NY 5
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef MINI_DATASET
|
#ifdef MINI_DATASET
|
||||||
#define NX 32
|
#define NX 32
|
||||||
#define NY 32
|
#define NY 32
|
||||||
|
|
Loading…
Reference in a new issue