From 0d0b0f04cfb07379455b1ae2f5feb41a0cab57a0 Mon Sep 17 00:00:00 2001 From: Alessandro Capotondi Date: Wed, 28 Apr 2021 11:09:08 +0200 Subject: [PATCH] HPC CUDA Lab 1 --- README.md | 3 + cuda/lab1/.solutions/exercise1.cu | 76 ++++++++++ cuda/lab1/.solutions/exercise2.cu | 175 ++++++++++++++++++++++ cuda/lab1/.solutions/exercise3-v1.cu | 193 +++++++++++++++++++++++++ cuda/lab1/.solutions/exercise3-v2.cu | 197 +++++++++++++++++++++++++ cuda/lab1/.solutions/exercise3-v3.cu | 208 +++++++++++++++++++++++++++ cuda/lab1/Makefile | 55 +++++++ cuda/lab1/exercise1.cu | 76 ++++++++++ cuda/lab1/exercise2.cu | 175 ++++++++++++++++++++++ cuda/lab1/exercise3.cu | 190 ++++++++++++++++++++++++ cuda/lab1/utils.c | 138 ++++++++++++++++++ cuda/lab1/utils.h | 142 ++++++++++++++++++ 12 files changed, 1628 insertions(+) create mode 100644 cuda/lab1/.solutions/exercise1.cu create mode 100644 cuda/lab1/.solutions/exercise2.cu create mode 100644 cuda/lab1/.solutions/exercise3-v1.cu create mode 100644 cuda/lab1/.solutions/exercise3-v2.cu create mode 100644 cuda/lab1/.solutions/exercise3-v3.cu create mode 100644 cuda/lab1/Makefile create mode 100644 cuda/lab1/exercise1.cu create mode 100644 cuda/lab1/exercise2.cu create mode 100644 cuda/lab1/exercise3.cu create mode 100644 cuda/lab1/utils.c create mode 100644 cuda/lab1/utils.h diff --git a/README.md b/README.md index 6387020..fb28203 100644 --- a/README.md +++ b/README.md @@ -13,3 +13,6 @@ The exercises related to OpenMP programming model can be found in the folder `op - `openmp\lab1`: OpenMP basics: *parallel*, *for-loop*, *sections*, and *tasking*. - `openmp\lab2`: OpenMP Advanced: *reduction*, *tasking*, *optimizations*. - `openmp\lab3`: OpenMP 4.x+: *Accelerator Model (targeting: Nvidia GP-GPU)* + +### CUDA Exercises +- `cuda\lab1`: CUDA Basics. 
diff --git a/cuda/lab1/.solutions/exercise1.cu b/cuda/lab1/.solutions/exercise1.cu new file mode 100644 index 0000000..c215ea2 --- /dev/null +++ b/cuda/lab1/.solutions/exercise1.cu @@ -0,0 +1,76 @@ +/* + * BSD 2-Clause License + * + * Copyright (c) 2020, Alessandro Capotondi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file exercise1.c + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief Exercise 1 + * + * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152 + */ + +#include +#include +#include +#include + +#include + +/** + * @brief EX 1 - Launch CUDA kernel to "print" helloworld + * + * a) Detect global thread id. 
(tip: use threadIdx.x, blockDim.x, and blockIdx.x) + * b) Explore thread execution and schedule changing: N={8, 16, 32} and M={4,8,16} + * + * @return void + */ + +__global__ void helloworld(void) +{ + int gid = threadIdx.x + blockDim.x * blockIdx.x; + printf("Hello world, I am global thread %d (threadIdx=%d, blockIdx=%d, blockDim=%d)\n", + gid, threadIdx.x, blockIdx.x, blockDim.x); +} + +#ifndef N +#define N 8 +#endif + +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 4 +#endif + +int main(int argc, const char **argv) +{ + helloworld<<>>(); + + // CUDA exit -- needed to flush printf write buffer + cudaDeviceReset(); + return 0; +} diff --git a/cuda/lab1/.solutions/exercise2.cu b/cuda/lab1/.solutions/exercise2.cu new file mode 100644 index 0000000..214d34e --- /dev/null +++ b/cuda/lab1/.solutions/exercise2.cu @@ -0,0 +1,175 @@ +/* + * BSD 2-Clause License + * + * Copyright (c) 2020, Alessandro Capotondi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file exercise1.c + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief Exercise 2 + * + * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152 + */ + +#include +#include +#include +#include +#include +#include +#include + +#define gpuErrchk(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } +static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) + exit(code); + } +} + +extern "C" +{ +#include "utils.h" +} + +#define TWO02 (1 << 2) +#define TWO04 (1 << 4) +#define TWO08 (1 << 8) +#ifndef N +#define N (1 << 27) +#endif + +#ifndef BLOCK_SIZE +#define BLOCK_SIZE (1024) +#endif + +/* + *SAXPY (host implementation) + * y := a * x + y + */ +void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n) +{ +#pragma omp parallel for simd schedule(simd: static) + for (int i = 0; i < n; i++) + { + y[i] = a * x[i] + y[i]; + } +} + +__global__ void gpu_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + y[i] = a * x[i] + y[i]; +} + +int main(int argc, const char **argv) +{ + int iret = 0; + int n = N; + float *h_x, *d_x; + float *h_y, *d_y; + float *h_z; + float a = 101.0f / TWO02, + b, c; + + if (argc > 1) + n = atoi(argv[1]); + + if 
(NULL == (h_x = (float *)malloc(sizeof(float) * n))) + { + printf("error: memory allocation for 'x'\n"); + iret = -1; + } + if (NULL == (h_y = (float *)malloc(sizeof(float) * n))) + { + printf("error: memory allocation for 'y'\n"); + iret = -1; + } + if (NULL == (h_z = (float *)malloc(sizeof(float) * n))) + { + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + if (0 != iret) + { + free(h_x); + free(h_y); + free(h_z); + exit(EXIT_FAILURE); + } + + //Init Data + b = rand() % TWO04; + c = rand() % TWO08; + for (int i = 0; i < n; i++) + { + h_x[i] = b / (float)TWO02; + h_y[i] = h_z[i] = c / (float)TWO04; + } + + //CUDA Buffer Allocation + gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n)); + gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n)); + + start_timer(); + gpuErrchk(cudaMemcpy(d_x, h_x, sizeof(float) * n, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpy(d_y, h_y, sizeof(float) * n, cudaMemcpyHostToDevice)); + gpu_saxpy<<<((n + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(d_y, a, d_x, n); + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaMemcpy(h_y, d_y, sizeof(float) * n, cudaMemcpyDeviceToHost)); + stop_timer(); + printf("saxpy (GPU): %9.3f sec %9.1f MFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((1.0e6 / 1e9) * elapsed_ns())); + + //Check Matematical Consistency + start_timer(); + host_saxpy(h_z, a, h_x, n); + stop_timer(); + printf("saxpy (Host): %9.3f sec %9.1f MFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((1.0e6 / 1e9) * elapsed_ns())); + for (int i = 0; i < n; ++i) + { + iret = *(int *)(h_y + i) ^ *(int *)(h_z + i); + assert(iret == 0); + } + + //CUDA Buffer Allocation + free(h_x); + gpuErrchk(cudaFree(d_x)); + free(h_y); + gpuErrchk(cudaFree(d_y)); + free(h_z); + + // CUDA exit -- needed to flush printf write buffer + cudaDeviceReset(); + return 0; +} diff --git a/cuda/lab1/.solutions/exercise3-v1.cu b/cuda/lab1/.solutions/exercise3-v1.cu new file mode 100644 index 0000000..72142e5 --- /dev/null +++ b/cuda/lab1/.solutions/exercise3-v1.cu 
@@ -0,0 +1,193 @@ +/* + * BSD 2-Clause License + * + * Copyright (c) 2020, Alessandro Capotondi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * @file exercise3.cu + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief Exercise 3 - CUDA MATMUL + * + * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152 + */ + +#include +#include +#include +#include + +#define gpuErrchk(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } +static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) + exit(code); + } +} + +extern "C" +{ +#include "utils.h" +} + +#define TWO02 (1 << 2) +#define TWO04 (1 << 4) +#define TWO08 (1 << 8) + +#ifndef N +#define N (32) +#endif + +void gemm(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c, int n) +{ + +#pragma omp parallel for collapse(2) + for (int i = 0; i < n; ++i) + { + for (int j = 0; j < n; ++j) + { + float sum = 0.0; + for (int k = 0; k < n; ++k) + { + sum += a[i * n + k] * b[k *n + j]; + } + c[i * n + j] = sum; + } + } +} + +/** + * @brief EX 3 - Complete Matrix Multiplication + */ +__global__ void gemm_kernel(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c, int n) +{ + int row = threadIdx.x; + int col = threadIdx.y; + + float sum = 0.0; + for (int k = 0; k < n; ++k) + { + sum += a[row * n + k] * b[k * n + col]; + } + c[row * n + col] = sum; +} + +int main(int argc, char *argv[]) +{ + int n = N, iret = 0; + float *a, *b, *c, *g; + struct timespec rt[2]; + double wt; // walltime + + if (argc > 1) + n = atoi(argv[1]); + + if (NULL == (a = (float *)malloc(sizeof(*a) * n * n))) + { + printf("error: memory allocation for 'x'\n"); + iret = -1; + } + if (NULL == (b = (float *)malloc(sizeof(*b) * n * n))) + { + printf("error: memory allocation for 'y'\n"); + iret = -1; + } + if (NULL == (c = (float *)malloc(sizeof(*c) * n * n))) + { + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + if (NULL == (g = (float 
*)malloc(sizeof(*g) * n * n))) + { + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + + if (0 != iret) + { + free(a); + free(b); + free(c); + free(g); + exit(EXIT_FAILURE); + } + + //Init Data + int _b = rand() % TWO04; + int _c = rand() % TWO08; +#pragma omp parallel for + for (int i = 0; i < n * n; i++) + { + a[i] = _b / (float)TWO02; + b[i] = _c / (float)TWO04; + c[i] = g[i] = 0.0; + } + + clock_gettime(CLOCK_REALTIME, rt + 0); + gemm(a, b, g, n); + clock_gettime(CLOCK_REALTIME, rt + 1); + wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("GEMM (Host) : %9.3f sec %9.1f GFLOPS\n", wt, 2.0 * n * n * n / (1.0e9 * wt)); + + //CUDA Buffer Allocation + float *d_a, *d_b, *d_c; + gpuErrchk(cudaMalloc((void **)&d_a, sizeof(float) * n * n)); + gpuErrchk(cudaMalloc((void **)&d_b, sizeof(float) * n * n)); + gpuErrchk(cudaMalloc((void **)&d_c, sizeof(float) * n * n)); + + clock_gettime(CLOCK_REALTIME, rt + 0); + gpuErrchk(cudaMemcpy(d_a, a, sizeof(float) * n * n, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpy(d_b, b, sizeof(float) * n * n, cudaMemcpyHostToDevice)); + dim3 dimBlock(n,n); + dim3 dimGrid(1,1); + gemm_kernel<<>> (d_a, d_b, d_c, n); + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaMemcpy(c, d_c, sizeof(float) * n * n, cudaMemcpyDeviceToHost)); + clock_gettime(CLOCK_REALTIME, rt + 1); + wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("GEMM (GPU): %9.3f sec %9.1f GFLOPS\n", wt, 2.0 * n * n * n / (1.0e9 * wt)); + + for (int i = 0; i < n * n; i++) + { + iret = *(int *)(g + i) ^ *(int *)(c + i); + assert(iret == 0); + } + free(a); + free(b); + free(c); + free(g); + gpuErrchk(cudaFree(d_a)); + gpuErrchk(cudaFree(d_b)); + gpuErrchk(cudaFree(d_c)); + + return 0; +} diff --git a/cuda/lab1/.solutions/exercise3-v2.cu b/cuda/lab1/.solutions/exercise3-v2.cu new file mode 100644 index 0000000..1884158 --- /dev/null +++ b/cuda/lab1/.solutions/exercise3-v2.cu @@ -0,0 +1,197 @@ +/* 
+ * BSD 2-Clause License + * + * Copyright (c) 2020, Alessandro Capotondi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * @file exercise3.cu + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief Exercise 3 - CUDA MATMUL + * + * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152 + */ + +#include +#include +#include +#include + +#define gpuErrchk(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } +static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) + exit(code); + } +} + +extern "C" +{ +#include "utils.h" +} + +#define TWO02 (1 << 2) +#define TWO04 (1 << 4) +#define TWO08 (1 << 8) + + +#ifndef N +#define N (1 << 10) +#endif +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 32 +#endif + +void gemm(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c, int n) +{ + +#pragma omp parallel for collapse(2) + for (int i = 0; i < n; ++i) + { + for (int j = 0; j < n; ++j) + { + float sum = 0.0; + for (int k = 0; k < n; ++k) + { + sum += a[i * n + k] * b[k *n + j]; + } + c[i * n + j] = sum; + } + } +} + +/** + * @brief EX 3 - Complete Matrix Multiplication + */ +__global__ void gemm_kernel(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c, int n) +{ + int row = threadIdx.x + blockIdx.x * blockDim.x; + int col = threadIdx.y + blockIdx.y * blockDim.y; + + float sum = 0.0; + for (int k = 0; k < n; ++k) + { + sum += a[row * n + k] * b[k * n + col]; + } + c[row * n + col] = sum; +} + +int main(int argc, char *argv[]) +{ + int n = N, iret = 0; + float *a, *b, *c, *g; + struct timespec rt[2]; + double wt; // walltime + + if (argc > 1) + n = atoi(argv[1]); + + if (NULL == (a = (float *)malloc(sizeof(*a) * n * n))) + { + printf("error: memory allocation for 'x'\n"); + iret = -1; + } + if (NULL == (b = (float *)malloc(sizeof(*b) * n * n))) + { + printf("error: memory allocation for 'y'\n"); + iret = -1; + } + if (NULL == (c = (float *)malloc(sizeof(*c) * n * n))) + 
{ + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + if (NULL == (g = (float *)malloc(sizeof(*g) * n * n))) + { + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + + if (0 != iret) + { + free(a); + free(b); + free(c); + free(g); + exit(EXIT_FAILURE); + } + + //Init Data + int _b = rand() % TWO04; + int _c = rand() % TWO08; +#pragma omp parallel for + for (int i = 0; i < n * n; i++) + { + a[i] = _b / (float)TWO02; + b[i] = _c / (float)TWO04; + c[i] = g[i] = 0.0; + } + + clock_gettime(CLOCK_REALTIME, rt + 0); + gemm(a, b, g, n); + clock_gettime(CLOCK_REALTIME, rt + 1); + wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("GEMM (Host) : %9.3f sec %9.1f GFLOPS\n", wt, 2.0 * n * n * n / (1.0e9 * wt)); + + //CUDA Buffer Allocation + float *d_a, *d_b, *d_c; + gpuErrchk(cudaMalloc((void **)&d_a, sizeof(float) * n * n)); + gpuErrchk(cudaMalloc((void **)&d_b, sizeof(float) * n * n)); + gpuErrchk(cudaMalloc((void **)&d_c, sizeof(float) * n * n)); + + clock_gettime(CLOCK_REALTIME, rt + 0); + gpuErrchk(cudaMemcpy(d_a, a, sizeof(float) * n * n, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpy(d_b, b, sizeof(float) * n * n, cudaMemcpyHostToDevice)); + dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE); + dim3 dimGrid((n+BLOCK_SIZE-1)/BLOCK_SIZE,(n+BLOCK_SIZE-1)/BLOCK_SIZE); + gemm_kernel<<>> (d_a, d_b, d_c, n); + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaMemcpy(c, d_c, sizeof(float) * n * n, cudaMemcpyDeviceToHost)); + clock_gettime(CLOCK_REALTIME, rt + 1); + wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("GEMM (GPU): %9.3f sec %9.1f GFLOPS\n", wt, 2.0 * n * n * n / (1.0e9 * wt)); + + for (int i = 0; i < n * n; i++) + { + iret = *(int *)(g + i) ^ *(int *)(c + i); + assert(iret == 0); + } + free(a); + free(b); + free(c); + free(g); + gpuErrchk(cudaFree(d_a)); + gpuErrchk(cudaFree(d_b)); + gpuErrchk(cudaFree(d_c)); + + return 0; +} diff --git a/cuda/lab1/.solutions/exercise3-v3.cu 
b/cuda/lab1/.solutions/exercise3-v3.cu new file mode 100644 index 0000000..3337e53 --- /dev/null +++ b/cuda/lab1/.solutions/exercise3-v3.cu @@ -0,0 +1,208 @@ +/* + * BSD 2-Clause License + * + * Copyright (c) 2020, Alessandro Capotondi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * @file exercise3.cu + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief Exercise 3 - CUDA MATMUL + * + * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152 + */ + +#include +#include +#include +#include + +#define gpuErrchk(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } +static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) + exit(code); + } +} + +extern "C" +{ +#include "utils.h" +} + +#define TWO02 (1 << 2) +#define TWO04 (1 << 4) +#define TWO08 (1 << 8) + + +#ifndef N +#define N (1 << 10) +#endif +#ifndef TILE_W +#define TILE_W 4 +#endif +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 32 +#endif + +void gemm(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c, int n) +{ + +#pragma omp parallel for collapse(2) + for (int i = 0; i < n; ++i) + { + for (int j = 0; j < n; ++j) + { + float sum = 0.0; + for (int k = 0; k < n; ++k) + { + sum += a[i * n + k] * b[k *n + j]; + } + c[i * n + j] = sum; + } + } +} + +/** + * @brief EX 3 - Complete Matrix Multiplication + */ +__global__ void gemm_kernel(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c, int n) +{ + int row = (blockIdx.x * blockDim.x * TILE_W) + (threadIdx.x * TILE_W); + int col = (blockIdx.y * blockDim.y * TILE_W) + (threadIdx.y * TILE_W); + int end_row = row+TILE_W < n ? row+TILE_W : n; + int end_col = col+TILE_W < n ? 
col+TILE_W : n; + + for (int i = row; i < end_row; ++i) + { + for (int j = col; j < end_col; ++j) + { + float sum = 0.0; + for (int k = 0; k < n; ++k) + { + sum += a[i * n + k] * b[k *n + j]; + } + c[i * n + j] = sum; + } + } +} + +int main(int argc, char *argv[]) +{ + int n = N, iret = 0; + float *a, *b, *c, *g; + struct timespec rt[2]; + double wt; // walltime + + if (argc > 1) + n = atoi(argv[1]); + + if (NULL == (a = (float *)malloc(sizeof(*a) * n * n))) + { + printf("error: memory allocation for 'x'\n"); + iret = -1; + } + if (NULL == (b = (float *)malloc(sizeof(*b) * n * n))) + { + printf("error: memory allocation for 'y'\n"); + iret = -1; + } + if (NULL == (c = (float *)malloc(sizeof(*c) * n * n))) + { + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + if (NULL == (g = (float *)malloc(sizeof(*g) * n * n))) + { + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + + if (0 != iret) + { + free(a); + free(b); + free(c); + free(g); + exit(EXIT_FAILURE); + } + + //Init Data + int _b = rand() % TWO04; + int _c = rand() % TWO08; +#pragma omp parallel for + for (int i = 0; i < n * n; i++) + { + a[i] = _b / (float)TWO02; + b[i] = _c / (float)TWO04; + c[i] = g[i] = 0.0; + } + + clock_gettime(CLOCK_REALTIME, rt + 0); + gemm(a, b, g, n); + clock_gettime(CLOCK_REALTIME, rt + 1); + wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("GEMM (Host) : %9.3f sec %9.1f GFLOPS\n", wt, 2.0 * n * n * n / (1.0e9 * wt)); + + //CUDA Buffer Allocation + float *d_a, *d_b, *d_c; + gpuErrchk(cudaMalloc((void **)&d_a, sizeof(float) * n * n)); + gpuErrchk(cudaMalloc((void **)&d_b, sizeof(float) * n * n)); + gpuErrchk(cudaMalloc((void **)&d_c, sizeof(float) * n * n)); + + clock_gettime(CLOCK_REALTIME, rt + 0); + gpuErrchk(cudaMemcpy(d_a, a, sizeof(float) * n * n, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpy(d_b, b, sizeof(float) * n * n, cudaMemcpyHostToDevice)); + dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE); + dim3 
dimGrid((n+(BLOCK_SIZE+TILE_W)-1)/(BLOCK_SIZE+TILE_W),(n+(BLOCK_SIZE+TILE_W)-1)/(BLOCK_SIZE+TILE_W)); + gemm_kernel<<>> (d_a, d_b, d_c, n); + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaMemcpy(c, d_c, sizeof(float) * n * n, cudaMemcpyDeviceToHost)); + clock_gettime(CLOCK_REALTIME, rt + 1); + wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("GEMM (GPU): %9.3f sec %9.1f GFLOPS\n", wt, 2.0 * n * n * n / (1.0e9 * wt)); + + for (int i = 0; i < n * n; i++) + { + iret = *(int *)(g + i) ^ *(int *)(c + i); + assert(iret == 0); + } + free(a); + free(b); + free(c); + free(g); + gpuErrchk(cudaFree(d_a)); + gpuErrchk(cudaFree(d_b)); + gpuErrchk(cudaFree(d_c)); + + return 0; +} diff --git a/cuda/lab1/Makefile b/cuda/lab1/Makefile new file mode 100644 index 0000000..ef0f653 --- /dev/null +++ b/cuda/lab1/Makefile @@ -0,0 +1,55 @@ +ifndef CUDA_HOME +CUDA_HOME:=/usr/local/cuda +endif + +ifndef EXERCISE +EXERCISE=exercise1.cu +endif + +BUILD_DIR ?= ./build + +NVCC=$(CUDA_HOME)/bin/nvcc +CXX=g++ + +OPT:=-O2 -g +NVOPT:=-Xcompiler -fopenmp -lineinfo -arch=sm_53 --ptxas-options=-v --use_fast_math + +CXXFLAGS:=$(OPT) -I. 
$(EXT_CXXFLAGS) +LDFLAGS:=-lm -lcudart $(EXT_LDFLAGS) + +NVCFLAGS:=$(CXXFLAGS) $(NVOPT) +NVLDFLAGS:=$(LDFLAGS) -lgomp + +SRCS:= utils.c +OBJS := $(SRCS:%=$(BUILD_DIR)/%.o) $(EXERCISE:%=$(BUILD_DIR)/%.o) +EXE=$(EXERCISE:.cu=.exe) + +$(EXE): $(OBJS) + $(MKDIR_P) $(dir $@) + $(NVCC) $(NVCFLAGS) $(OBJS) -o $@ $(NVLDFLAGS) + +$(BUILD_DIR)/%.cu.o: %.cu + $(MKDIR_P) $(dir $@) + $(NVCC) $(NVCFLAGS) -c $< -o $@ + +$(BUILD_DIR)/%.cpp.o: %.cpp + $(MKDIR_P) $(dir $@) + $(CXX) $(CXXFLAGS) -c $< -o $@ + +$(BUILD_DIR)/%.c.o: %.c + $(MKDIR_P) $(dir $@) + $(CXX) $(CXXFLAGS) -c $< -o $@ + +all: $(EXE) + +.PHONY: run profile clean +run: $(EXE) + ./$(EXE) + +profile: $(EXE) + sudo LD_LIBRARY_PATH=$(CUDA_HOME)/lib:/usr/ext/lib:${LD_LIBRARY_PATH} LIBRARY_PATH=/usr/ext/lib:${LIBRARY_PATH} nvprof ./$(EXE) + +clean: + -rm -fr $(BUILD_DIR) *.exe *.out *~ + +MKDIR_P ?= mkdir -p diff --git a/cuda/lab1/exercise1.cu b/cuda/lab1/exercise1.cu new file mode 100644 index 0000000..942271f --- /dev/null +++ b/cuda/lab1/exercise1.cu @@ -0,0 +1,76 @@ +/* + * BSD 2-Clause License + * + * Copyright (c) 2020, Alessandro Capotondi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file exercise1.cu + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief Exercise 1 + * + * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152 + */ + +#include +#include +#include +#include + +#include + +#ifndef N +#define N 32 +#endif + +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 4 +#endif + +/** + * @brief EX 1 - Launch CUDA kernel to "print" helloworld + * + * a) Detect global thread id. (tip: use threadIdx.x, blockDim.x, and blockIdx.x) + * b) Explore thread execution and schedule changing: N={32} and M={4,8,16,32,64} + * c) Add corner case management + * + * @return void + */ + +__global__ void helloworld(void) +{ + int gid; + printf("Hello world, I am global thread %d (threadIdx=%d, blockIdx=%d, blockDim=%d)\n", gid, ...); +} + +int main(int argc, const char **argv) +{ + helloworld<<>>(); + + // CUDA exit -- needed to flush printf write buffer + cudaDeviceReset(); + return 0; +} diff --git a/cuda/lab1/exercise2.cu b/cuda/lab1/exercise2.cu new file mode 100644 index 0000000..a6d30d3 --- /dev/null +++ b/cuda/lab1/exercise2.cu @@ -0,0 +1,175 @@ +/* + * BSD 2-Clause License + * + * Copyright (c) 2020, Alessandro Capotondi + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * @file exercise2.cu + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief Exercise 2 + * + * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152 + */ + +#include +#include +#include +#include +#include +#include +#include + +#define gpuErrchk(ans) \ +{ \ + gpuAssert((ans), __FILE__, __LINE__); \ +} +static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) + exit(code); + } +} + +extern "C" +{ + #include "utils.h" +} + +#define TWO02 (1 << 2) +#define TWO04 (1 << 4) +#define TWO08 (1 << 8) +#ifndef N +#define N (1 << 27) +#endif + +#ifndef BLOCK_SIZE +#define BLOCK_SIZE (1024) +#endif + + +/* + *SAXPY (host implementation) + * y := a * x + y + */ +void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n) +{ +#pragma omp parallel for simd schedule(simd: static) + for (int i = 0; i < n; i++) + { + y[i] = a * x[i] + y[i]; + } +} + +__global__ void saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n) +{ + //TODO: Add saxpy kernel body +} + +int main(int argc, const char **argv) +{ + int iret = 0; + int n = N; + float *h_x, *d_x; + float *h_y, *d_y; + float *h_z; + float a = 101.0f / TWO02, + b, c; + + if (argc > 1) + n = atoi(argv[1]); + + if (NULL == (h_x = (float *)malloc(sizeof(float) * n))) + { + printf("error: memory allocation for 'x'\n"); + iret = -1; + } + if (NULL == (h_y = (float *)malloc(sizeof(float) * n))) + { + printf("error: memory allocation for 'y'\n"); + iret = -1; + } + if (NULL == (h_z = (float *)malloc(sizeof(float) * n))) + { + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + if (0 != iret) + { + free(h_x); + free(h_y); + free(h_z); + exit(EXIT_FAILURE); + } + + //Init Data + b = rand() % TWO04; + c = rand() % TWO08; + for (int i = 0; i < n; i++) + { + h_x[i] = b / (float)TWO02; + h_y[i] = 
h_z[i] = c / (float)TWO04;
+    }
+
+    //CUDA Buffer Allocation
+    gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n));
+    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));
+
+    //TODO: ADD CUDA Data Move
+    start_timer();
+    gpuErrchk(cudaMemcpy(d_x, h_x, sizeof(float) * n, cudaMemcpyHostToDevice)); //BUGFIX: was cudaMemcpy(h_x, d_x, ..., cudaMemcpyDeviceToHost), which overwrote the initialized host input with uninitialized device memory
+
+    //TODO: Add kernel call here
+
+    gpuErrchk(cudaPeekAtLastError());
+
+    //TODO: ADD CUDA Data Move
+
+    stop_timer();
+    printf("saxpy (GPU): %9.3f sec %9.1f MFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((1.0e6 / 1e9) * elapsed_ns()));
+
+    //Check Mathematical Consistency
+    start_timer();
+    host_saxpy(h_z, a, h_x, n);
+    stop_timer();
+    printf("saxpy (Host): %9.3f sec %9.1f MFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((1.0e6 / 1e9) * elapsed_ns()));
+    for (int i = 0; i < n; ++i)
+    {
+        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
+        assert(iret == 0);
+    }
+
+    //Free device buffers and host memory
+    free(h_x);
+    gpuErrchk(cudaFree(d_x));
+    free(h_y);
+    gpuErrchk(cudaFree(d_y));
+    free(h_z);
+    return 0;
+}
diff --git a/cuda/lab1/exercise3.cu b/cuda/lab1/exercise3.cu
new file mode 100644
index 0000000..d915004
--- /dev/null
+++ b/cuda/lab1/exercise3.cu
@@ -0,0 +1,190 @@
+/*
+ * BSD 2-Clause License
+ *
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file exercise3.cu + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief Exercise 3 - CUDA MATMUL + * + * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152 + */ + +#include +#include +#include +#include + +#define gpuErrchk(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } +static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) +{ + if (code != cudaSuccess) + { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) + exit(code); + } +} + +extern "C" +{ +#include "utils.h" +} + +#define TWO02 (1 << 2) +#define TWO04 (1 << 4) +#define TWO08 (1 << 8) + + +#ifndef N +#define N (1 << 10) +#endif +#ifndef TILE_W +#define TILE_W 4 +#endif +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 32 +#endif + +void gemm(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c, int n) +{ + +#pragma omp parallel for collapse(2) + for (int i = 0; i < n; ++i) + { + for (int j = 0; j < n; ++j) + { + float sum = 0.0; + for (int k = 0; k < n; ++k) + { + sum += a[i * n + k] * b[k *n + j]; + } + c[i * n + j] = sum; + } + } +} + +/** + * @brief EX 3 - Complete Matrix 
Multiplication + */ +__global__ void gemm_kernel(float *__restrict__ a, float *__restrict__ b, float *__restrict__ c, int n) +{ + //TODO: Add GEMM kernel body +} + +int main(int argc, char *argv[]) +{ + int n = N, iret = 0; + float *a, *b, *c, *g; + struct timespec rt[2]; + double wt; // walltime + + if (argc > 1) + n = atoi(argv[1]); + + if (NULL == (a = (float *)malloc(sizeof(*a) * n * n))) + { + printf("error: memory allocation for 'x'\n"); + iret = -1; + } + if (NULL == (b = (float *)malloc(sizeof(*b) * n * n))) + { + printf("error: memory allocation for 'y'\n"); + iret = -1; + } + if (NULL == (c = (float *)malloc(sizeof(*c) * n * n))) + { + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + if (NULL == (g = (float *)malloc(sizeof(*g) * n * n))) + { + printf("error: memory allocation for 'z'\n"); + iret = -1; + } + + if (0 != iret) + { + free(a); + free(b); + free(c); + free(g); + exit(EXIT_FAILURE); + } + + //Init Data + int _b = rand() % TWO04; + int _c = rand() % TWO08; +#pragma omp parallel for + for (int i = 0; i < n * n; i++) + { + a[i] = _b / (float)TWO02; + b[i] = _c / (float)TWO04; + c[i] = g[i] = 0.0; + } + + clock_gettime(CLOCK_REALTIME, rt + 0); + gemm(a, b, g, n); + clock_gettime(CLOCK_REALTIME, rt + 1); + wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("GEMM (Host) : %9.3f sec %9.1f GFLOPS\n", wt, 2.0 * n * n * n / (1.0e9 * wt)); + + //CUDA Buffer Allocation + float *d_a, *d_b, *d_c; + //TODO: Add HERE Cuda data allocation + + clock_gettime(CLOCK_REALTIME, rt + 0); + //TODO: Add HERE Cuda data transfer + + //TODO: Add HERE dimBlock + //TODO: Add HERE dimGrid + //TODO: Add HERE kernel launch + gpuErrchk(cudaPeekAtLastError()); + //TODO: Add HERE Cuda data transfer + clock_gettime(CLOCK_REALTIME, rt + 1); + wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); + printf("GEMM (GPU): %9.3f sec %9.1f GFLOPS\n", wt, 2.0 * n * n * n / (1.0e9 * wt)); + + for (int i = 0; i < n * 
n; i++) + { + iret = *(int *)(g + i) ^ *(int *)(c + i); + assert(iret == 0); + } + free(a); + free(b); + free(c); + free(g); + gpuErrchk(cudaFree(d_a)); + gpuErrchk(cudaFree(d_b)); + gpuErrchk(cudaFree(d_c)); + + return 0; +} diff --git a/cuda/lab1/utils.c b/cuda/lab1/utils.c new file mode 100644 index 0000000..0ce0dc5 --- /dev/null +++ b/cuda/lab1/utils.c @@ -0,0 +1,138 @@ +/* + * BSD 2-Clause License + * + * Copyright (c) 2020, Alessandro Capotondi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/** + * @file utils.c + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief File containing utilities functions for HPC Unimore Class + * + * Utilities for OpenMP lab. 
+ * + * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html + */ + +#define _POSIX_C_SOURCE 199309L +#include +#include +#include +#include +#include + +extern "C" { + +#include "utils.h" + +#define MAX_ITERATIONS 100 +static struct timespec timestampA, timestampB; +static unsigned long long statistics[MAX_ITERATIONS]; +static int iterations = 0; + +static unsigned long long __diff_ns(struct timespec start, struct timespec end) +{ + struct timespec temp; + if ((end.tv_nsec - start.tv_nsec) < 0) + { + temp.tv_sec = end.tv_sec - start.tv_sec - 1; + temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec; + } + else + { + temp.tv_sec = end.tv_sec - start.tv_sec; + temp.tv_nsec = end.tv_nsec - start.tv_nsec; + } + + return temp.tv_nsec + temp.tv_sec * 1000000000ULL; +} + +void start_timer() +{ + asm volatile("" :: + : "memory"); + clock_gettime(CLOCK_MONOTONIC_RAW, ×tampA); + asm volatile("" :: + : "memory"); +} + +void stop_timer() +{ + unsigned long long elapsed = 0ULL; + asm volatile("" :: + : "memory"); + clock_gettime(CLOCK_MONOTONIC_RAW, ×tampB); + asm volatile("" :: + : "memory"); +} + +unsigned long long elapsed_ns() +{ + return __diff_ns(timestampA, timestampB); +} + +void start_stats() +{ + start_timer(); +} + +void collect_stats() +{ + assert(iterations < MAX_ITERATIONS); + stop_timer(); + statistics[iterations++] = elapsed_ns(); +} + +void print_stats() +{ + unsigned long long min = ULLONG_MAX; + unsigned long long max = 0LL; + double average = 0.0; + double std_deviation = 0.0; + double sum = 0.0; + + /* Compute the sum of all elements */ + for (int i = 0; i < iterations; i++) + { + if (statistics[i] > max) + max = statistics[i]; + if (statistics[i] < min) + min = statistics[i]; + sum = sum + statistics[i] / 1E6; + } + average = sum / (double)iterations; + + /* Compute variance and standard deviation */ + for (int i = 0; i < iterations; i++) + { + sum = sum + pow((statistics[i] / 1E6 - average), 2); + } + std_deviation = sqrt(sum / 
(double)iterations); + + printf("AvgTime\tMinTime\tMaxTime\tStdDev\n"); + printf("%.4f ms\t%.4f ms\t%.4f ms\t%.4f\n", (double)average, (double)min / 1E6, (double)max / 1E6, (double)std_deviation); +} + +} diff --git a/cuda/lab1/utils.h b/cuda/lab1/utils.h new file mode 100644 index 0000000..966281c --- /dev/null +++ b/cuda/lab1/utils.h @@ -0,0 +1,142 @@ +/* + * BSD 2-Clause License + * + * Copyright (c) 2020, Alessandro Capotondi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * @file utils.h + * @author Alessandro Capotondi + * @date 27 Mar 2020 + * @brief File containing utilities functions for HPC Unimore Class + * + * The header define time functions and dummy workload used on the example tests. + * + * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html + */ +#ifndef __UTILS_H__ +#define __UTILS_H__ + +#include + +#if defined(VERBOSE) +#define DEBUG_PRINT(x, ...) printf((x), ##__VA_ARGS__) +#else +#define DEBUG_PRINT(x, ...) +#endif + +#if !defined(NTHREADS) +#define NTHREADS (4) +#endif + +extern "C" +{ + +/** + * @brief The function set the timestampA + * + * The function is used to measure elapsed time between two execution points. + * The function start_timer() sets the starting point timestamp, while the function + * stop_timer() sets the termination timestamp. The elapsed time, expressed in nanoseconds, + * between the two points can be retrieved using the function elapsed_ns(). + * + * Example usage: + * @code + * start_timer(); // Point A + * //SOME CODE HERE + * stop_timer(); // Point B + * printf("Elapsed time = %llu ns\n", elapsed_ns())); //Elapsed time between A and B + * //SOME OTHER CODE HERE + * stop_timer(); // Point C + * printf("Elapsed time = %llu ns\n", elapsed_ns())); //Elapsed time between A and C + * @endcode + * + * @return void + * @see start_timer() + * @see stop_timer() + * @see elapsed_ns() + */ + void start_timer(); + +/** + * @brief The function set the second timestamps + * + * The function is used to measure elapsed time between two execution points. + * The function start_timer() sets the starting point timestamp, while the function + * stop_timer() returns the elapsed time, expressed in nanoseconds between the last call + * of start_timer() and the current execution point. 
+ *
+ * Example usage:
+ * @code
+ * start_timer(); // Point A
+ * //SOME CODE HERE
+ * stop_timer(); // Point B
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and B
+ * //SOME OTHER CODE HERE
+ * stop_timer(); // Point C
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and C
+ * @endcode
+ *
+ * @return void
+ * @see start_timer()
+ * @see stop_timer()
+ * @see elapsed_ns()
+ */
+    void stop_timer();
+
+/**
+ * @brief Elapsed nanoseconds between start_timer() and stop_timer().
+ *
+ * @return Elapsed nanoseconds
+ * @see start_timer()
+ * @see stop_timer()
+ */
+    unsigned long long elapsed_ns();
+
+/**
+ * @brief The function initializes the starting point of stat measurement.
+ *
+ * The function is similar to start_timer().
+ *
+ * @return void
+ * @see start_timer
+ */
+    void start_stats();
+
+/**
+ * @brief The function collects the elapsed time between the current execution point and the
+ * last call of start_stats().
+ *
+ * @return void
+ */
+    void collect_stats();
+
+/**
+ * @brief The function displays the collected statistics.
+ * @return void
+ */
+    void print_stats();
+}
+#endif /*__UTILS_H__*/