Update Lab3

2024-11-26 01:54:22 +00:00 · 2022-05-17 15:43:44 +02:00 · 2022-05-17 15:43:44 +02:00 · b558d22f47
commit b558d22f47
parent 5761b0de1c
7 changed files with 649 additions and 96 deletions
--- a/cuda/lab3/.solutions/saxpy-v3.cu
+++ b/cuda/lab3/.solutions/saxpy-v3.cu
@ -73,6 +73,10 @@ extern "C"
 #define BLOCK_SIZE (512)
 #endif
 #ifndef N_STREAMS
 #define N_STREAMS (16)
 #endif
 /*
 *SAXPY (host implementation)
 * y := a * x + y
@ -143,32 +147,25 @@ int main(int argc, const char **argv)
    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));
    start_timer();
-    int TILE = n / 8;
+    int TILE = n / N_STREAMS;
-    //TODO Copy the first Tile (i=0)
+    cudaStream_t stream[N_STREAMS];
-    gpuErrchk(cudaMemcpyAsync(&d_x[0], &h_x[0], sizeof(float) * TILE, cudaMemcpyHostToDevice));
+    for(int i = 0; i < N_STREAMS; i++)
-    gpuErrchk(cudaMemcpyAsync(&d_y[0], &h_y[0], sizeof(float) * TILE, cudaMemcpyHostToDevice));
+	    cudaStreamCreate(&stream[i]);
    //TODO Loop over the Tiles
    for (int i = 0; i < n; i += TILE)
    {
-        //TODO Wait Tile i 
+        //TODO Copy in Tile i (stream i)
-        cudaDeviceSynchronize();
+        gpuErrchk(cudaMemcpyAsync(&d_x[i], &h_x[i], sizeof(float) * TILE, cudaMemcpyHostToDevice, stream[i/TILE]));
        gpuErrchk(cudaMemcpyAsync(&d_y[i], &h_y[i], sizeof(float) * TILE, cudaMemcpyHostToDevice, stream[i/TILE]));
-        //TODO Copy the out tile i-1
+        //TODO Kernel Tile i (stream i)
-        if(i>0)
+        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE,0,stream[i/TILE]>>>(&d_y[i], a, &d_x[i], TILE);
            gpuErrchk(cudaMemcpyAsync(&h_y[i-TILE], &d_y[i-TILE], sizeof(float) * TILE, cudaMemcpyDeviceToHost));
-        //TODO Launch Kernel over tile i
+        //TODO Copy out Tile i (stream i)
-        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(&d_y[i], a, &d_x[i], TILE);
+        gpuErrchk(cudaMemcpyAsync(&h_y[i], &d_y[i], sizeof(float) * TILE, cudaMemcpyDeviceToHost,stream[i/TILE]));
        //TODO Copy the in tile i+=TILE
        if(i+TILE < n){
            gpuErrchk(cudaMemcpyAsync(&d_x[i+TILE], &h_x[i+TILE], sizeof(float) * TILE, cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpyAsync(&d_y[i+TILE], &h_y[i+TILE], sizeof(float) * TILE, cudaMemcpyHostToDevice));
    }
-    }
+    //TODO Wait all the streams...
    //TODO Copy out the last tile n-TILE
    gpuErrchk(cudaMemcpyAsync(&h_y[n-TILE], &d_y[n-TILE], sizeof(float) * TILE, cudaMemcpyDeviceToHost));
    //TODO Wait last tile
    cudaDeviceSynchronize();
    stop_timer();
    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
@ -184,13 +181,16 @@ int main(int argc, const char **argv)
        assert(iret == 0);
    }
    //CUDA Buffer Allocation
    free(h_x);
    gpuErrchk(cudaFree(d_x));
    free(h_y);
    gpuErrchk(cudaFree(d_y));
    free(h_z);
    for (int i=0; i<N_STREAMS; ++i) 
      cudaStreamDestroy(stream[i]);
    // CUDA exit -- needed to flush printf write buffer
    cudaDeviceReset();
    return 0;
--- a/cuda/lab3/.solutions/saxpy-v4.cu
+++ b/cuda/lab3/.solutions/saxpy-v4.cu
@ -70,7 +70,7 @@ extern "C"
 #endif
 #ifndef BLOCK_SIZE
-#define BLOCK_SIZE (512)
+#define BLOCK_SIZE (128)
 #endif
 #ifndef N_STREAMS
@ -81,16 +81,17 @@ extern "C"
 *SAXPY (host implementation)
 * y := a * x + y
 */
-void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
+void host_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
 {
-#pragma omp parallel for simd schedule(simd: static)
+#pragma omp parallel for simd schedule(simd \
                                       : static)
    for (int i = 0; i < n; i++)
    {
        y[i] = a * x[i] + y[i];
    }
 }
-__global__ void gpu_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
+__global__ void gpu_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
 {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
@ -101,8 +102,8 @@ int main(int argc, const char **argv)
 {
    int iret = 0;
    int n = N;
-    float *h_x, *d_x;
+    float *h_x;
-    float *h_y, *d_y;
+    float *h_y;
    float *h_z;
    float a = 101.0f / TWO02,
          b, c;
@ -110,16 +111,10 @@ int main(int argc, const char **argv)
    if (argc > 1)
        n = atoi(argv[1]);
-    if (NULL == (h_x = (float *)malloc(sizeof(float) * n)))
+    //CUDA Buffer Allocation
-    {
+    gpuErrchk(cudaMallocManaged((void **)&h_x, sizeof(float) * n));
-        printf("error: memory allocation for 'x'\n");
+    gpuErrchk(cudaMallocManaged((void **)&h_y, sizeof(float) * n));
-        iret = -1;
+
    }
    if (NULL == (h_y = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'y'\n");
        iret = -1;
    }
    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'z'\n");
@ -127,8 +122,8 @@ int main(int argc, const char **argv)
    }
    if (0 != iret)
    {
-        free(h_x);
+        gpuErrchk(cudaFree(h_x));
-        free(h_y);
+        gpuErrchk(cudaFree(h_y));
        free(h_z);
        exit(EXIT_FAILURE);
    }
@ -142,55 +137,41 @@ int main(int argc, const char **argv)
        h_y[i] = h_z[i] = c / (float)TWO04;
    }
    //CUDA Buffer Allocation
    gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n));
    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));
    start_timer();
    int TILE = n / N_STREAMS;
    cudaStream_t stream[N_STREAMS];
-    for(int i = 0; i < N_STREAMS; i++)
+    for (int i = 0; i < N_STREAMS; i++)
        cudaStreamCreate(&stream[i]);
    //TODO Loop over the Tiles
    for (int i = 0; i < n; i += TILE)
    {
        //TODO Copy in Tile i (stream i)
        gpuErrchk(cudaMemcpyAsync(&d_x[i], &h_x[i], sizeof(float) * TILE, cudaMemcpyHostToDevice, stream[i/TILE]));
        gpuErrchk(cudaMemcpyAsync(&d_y[i], &h_y[i], sizeof(float) * TILE, cudaMemcpyHostToDevice, stream[i/TILE]));
        //TODO Kernel Tile i (stream i)
-        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE,0,stream[i/TILE]>>>(&d_y[i], a, &d_x[i], TILE);
+        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE, 0, stream[i / TILE]>>>(&h_y[i], a, &h_x[i], TILE);
        //TODO Copy out Tile i (stream i)
        gpuErrchk(cudaMemcpyAsync(&h_y[i], &d_y[i], sizeof(float) * TILE, cudaMemcpyDeviceToHost,stream[i/TILE]));
    }
    //TODO Wait all the streams...
    cudaDeviceSynchronize();
    stop_timer();
-    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
+    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));
    //Check Matematical Consistency
    start_timer();
    host_saxpy(h_z, a, h_x, n);
    stop_timer();
-    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
+    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));
    for (int i = 0; i < n; ++i)
    {
        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
        assert(iret == 0);
    }
-    free(h_x);
+    gpuErrchk(cudaFree(h_x));
-    gpuErrchk(cudaFree(d_x));
+    gpuErrchk(cudaFree(h_y));
    free(h_y);
    gpuErrchk(cudaFree(d_y));
    free(h_z);
-    for (int i=0; i<N_STREAMS; ++i) 
+    for (int i = 0; i < N_STREAMS; ++i)
        cudaStreamDestroy(stream[i]);
    // CUDA exit -- needed to flush printf write buffer
    cudaDeviceReset();
    return 0;
--- a/cuda/lab3/.solutions/saxpy-v5.cu
+++ b/cuda/lab3/.solutions/saxpy-v5.cu
@ -70,28 +70,23 @@ extern "C"
 #endif
 #ifndef BLOCK_SIZE
-#define BLOCK_SIZE (128)
+#define BLOCK_SIZE (512)
 #endif
 #ifndef N_STREAMS
 #define N_STREAMS (16)
 #endif
 /*
 *SAXPY (host implementation)
 * y := a * x + y
 */
-void host_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
+void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
 {
-#pragma omp parallel for simd schedule(simd \
+#pragma omp parallel for simd schedule(simd: static)
                                       : static)
    for (int i = 0; i < n; i++)
    {
        y[i] = a * x[i] + y[i];
    }
 }
-__global__ void gpu_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
+__global__ void gpu_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
 {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
@ -102,8 +97,8 @@ int main(int argc, const char **argv)
 {
    int iret = 0;
    int n = N;
-    float *h_x;
+    float *h_x, *d_x;
-    float *h_y;
+    float *h_y, *d_y;
    float *h_z;
    float a = 101.0f / TWO02,
          b, c;
@ -111,10 +106,16 @@ int main(int argc, const char **argv)
    if (argc > 1)
        n = atoi(argv[1]);
-    //CUDA Buffer Allocation
+    if (NULL == (h_x = (float *)malloc(sizeof(float) * n)))
-    gpuErrchk(cudaMallocManaged((void **)&h_x, sizeof(float) * n));
+    {
-    gpuErrchk(cudaMallocManaged((void **)&h_y, sizeof(float) * n));
+        printf("error: memory allocation for 'x'\n");
-
+        iret = -1;
    }
    if (NULL == (h_y = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'y'\n");
        iret = -1;
    }
    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'z'\n");
@ -122,8 +123,8 @@ int main(int argc, const char **argv)
    }
    if (0 != iret)
    {
-        gpuErrchk(cudaFree(h_x));
+        free(h_x);
-        gpuErrchk(cudaFree(h_y));
+        free(h_y);
        free(h_z);
        exit(EXIT_FAILURE);
    }
@ -137,41 +138,59 @@ int main(int argc, const char **argv)
        h_y[i] = h_z[i] = c / (float)TWO04;
    }
-    start_timer();
+    //CUDA Buffer Allocation
-    int TILE = n / N_STREAMS;
+    gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n));
-    cudaStream_t stream[N_STREAMS];
+    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));
    for (int i = 0; i < N_STREAMS; i++)
        cudaStreamCreate(&stream[i]);
    start_timer();
    int TILE = n / 8;
    //TODO Copy the first Tile (i=0)
    gpuErrchk(cudaMemcpyAsync(&d_x[0], &h_x[0], sizeof(float) * TILE, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyAsync(&d_y[0], &h_y[0], sizeof(float) * TILE, cudaMemcpyHostToDevice));
    //TODO Loop over the Tiles
    for (int i = 0; i < n; i += TILE)
    {
-        //TODO Kernel Tile i (stream i)
+        //TODO Wait Tile i 
-        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE, 0, stream[i / TILE]>>>(&h_y[i], a, &h_x[i], TILE);
+        cudaDeviceSynchronize();
        //TODO Copy the out tile i-1
        if(i>0)
            gpuErrchk(cudaMemcpyAsync(&h_y[i-TILE], &d_y[i-TILE], sizeof(float) * TILE, cudaMemcpyDeviceToHost));
        //TODO Launch Kernel over tile i
        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(&d_y[i], a, &d_x[i], TILE);
        //TODO Copy the in tile i+=TILE
        if(i+TILE < n){
            gpuErrchk(cudaMemcpyAsync(&d_x[i+TILE], &h_x[i+TILE], sizeof(float) * TILE, cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpyAsync(&d_y[i+TILE], &h_y[i+TILE], sizeof(float) * TILE, cudaMemcpyHostToDevice));
        }
-    //TODO Wait all the streams...
+    }
    //TODO Copy out the last tile n-TILE
    gpuErrchk(cudaMemcpyAsync(&h_y[n-TILE], &d_y[n-TILE], sizeof(float) * TILE, cudaMemcpyDeviceToHost));
    //TODO Wait last tile
    cudaDeviceSynchronize();
    stop_timer();
-    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));
+    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
    //Check Matematical Consistency
    start_timer();
    host_saxpy(h_z, a, h_x, n);
    stop_timer();
-    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));
+    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
    for (int i = 0; i < n; ++i)
    {
        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
        assert(iret == 0);
    }
-    gpuErrchk(cudaFree(h_x));
+    //CUDA Buffer Allocation
-    gpuErrchk(cudaFree(h_y));
+    free(h_x);
    gpuErrchk(cudaFree(d_x));
    free(h_y);
    gpuErrchk(cudaFree(d_y));
    free(h_z);
    for (int i = 0; i < N_STREAMS; ++i)
        cudaStreamDestroy(stream[i]);
    // CUDA exit -- needed to flush printf write buffer
    cudaDeviceReset();
    return 0;
--- a/cuda/lab3/Makefile
+++ b/cuda/lab3/Makefile
@ -12,7 +12,7 @@ NVCC=$(CUDA_HOME)/bin/nvcc
 CXX=g++
 OPT:=-O2 -g
-NVOPT:=-Xcompiler -fopenmp -lineinfo -arch=sm_53 --ptxas-options=-v --use_fast_math `pkg-config --cflags --libs opencv4`
+NVOPT:=-Xcompiler -fopenmp -lineinfo `pkg-config --cflags --libs opencv4`
 CXXFLAGS:=$(OPT) -I. $(EXT_CXXFLAGS)
 LDFLAGS:=-lm -lcudart $(EXT_LDFLAGS)
@ -47,7 +47,7 @@ run: $(EXE)
 	./$(EXE)
 profile: $(EXE)
-	sudo LD_LIBRARY_PATH=$(CUDA_HOME)/lib:/usr/ext/lib:${LD_LIBRARY_PATH} LIBRARY_PATH=/usr/ext/lib:${LIBRARY_PATH} nvprof ./$(EXE)
+	sudo LD_LIBRARY_PATH=$(CUDA_HOME)/lib:/usr/ext/lib:${LD_LIBRARY_PATH} LIBRARY_PATH=/usr/ext/lib:${LIBRARY_PATH} nvprof --unified-memory-profiling off ./$(EXE)
 metrics: $(EXE)
 	sudo LD_LIBRARY_PATH=$(CUDA_HOME)/lib:/usr/ext/lib:${LD_LIBRARY_PATH} LIBRARY_PATH=/usr/ext/lib:${LIBRARY_PATH} nvprof --print-gpu-trace --metrics "eligible_warps_per_cycle,achieved_occupancy,sm_efficiency,ipc" ./$(EXE)
--- a/cuda/lab3/saxpy-v3.cu
+++ b/cuda/lab3/saxpy-v3.cu
@ -0,0 +1,192 @@
 /*
 * BSD 2-Clause License
 * 
 * Copyright (c) 2020, Alessandro Capotondi
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /**
 * @file saxpy.c
 * @author Alessandro Capotondi
 * @date 12 May 2020
 * @brief Saxpy
 * 
 * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152
 */
 #include <assert.h>
 #include <time.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
 #include <cuda_runtime.h>
 #define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
 static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
 {
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
            exit(code);
    }
 }
 extern "C"
 {
 #include "utils.h"
 }
 #define TWO02 (1 << 2)
 #define TWO04 (1 << 4)
 #define TWO08 (1 << 8)
 #ifndef N
 #define N (1 << 27)
 #endif
 #ifndef BLOCK_SIZE
 #define BLOCK_SIZE (512)
 #endif
 #ifndef N_STREAMS
 #define N_STREAMS (16)
 #endif
 /*
 *SAXPY (host implementation)
 * y := a * x + y
 */
 void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
 {
 #pragma omp parallel for simd schedule(simd: static)
    for (int i = 0; i < n; i++)
    {
        y[i] = a * x[i] + y[i];
    }
 }
 __global__ void gpu_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
 {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        y[i] = a * x[i] + y[i];
 }
 int main(int argc, const char **argv)
 {
    int iret = 0;
    int n = N;
    float *h_x, *d_x;
    float *h_y, *d_y;
    float *h_z;
    float a = 101.0f / TWO02,
          b, c;
    if (argc > 1)
        n = atoi(argv[1]);
    if (NULL == (h_x = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'x'\n");
        iret = -1;
    }
    if (NULL == (h_y = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'y'\n");
        iret = -1;
    }
    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'z'\n");
        iret = -1;
    }
    if (0 != iret)
    {
        free(h_x);
        free(h_y);
        free(h_z);
        exit(EXIT_FAILURE);
    }
    //Init Data
    b = rand() % TWO04;
    c = rand() % TWO08;
    for (int i = 0; i < n; i++)
    {
        h_x[i] = b / (float)TWO02;
        h_y[i] = h_z[i] = c / (float)TWO04;
    }
    //CUDA Buffer Allocation
    gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n));
    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));
    start_timer();
    int TILE = n / N_STREAMS;
    //TODO Create N_STREAMS
    //TODO Loop over the Tiles
    for (int i = 0; i < n; i += TILE)
    {
        //TODO Copy to device Tile i (over stream i)
        //TODO Execute Kernel Tile i (stream i)
        //TODO Copy from device Tile i (stream i)
    }
    //TODO Wait all the streams...
    stop_timer();
    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
    //Check Matematical Consistency
    start_timer();
    host_saxpy(h_z, a, h_x, n);
    stop_timer();
    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
    for (int i = 0; i < n; ++i)
    {
        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
        assert(iret == 0);
    }
    free(h_x);
    gpuErrchk(cudaFree(d_x));
    free(h_y);
    gpuErrchk(cudaFree(d_y));
    free(h_z);
    for (int i=0; i<N_STREAMS; ++i) 
      cudaStreamDestroy(stream[i]);
    // CUDA exit -- needed to flush printf write buffer
    cudaDeviceReset();
    return 0;
 }
--- a/cuda/lab3/saxpy-v4.cu
+++ b/cuda/lab3/saxpy-v4.cu
@ -0,0 +1,175 @@
 /*
 * BSD 2-Clause License
 * 
 * Copyright (c) 2020, Alessandro Capotondi
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /**
 * @file saxpy.c
 * @author Alessandro Capotondi
 * @date 12 May 2020
 * @brief Saxpy
 * 
 * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152
 */
 #include <assert.h>
 #include <time.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
 #include <cuda_runtime.h>
 #define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
 static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
 {
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
            exit(code);
    }
 }
 extern "C"
 {
 #include "utils.h"
 }
 #define TWO02 (1 << 2)
 #define TWO04 (1 << 4)
 #define TWO08 (1 << 8)
 #ifndef N
 #define N (1 << 27)
 #endif
 #ifndef BLOCK_SIZE
 #define BLOCK_SIZE (128)
 #endif
 #ifndef N_STREAMS
 #define N_STREAMS (16)
 #endif
 /*
 *SAXPY (host implementation)
 * y := a * x + y
 */
 void host_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
 {
 #pragma omp parallel for simd schedule(simd \
                                       : static)
    for (int i = 0; i < n; i++)
    {
        y[i] = a * x[i] + y[i];
    }
 }
 __global__ void gpu_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
 {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        y[i] = a * x[i] + y[i];
 }
 int main(int argc, const char **argv)
 {
    int iret = 0;
    int n = N;
    float *h_x;
    float *h_y;
    float *h_z;
    float a = 101.0f / TWO02,
          b, c;
    if (argc > 1)
        n = atoi(argv[1]);
    //CUDA Buffer Allocation
    gpuErrchk(cudaMallocManaged((void **)&h_x, sizeof(float) * n));
    gpuErrchk(cudaMallocManaged((void **)&h_y, sizeof(float) * n));
    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'z'\n");
        iret = -1;
    }
    if (0 != iret)
    {
        gpuErrchk(cudaFree(h_x));
        gpuErrchk(cudaFree(h_y));
        free(h_z);
        exit(EXIT_FAILURE);
    }
    //Init Data
    b = rand() % TWO04;
    c = rand() % TWO08;
    for (int i = 0; i < n; i++)
    {
        h_x[i] = b / (float)TWO02;
        h_y[i] = h_z[i] = c / (float)TWO04;
    }
    start_timer();
    int TILE = n / N_STREAMS;
    //TODO Create N_STREAMS
    //TODO Loop over the Tiles
    for (int i = 0; i < n; i += TILE)
    {   
        //TODO Execute Kernel Tile i (stream i)
    }
    //TODO Wait all the streams...
    stop_timer();
    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));
    //Check Matematical Consistency
    start_timer();
    host_saxpy(h_z, a, h_x, n);
    stop_timer();
    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));
    for (int i = 0; i < n; ++i)
    {
        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
        assert(iret == 0);
    }
    gpuErrchk(cudaFree(h_x));
    gpuErrchk(cudaFree(h_y));
    free(h_z);
    for (int i = 0; i < N_STREAMS; ++i)
        cudaStreamDestroy(stream[i]);
    // CUDA exit -- needed to flush printf write buffer
    cudaDeviceReset();
    return 0;
 }
--- a/cuda/lab3/saxpy-v5.cu
+++ b/cuda/lab3/saxpy-v5.cu
@ -0,0 +1,186 @@
 /*
 * BSD 2-Clause License
 * 
 * Copyright (c) 2020, Alessandro Capotondi
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 /**
 * @file saxpy.c
 * @author Alessandro Capotondi
 * @date 12 May 2020
 * @brief Saxpy
 * 
 * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152
 */
 #include <assert.h>
 #include <time.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
 #include <cuda_runtime.h>
 #define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
 static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
 {
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
            exit(code);
    }
 }
 extern "C"
 {
 #include "utils.h"
 }
 #define TWO02 (1 << 2)
 #define TWO04 (1 << 4)
 #define TWO08 (1 << 8)
 #ifndef N
 #define N (1 << 27)
 #endif
 #ifndef BLOCK_SIZE
 #define BLOCK_SIZE (512)
 #endif
 /*
 *SAXPY (host implementation)
 * y := a * x + y
 */
 void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
 {
 #pragma omp parallel for simd schedule(simd: static)
    for (int i = 0; i < n; i++)
    {
        y[i] = a * x[i] + y[i];
    }
 }
 __global__ void gpu_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
 {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        y[i] = a * x[i] + y[i];
 }
 int main(int argc, const char **argv)
 {
    int iret = 0;
    int n = N;
    float *h_x, *d_x;
    float *h_y, *d_y;
    float *h_z;
    float a = 101.0f / TWO02,
          b, c;
    if (argc > 1)
        n = atoi(argv[1]);
    if (NULL == (h_x = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'x'\n");
        iret = -1;
    }
    if (NULL == (h_y = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'y'\n");
        iret = -1;
    }
    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'z'\n");
        iret = -1;
    }
    if (0 != iret)
    {
        free(h_x);
        free(h_y);
        free(h_z);
        exit(EXIT_FAILURE);
    }
    //Init Data
    b = rand() % TWO04;
    c = rand() % TWO08;
    for (int i = 0; i < n; i++)
    {
        h_x[i] = b / (float)TWO02;
        h_y[i] = h_z[i] = c / (float)TWO04;
    }
    //CUDA Buffer Allocation
    gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n));
    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));
    start_timer();
    int TILE = n / 8;
    //TODO Copy to device the first input Tile (i=0)
    //TODO Loop over the Tiles
    for (int i = 0; i < n; i += TILE)
    {
        //TODO Wait Tile i 
        //TODO Copy from the device the output tile i-1 (if i>0)
        //TODO Launch Kernel over tile i
        //TODO Copy to the device the input tile i+=TILE (if i+TILE < n)
    }
    //TODO Copy out the last tile n-TILE
    //TODO Wait last tile
    stop_timer();
    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
    //Check Matematical Consistency
    start_timer();
    host_saxpy(h_z, a, h_x, n);
    stop_timer();
    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
    for (int i = 0; i < n; ++i)
    {
        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
        assert(iret == 0);
    }
    //CUDA Buffer Allocation
    free(h_x);
    gpuErrchk(cudaFree(d_x));
    free(h_y);
    gpuErrchk(cudaFree(d_y));
    free(h_z);
    // CUDA exit -- needed to flush printf write buffer
    cudaDeviceReset();
    return 0;
 }