Update Lab3

2025-04-18 12:30:31 +00:00 · 2022-05-17 15:43:44 +02:00 · 2022-05-17 15:43:44 +02:00 · b558d22f47
commit b558d22f47
parent 5761b0de1c
7 changed files with 649 additions and 96 deletions
--- a/cuda/lab3/.solutions/saxpy-v3.cu
+++ b/cuda/lab3/.solutions/saxpy-v3.cu
@ -73,6 +73,10 @@ extern "C"
 #define BLOCK_SIZE (512)
 #endif

+#ifndef N_STREAMS
+#define N_STREAMS (16)
+#endif
+
 /*
 *SAXPY (host implementation)
 * y := a * x + y
@ -143,32 +147,25 @@ int main(int argc, const char **argv)
    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));

    start_timer();
-    int TILE = n / 8;
-    //TODO Copy the first Tile (i=0)
-    gpuErrchk(cudaMemcpyAsync(&d_x[0], &h_x[0], sizeof(float) * TILE, cudaMemcpyHostToDevice));
-    gpuErrchk(cudaMemcpyAsync(&d_y[0], &h_y[0], sizeof(float) * TILE, cudaMemcpyHostToDevice));
+    int TILE = n / N_STREAMS;
+    cudaStream_t stream[N_STREAMS];
+    for(int i = 0; i < N_STREAMS; i++)
+	    cudaStreamCreate(&stream[i]);
+
    //TODO Loop over the Tiles
    for (int i = 0; i < n; i += TILE)
    {
-        //TODO Wait Tile i 
-        cudaDeviceSynchronize();
+        //TODO Copy in Tile i (stream i)
+        gpuErrchk(cudaMemcpyAsync(&d_x[i], &h_x[i], sizeof(float) * TILE, cudaMemcpyHostToDevice, stream[i/TILE]));
+        gpuErrchk(cudaMemcpyAsync(&d_y[i], &h_y[i], sizeof(float) * TILE, cudaMemcpyHostToDevice, stream[i/TILE]));

-        //TODO Copy the out tile i-1
-        if(i>0)
-            gpuErrchk(cudaMemcpyAsync(&h_y[i-TILE], &d_y[i-TILE], sizeof(float) * TILE, cudaMemcpyDeviceToHost));
+        //TODO Kernel Tile i (stream i)
+        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE,0,stream[i/TILE]>>>(&d_y[i], a, &d_x[i], TILE);

-        //TODO Launch Kernel over tile i
-        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(&d_y[i], a, &d_x[i], TILE);
-
-        //TODO Copy the in tile i+=TILE
-        if(i+TILE < n){
-            gpuErrchk(cudaMemcpyAsync(&d_x[i+TILE], &h_x[i+TILE], sizeof(float) * TILE, cudaMemcpyHostToDevice));
-            gpuErrchk(cudaMemcpyAsync(&d_y[i+TILE], &h_y[i+TILE], sizeof(float) * TILE, cudaMemcpyHostToDevice));
-        }
+        //TODO Copy out Tile i (stream i)
+        gpuErrchk(cudaMemcpyAsync(&h_y[i], &d_y[i], sizeof(float) * TILE, cudaMemcpyDeviceToHost,stream[i/TILE]));
    }
-    //TODO Copy out the last tile n-TILE
-    gpuErrchk(cudaMemcpyAsync(&h_y[n-TILE], &d_y[n-TILE], sizeof(float) * TILE, cudaMemcpyDeviceToHost));
-    //TODO Wait last tile
+    //TODO Wait all the streams...
    cudaDeviceSynchronize();
    stop_timer();
    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
@ -184,13 +181,16 @@ int main(int argc, const char **argv)
        assert(iret == 0);
    }

-    //CUDA Buffer Allocation
    free(h_x);
    gpuErrchk(cudaFree(d_x));
    free(h_y);
    gpuErrchk(cudaFree(d_y));
    free(h_z);

+    for (int i=0; i<N_STREAMS; ++i) 
+      cudaStreamDestroy(stream[i]);
+
+
    // CUDA exit -- needed to flush printf write buffer
    cudaDeviceReset();
    return 0;
--- a/cuda/lab3/.solutions/saxpy-v4.cu
+++ b/cuda/lab3/.solutions/saxpy-v4.cu
@ -70,7 +70,7 @@ extern "C"
 #endif

 #ifndef BLOCK_SIZE
-#define BLOCK_SIZE (512)
+#define BLOCK_SIZE (128)
 #endif

 #ifndef N_STREAMS
@ -81,16 +81,17 @@ extern "C"
 *SAXPY (host implementation)
 * y := a * x + y
 */
-void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
+void host_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
 {
-#pragma omp parallel for simd schedule(simd: static)
+#pragma omp parallel for simd schedule(simd \
+                                       : static)
    for (int i = 0; i < n; i++)
    {
        y[i] = a * x[i] + y[i];
    }
 }

-__global__ void gpu_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
+__global__ void gpu_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
 {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
@ -101,8 +102,8 @@ int main(int argc, const char **argv)
 {
    int iret = 0;
    int n = N;
-    float *h_x, *d_x;
-    float *h_y, *d_y;
+    float *h_x;
+    float *h_y;
    float *h_z;
    float a = 101.0f / TWO02,
          b, c;
@ -110,16 +111,10 @@ int main(int argc, const char **argv)
    if (argc > 1)
        n = atoi(argv[1]);

-    if (NULL == (h_x = (float *)malloc(sizeof(float) * n)))
-    {
-        printf("error: memory allocation for 'x'\n");
-        iret = -1;
-    }
-    if (NULL == (h_y = (float *)malloc(sizeof(float) * n)))
-    {
-        printf("error: memory allocation for 'y'\n");
-        iret = -1;
-    }
+    //CUDA Buffer Allocation
+    gpuErrchk(cudaMallocManaged((void **)&h_x, sizeof(float) * n));
+    gpuErrchk(cudaMallocManaged((void **)&h_y, sizeof(float) * n));
+
    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'z'\n");
@ -127,8 +122,8 @@ int main(int argc, const char **argv)
    }
    if (0 != iret)
    {
-        free(h_x);
-        free(h_y);
+        gpuErrchk(cudaFree(h_x));
+        gpuErrchk(cudaFree(h_y));
        free(h_z);
        exit(EXIT_FAILURE);
    }
@ -142,54 +137,40 @@ int main(int argc, const char **argv)
        h_y[i] = h_z[i] = c / (float)TWO04;
    }

-    //CUDA Buffer Allocation
-    gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n));
-    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));
-
    start_timer();
    int TILE = n / N_STREAMS;
    cudaStream_t stream[N_STREAMS];
-    for(int i = 0; i < N_STREAMS; i++)
-	    cudaStreamCreate(&stream[i]);
+    for (int i = 0; i < N_STREAMS; i++)
+        cudaStreamCreate(&stream[i]);

    //TODO Loop over the Tiles
    for (int i = 0; i < n; i += TILE)
    {
-        //TODO Copy in Tile i (stream i)
-        gpuErrchk(cudaMemcpyAsync(&d_x[i], &h_x[i], sizeof(float) * TILE, cudaMemcpyHostToDevice, stream[i/TILE]));
-        gpuErrchk(cudaMemcpyAsync(&d_y[i], &h_y[i], sizeof(float) * TILE, cudaMemcpyHostToDevice, stream[i/TILE]));
-
        //TODO Kernel Tile i (stream i)
-        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE,0,stream[i/TILE]>>>(&d_y[i], a, &d_x[i], TILE);
-
-        //TODO Copy out Tile i (stream i)
-        gpuErrchk(cudaMemcpyAsync(&h_y[i], &d_y[i], sizeof(float) * TILE, cudaMemcpyDeviceToHost,stream[i/TILE]));
+        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE, 0, stream[i / TILE]>>>(&h_y[i], a, &h_x[i], TILE);
    }
    //TODO Wait all the streams...
    cudaDeviceSynchronize();
    stop_timer();
-    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
+    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));

    //Check Matematical Consistency
    start_timer();
    host_saxpy(h_z, a, h_x, n);
    stop_timer();
-    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
+    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));
    for (int i = 0; i < n; ++i)
    {
        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
        assert(iret == 0);
    }

-    free(h_x);
-    gpuErrchk(cudaFree(d_x));
-    free(h_y);
-    gpuErrchk(cudaFree(d_y));
+    gpuErrchk(cudaFree(h_x));
+    gpuErrchk(cudaFree(h_y));
    free(h_z);

-    for (int i=0; i<N_STREAMS; ++i) 
-      cudaStreamDestroy(stream[i]);
-
+    for (int i = 0; i < N_STREAMS; ++i)
+        cudaStreamDestroy(stream[i]);

    // CUDA exit -- needed to flush printf write buffer
    cudaDeviceReset();
--- a/cuda/lab3/.solutions/saxpy-v5.cu
+++ b/cuda/lab3/.solutions/saxpy-v5.cu
@ -70,28 +70,23 @@ extern "C"
 #endif

 #ifndef BLOCK_SIZE
-#define BLOCK_SIZE (128)
-#endif
-
-#ifndef N_STREAMS
-#define N_STREAMS (16)
+#define BLOCK_SIZE (512)
 #endif

 /*
 *SAXPY (host implementation)
 * y := a * x + y
 */
-void host_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
+void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
 {
-#pragma omp parallel for simd schedule(simd \
-                                       : static)
+#pragma omp parallel for simd schedule(simd: static)
    for (int i = 0; i < n; i++)
    {
        y[i] = a * x[i] + y[i];
    }
 }

-__global__ void gpu_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
+__global__ void gpu_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
 {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
@ -102,8 +97,8 @@ int main(int argc, const char **argv)
 {
    int iret = 0;
    int n = N;
-    float *h_x;
-    float *h_y;
+    float *h_x, *d_x;
+    float *h_y, *d_y;
    float *h_z;
    float a = 101.0f / TWO02,
          b, c;
@ -111,10 +106,16 @@ int main(int argc, const char **argv)
    if (argc > 1)
        n = atoi(argv[1]);

-    //CUDA Buffer Allocation
-    gpuErrchk(cudaMallocManaged((void **)&h_x, sizeof(float) * n));
-    gpuErrchk(cudaMallocManaged((void **)&h_y, sizeof(float) * n));
-
+    if (NULL == (h_x = (float *)malloc(sizeof(float) * n)))
+    {
+        printf("error: memory allocation for 'x'\n");
+        iret = -1;
+    }
+    if (NULL == (h_y = (float *)malloc(sizeof(float) * n)))
+    {
+        printf("error: memory allocation for 'y'\n");
+        iret = -1;
+    }
    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
    {
        printf("error: memory allocation for 'z'\n");
@ -122,8 +123,8 @@ int main(int argc, const char **argv)
    }
    if (0 != iret)
    {
-        gpuErrchk(cudaFree(h_x));
-        gpuErrchk(cudaFree(h_y));
+        free(h_x);
+        free(h_y);
        free(h_z);
        exit(EXIT_FAILURE);
    }
@ -137,41 +138,59 @@ int main(int argc, const char **argv)
        h_y[i] = h_z[i] = c / (float)TWO04;
    }

-    start_timer();
-    int TILE = n / N_STREAMS;
-    cudaStream_t stream[N_STREAMS];
-    for (int i = 0; i < N_STREAMS; i++)
-        cudaStreamCreate(&stream[i]);
+    //CUDA Buffer Allocation
+    gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n));
+    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));

+    start_timer();
+    int TILE = n / 8;
+    //TODO Copy the first Tile (i=0)
+    gpuErrchk(cudaMemcpyAsync(&d_x[0], &h_x[0], sizeof(float) * TILE, cudaMemcpyHostToDevice));
+    gpuErrchk(cudaMemcpyAsync(&d_y[0], &h_y[0], sizeof(float) * TILE, cudaMemcpyHostToDevice));
    //TODO Loop over the Tiles
    for (int i = 0; i < n; i += TILE)
    {
-        //TODO Kernel Tile i (stream i)
-        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE, 0, stream[i / TILE]>>>(&h_y[i], a, &h_x[i], TILE);
+        //TODO Wait Tile i 
+        cudaDeviceSynchronize();
+
+        //TODO Copy the out tile i-1
+        if(i>0)
+            gpuErrchk(cudaMemcpyAsync(&h_y[i-TILE], &d_y[i-TILE], sizeof(float) * TILE, cudaMemcpyDeviceToHost));
+
+        //TODO Launch Kernel over tile i
+        gpu_saxpy<<<((TILE + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(&d_y[i], a, &d_x[i], TILE);
+
+        //TODO Copy the in tile i+=TILE
+        if(i+TILE < n){
+            gpuErrchk(cudaMemcpyAsync(&d_x[i+TILE], &h_x[i+TILE], sizeof(float) * TILE, cudaMemcpyHostToDevice));
+            gpuErrchk(cudaMemcpyAsync(&d_y[i+TILE], &h_y[i+TILE], sizeof(float) * TILE, cudaMemcpyHostToDevice));
+        }
    }
-    //TODO Wait all the streams...
+    //TODO Copy out the last tile n-TILE
+    gpuErrchk(cudaMemcpyAsync(&h_y[n-TILE], &d_y[n-TILE], sizeof(float) * TILE, cudaMemcpyDeviceToHost));
+    //TODO Wait last tile
    cudaDeviceSynchronize();
    stop_timer();
-    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));
+    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));

    //Check Matematical Consistency
    start_timer();
    host_saxpy(h_z, a, h_x, n);
    stop_timer();
-    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float)elapsed_ns()));
+    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2 * n / ((float) elapsed_ns()));
    for (int i = 0; i < n; ++i)
    {
        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
        assert(iret == 0);
    }

-    gpuErrchk(cudaFree(h_x));
-    gpuErrchk(cudaFree(h_y));
+    //CUDA Buffer Allocation
+    free(h_x);
+    gpuErrchk(cudaFree(d_x));
+    free(h_y);
+    gpuErrchk(cudaFree(d_y));
    free(h_z);

-    for (int i = 0; i < N_STREAMS; ++i)
-        cudaStreamDestroy(stream[i]);
-
    // CUDA exit -- needed to flush printf write buffer
    cudaDeviceReset();
    return 0;
--- a/cuda/lab3/Makefile
+++ b/cuda/lab3/Makefile
@ -12,7 +12,7 @@ NVCC=$(CUDA_HOME)/bin/nvcc
 CXX=g++

 OPT:=-O2 -g
-NVOPT:=-Xcompiler -fopenmp -lineinfo -arch=sm_53 --ptxas-options=-v --use_fast_math `pkg-config --cflags --libs opencv4`
+NVOPT:=-Xcompiler -fopenmp -lineinfo `pkg-config --cflags --libs opencv4`

 CXXFLAGS:=$(OPT) -I. $(EXT_CXXFLAGS)
 LDFLAGS:=-lm -lcudart $(EXT_LDFLAGS)
@ -47,7 +47,7 @@ run: $(EXE)
 	./$(EXE)

 profile: $(EXE)
-	sudo LD_LIBRARY_PATH=$(CUDA_HOME)/lib:/usr/ext/lib:${LD_LIBRARY_PATH} LIBRARY_PATH=/usr/ext/lib:${LIBRARY_PATH} nvprof ./$(EXE)
+	sudo LD_LIBRARY_PATH=$(CUDA_HOME)/lib:/usr/ext/lib:${LD_LIBRARY_PATH} LIBRARY_PATH=/usr/ext/lib:${LIBRARY_PATH} nvprof --unified-memory-profiling off ./$(EXE)

 metrics: $(EXE)
 	sudo LD_LIBRARY_PATH=$(CUDA_HOME)/lib:/usr/ext/lib:${LD_LIBRARY_PATH} LIBRARY_PATH=/usr/ext/lib:${LIBRARY_PATH} nvprof --print-gpu-trace --metrics "eligible_warps_per_cycle,achieved_occupancy,sm_efficiency,ipc" ./$(EXE)
--- a/cuda/lab3/saxpy-v3.cu
+++ b/cuda/lab3/saxpy-v3.cu
@ -0,0 +1,192 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file saxpy-v3.cu
+ * @author Alessandro Capotondi
+ * @date 12 May 2020
+ * @brief Saxpy
+ * 
+ * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152
+ */
+
+#include <assert.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <cuda_runtime.h>
+
+/* Wraps a CUDA runtime call and forwards its status, with the call site's
+ * file and line, to gpuAssert(). */
+#define gpuErrchk(ans)                        \
+    {                                         \
+        gpuAssert((ans), __FILE__, __LINE__); \
+    }
+/* Prints the CUDA error string with file/line to stderr when code is not
+ * cudaSuccess and, unless abort is false, terminates with that code. */
+static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
+{
+    // Successful calls need no reporting.
+    if (code == cudaSuccess)
+        return;
+
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+    if (!abort)
+        return;
+    exit(code);
+}
+
+extern "C"
+{
+#include "utils.h"
+}
+
+#define TWO02 (1 << 2)
+#define TWO04 (1 << 4)
+#define TWO08 (1 << 8)
+#ifndef N
+#define N (1 << 27)
+#endif
+
+#ifndef BLOCK_SIZE
+#define BLOCK_SIZE (512)
+#endif
+
+#ifndef N_STREAMS
+#define N_STREAMS (16)
+#endif
+
+/*
+ * SAXPY (host implementation)
+ * y := a * x + y
+ *
+ * Updates y[i] in place with a * x[i] + y[i] for every i in [0, n).
+ *
+ * y: input/output vector, overwritten with the result
+ * a: scalar multiplier
+ * x: input vector (only read)
+ * n: number of elements processed
+ *
+ * main() calls this on h_z and then compares h_z bit by bit against h_y,
+ * so keep the expression identical to the one in gpu_saxpy.
+ */
+void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
+{
+// OpenMP: run the loop in parallel and vectorised, static schedule with the simd modifier.
+#pragma omp parallel for simd schedule(simd: static)
+    for (int i = 0; i < n; i++)
+    {
+        y[i] = a * x[i] + y[i];
+    }
+}
+
+/*
+ * SAXPY (device kernel)
+ * y := a * x + y
+ *
+ * Each thread derives one index i from blockIdx.x, blockDim.x and threadIdx.x
+ * and updates y[i]. Only the x dimension of the grid/block is used.
+ *
+ * y: input/output vector, overwritten with the result
+ * a: scalar multiplier
+ * x: input vector (only read)
+ * n: number of elements to process
+ */
+__global__ void gpu_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    // Threads whose index falls past the end of the data do nothing.
+    if (i < n)
+        y[i] = a * x[i] + y[i];
+}
+
+/*
+ * Lab skeleton for the stream-based SAXPY exercise.
+ *
+ * Allocates and initialises the host vectors, allocates the device buffers,
+ * times the (still to be written) tiled GPU section, then runs host_saxpy()
+ * on h_z and checks h_y against it bit by bit.
+ *
+ * argv[1] (optional): number of elements n; defaults to N.
+ * Returns 0 on success. Exits with EXIT_FAILURE on an invalid size or a
+ * failed host allocation; gpuErrchk exits with the CUDA error code.
+ */
+int main(int argc, const char **argv)
+{
+    int iret = 0;
+    int n = N;
+    float *h_x, *d_x;
+    float *h_y, *d_y;
+    float *h_z;
+    float a = 101.0f / TWO02,
+          b, c;
+
+    if (argc > 1)
+        n = atoi(argv[1]);
+
+    // The tile loop below steps by TILE = n / N_STREAMS. A step of zero would
+    // never terminate, and a remainder would produce more than N_STREAMS
+    // tiles, so reject such sizes before allocating anything.
+    if (n < N_STREAMS || n % N_STREAMS != 0)
+    {
+        printf("error: n must be a positive multiple of %d\n", N_STREAMS);
+        exit(EXIT_FAILURE);
+    }
+
+    if (NULL == (h_x = (float *)malloc(sizeof(float) * n)))
+    {
+        printf("error: memory allocation for 'x'\n");
+        iret = -1;
+    }
+    if (NULL == (h_y = (float *)malloc(sizeof(float) * n)))
+    {
+        printf("error: memory allocation for 'y'\n");
+        iret = -1;
+    }
+    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
+    {
+        printf("error: memory allocation for 'z'\n");
+        iret = -1;
+    }
+    if (0 != iret)
+    {
+        // free(NULL) is a no-op, so releasing all three is safe here.
+        free(h_x);
+        free(h_y);
+        free(h_z);
+        exit(EXIT_FAILURE);
+    }
+
+    //Init Data
+    b = rand() % TWO04;
+    c = rand() % TWO08;
+    for (int i = 0; i < n; i++)
+    {
+        h_x[i] = b / (float)TWO02;
+        h_y[i] = h_z[i] = c / (float)TWO04;
+    }
+
+    //CUDA Buffer Allocation
+    gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n));
+    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));
+
+    start_timer();
+    int TILE = n / N_STREAMS;
+
+    //TODO Create N_STREAMS
+
+    //TODO Loop over the Tiles
+    for (int i = 0; i < n; i += TILE)
+    {
+        //TODO Copy to device Tile i (over stream i)
+
+        //TODO Execute Kernel Tile i (stream i)
+
+        //TODO Copy from device Tile i (stream i)
+    }
+    //TODO Wait all the streams...
+
+    stop_timer();
+    // 2.0f * n keeps the FLOP count in floating point; 2 * n overflows int for n >= 2^30.
+    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2.0f * n / ((float) elapsed_ns()));
+
+    //Check Mathematical Consistency
+    start_timer();
+    host_saxpy(h_z, a, h_x, n);
+    stop_timer();
+    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2.0f * n / ((float) elapsed_ns()));
+    for (int i = 0; i < n; ++i)
+    {
+        // XOR of the raw bit patterns: zero only when both floats are bit-identical.
+        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
+        assert(iret == 0);
+    }
+
+    free(h_x);
+    gpuErrchk(cudaFree(d_x));
+    free(h_y);
+    gpuErrchk(cudaFree(d_y));
+    free(h_z);
+
+    // NOTE(review): `stream` is not declared anywhere in this skeleton; it has
+    // to be introduced by the "Create N_STREAMS" TODO above for this to build.
+    for (int i = 0; i < N_STREAMS; ++i)
+        cudaStreamDestroy(stream[i]);
+
+    // CUDA exit -- needed to flush printf write buffer
+    cudaDeviceReset();
+    return 0;
+}
--- a/cuda/lab3/saxpy-v4.cu
+++ b/cuda/lab3/saxpy-v4.cu
@ -0,0 +1,175 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file saxpy-v4.cu
+ * @author Alessandro Capotondi
+ * @date 12 May 2020
+ * @brief Saxpy
+ * 
+ * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152
+ */
+
+#include <assert.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <cuda_runtime.h>
+
+/* Wraps a CUDA runtime call and forwards its status, with the call site's
+ * file and line, to gpuAssert(). */
+#define gpuErrchk(ans)                        \
+    {                                         \
+        gpuAssert((ans), __FILE__, __LINE__); \
+    }
+/* Prints the CUDA error string with file/line to stderr when code is not
+ * cudaSuccess and, unless abort is false, terminates with that code. */
+static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
+{
+    // Successful calls need no reporting.
+    if (code == cudaSuccess)
+        return;
+
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+    if (!abort)
+        return;
+    exit(code);
+}
+
+extern "C"
+{
+#include "utils.h"
+}
+
+#define TWO02 (1 << 2)
+#define TWO04 (1 << 4)
+#define TWO08 (1 << 8)
+#ifndef N
+#define N (1 << 27)
+#endif
+
+#ifndef BLOCK_SIZE
+#define BLOCK_SIZE (128)
+#endif
+
+#ifndef N_STREAMS
+#define N_STREAMS (16)
+#endif
+
+/*
+ * SAXPY (host implementation)
+ * y := a * x + y
+ *
+ * Updates y[i] in place with a * x[i] + y[i] for every i in [0, n).
+ *
+ * y: input/output vector, overwritten with the result
+ * a: scalar multiplier
+ * x: input vector (only read)
+ * n: number of elements processed
+ *
+ * main() calls this on h_z and then compares h_z bit by bit against h_y,
+ * so keep the expression identical to the one in gpu_saxpy.
+ */
+void host_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
+{
+// OpenMP: run the loop in parallel and vectorised, static schedule with the simd modifier.
+#pragma omp parallel for simd schedule(simd \
+                                       : static)
+    for (int i = 0; i < n; i++)
+    {
+        y[i] = a * x[i] + y[i];
+    }
+}
+
+/*
+ * SAXPY (device kernel)
+ * y := a * x + y
+ *
+ * Each thread derives one index i from blockIdx.x, blockDim.x and threadIdx.x
+ * and updates y[i]. Only the x dimension of the grid/block is used.
+ *
+ * y: input/output vector, overwritten with the result
+ * a: scalar multiplier
+ * x: input vector (only read)
+ * n: number of elements to process
+ */
+__global__ void gpu_saxpy(float *__restrict__ y, float a, float *__restrict__ x, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    // Threads whose index falls past the end of the data do nothing.
+    if (i < n)
+        y[i] = a * x[i] + y[i];
+}
+
+/*
+ * Lab skeleton for the SAXPY exercise using cudaMallocManaged buffers.
+ *
+ * Allocates h_x and h_y with cudaMallocManaged and h_z with malloc,
+ * initialises them, times the (still to be written) tiled GPU section, then
+ * runs host_saxpy() on h_z and checks h_y against it bit by bit.
+ *
+ * argv[1] (optional): number of elements n; defaults to N.
+ * Returns 0 on success. Exits with EXIT_FAILURE on an invalid size or a
+ * failed host allocation; gpuErrchk exits with the CUDA error code.
+ */
+int main(int argc, const char **argv)
+{
+    int iret = 0;
+    int n = N;
+    float *h_x;
+    float *h_y;
+    float *h_z;
+    float a = 101.0f / TWO02,
+          b, c;
+
+    if (argc > 1)
+        n = atoi(argv[1]);
+
+    // The tile loop below steps by TILE = n / N_STREAMS. A step of zero would
+    // never terminate, and a remainder would produce more than N_STREAMS
+    // tiles, so reject such sizes before allocating anything.
+    if (n < N_STREAMS || n % N_STREAMS != 0)
+    {
+        printf("error: n must be a positive multiple of %d\n", N_STREAMS);
+        exit(EXIT_FAILURE);
+    }
+
+    //CUDA Buffer Allocation
+    gpuErrchk(cudaMallocManaged((void **)&h_x, sizeof(float) * n));
+    gpuErrchk(cudaMallocManaged((void **)&h_y, sizeof(float) * n));
+
+    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
+    {
+        printf("error: memory allocation for 'z'\n");
+        iret = -1;
+    }
+    if (0 != iret)
+    {
+        // h_x / h_y came from cudaMallocManaged, so they are released with cudaFree.
+        gpuErrchk(cudaFree(h_x));
+        gpuErrchk(cudaFree(h_y));
+        free(h_z);
+        exit(EXIT_FAILURE);
+    }
+
+    //Init Data
+    b = rand() % TWO04;
+    c = rand() % TWO08;
+    for (int i = 0; i < n; i++)
+    {
+        h_x[i] = b / (float)TWO02;
+        h_y[i] = h_z[i] = c / (float)TWO04;
+    }
+
+    start_timer();
+    int TILE = n / N_STREAMS;
+
+    //TODO Create N_STREAMS
+
+    //TODO Loop over the Tiles
+    for (int i = 0; i < n; i += TILE)
+    {
+        //TODO Execute Kernel Tile i (stream i)
+    }
+    //TODO Wait all the streams...
+    stop_timer();
+    // 2.0f * n keeps the FLOP count in floating point; 2 * n overflows int for n >= 2^30.
+    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2.0f * n / ((float)elapsed_ns()));
+
+    //Check Mathematical Consistency
+    start_timer();
+    host_saxpy(h_z, a, h_x, n);
+    stop_timer();
+    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2.0f * n / ((float)elapsed_ns()));
+    for (int i = 0; i < n; ++i)
+    {
+        // XOR of the raw bit patterns: zero only when both floats are bit-identical.
+        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
+        assert(iret == 0);
+    }
+
+    gpuErrchk(cudaFree(h_x));
+    gpuErrchk(cudaFree(h_y));
+    free(h_z);
+
+    // NOTE(review): `stream` is not declared anywhere in this skeleton; it has
+    // to be introduced by the "Create N_STREAMS" TODO above for this to build.
+    for (int i = 0; i < N_STREAMS; ++i)
+        cudaStreamDestroy(stream[i]);
+
+    // CUDA exit -- needed to flush printf write buffer
+    cudaDeviceReset();
+    return 0;
+}
--- a/cuda/lab3/saxpy-v5.cu
+++ b/cuda/lab3/saxpy-v5.cu
@ -0,0 +1,186 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file saxpy-v5.cu
+ * @author Alessandro Capotondi
+ * @date 12 May 2020
+ * @brief Saxpy
+ * 
+ * @see https://dolly.fim.unimore.it/2019/course/view.php?id=152
+ */
+
+#include <assert.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <cuda_runtime.h>
+
+/* Wraps a CUDA runtime call and forwards its status, with the call site's
+ * file and line, to gpuAssert(). */
+#define gpuErrchk(ans)                        \
+    {                                         \
+        gpuAssert((ans), __FILE__, __LINE__); \
+    }
+/* Prints the CUDA error string with file/line to stderr when code is not
+ * cudaSuccess and, unless abort is false, terminates with that code. */
+static inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
+{
+    // Successful calls need no reporting.
+    if (code == cudaSuccess)
+        return;
+
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+    if (!abort)
+        return;
+    exit(code);
+}
+
+extern "C"
+{
+#include "utils.h"
+}
+
+#define TWO02 (1 << 2)
+#define TWO04 (1 << 4)
+#define TWO08 (1 << 8)
+#ifndef N
+#define N (1 << 27)
+#endif
+
+#ifndef BLOCK_SIZE
+#define BLOCK_SIZE (512)
+#endif
+
+/*
+ * SAXPY (host implementation)
+ * y := a * x + y
+ *
+ * Updates y[i] in place with a * x[i] + y[i] for every i in [0, n).
+ *
+ * y: input/output vector, overwritten with the result
+ * a: scalar multiplier
+ * x: input vector (only read)
+ * n: number of elements processed
+ *
+ * main() calls this on h_z and then compares h_z bit by bit against h_y,
+ * so keep the expression identical to the one in gpu_saxpy.
+ */
+void host_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
+{
+// OpenMP: run the loop in parallel and vectorised, static schedule with the simd modifier.
+#pragma omp parallel for simd schedule(simd: static)
+    for (int i = 0; i < n; i++)
+    {
+        y[i] = a * x[i] + y[i];
+    }
+}
+
+/*
+ * SAXPY (device kernel)
+ * y := a * x + y
+ *
+ * Each thread derives one index i from blockIdx.x, blockDim.x and threadIdx.x
+ * and updates y[i]. Only the x dimension of the grid/block is used.
+ *
+ * y: input/output vector, overwritten with the result
+ * a: scalar multiplier
+ * x: input vector (only read)
+ * n: number of elements to process
+ */
+__global__ void gpu_saxpy(float * __restrict__ y, float a, float * __restrict__ x, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    // Threads whose index falls past the end of the data do nothing.
+    if (i < n)
+        y[i] = a * x[i] + y[i];
+}
+
+/*
+ * Lab skeleton for the tiled SAXPY exercise (copy/compute in tiles).
+ *
+ * Allocates and initialises the host vectors, allocates the device buffers,
+ * times the (still to be written) tiled GPU section, then runs host_saxpy()
+ * on h_z and checks h_y against it bit by bit.
+ *
+ * argv[1] (optional): number of elements n; defaults to N.
+ * Returns 0 on success. Exits with EXIT_FAILURE on an invalid size or a
+ * failed host allocation; gpuErrchk exits with the CUDA error code.
+ */
+int main(int argc, const char **argv)
+{
+    // Number of tiles the vectors are split into for the GPU section.
+    const int n_tiles = 8;
+    int iret = 0;
+    int n = N;
+    float *h_x, *d_x;
+    float *h_y, *d_y;
+    float *h_z;
+    float a = 101.0f / TWO02,
+          b, c;
+
+    if (argc > 1)
+        n = atoi(argv[1]);
+
+    // The tile loop below steps by TILE = n / n_tiles. A step of zero would
+    // never terminate, and a remainder would leave a partial trailing tile
+    // that the fixed-size tile TODOs do not account for, so reject such sizes.
+    if (n < n_tiles || n % n_tiles != 0)
+    {
+        printf("error: n must be a positive multiple of %d\n", n_tiles);
+        exit(EXIT_FAILURE);
+    }
+
+    if (NULL == (h_x = (float *)malloc(sizeof(float) * n)))
+    {
+        printf("error: memory allocation for 'x'\n");
+        iret = -1;
+    }
+    if (NULL == (h_y = (float *)malloc(sizeof(float) * n)))
+    {
+        printf("error: memory allocation for 'y'\n");
+        iret = -1;
+    }
+    if (NULL == (h_z = (float *)malloc(sizeof(float) * n)))
+    {
+        printf("error: memory allocation for 'z'\n");
+        iret = -1;
+    }
+    if (0 != iret)
+    {
+        // free(NULL) is a no-op, so releasing all three is safe here.
+        free(h_x);
+        free(h_y);
+        free(h_z);
+        exit(EXIT_FAILURE);
+    }
+
+    //Init Data
+    b = rand() % TWO04;
+    c = rand() % TWO08;
+    for (int i = 0; i < n; i++)
+    {
+        h_x[i] = b / (float)TWO02;
+        h_y[i] = h_z[i] = c / (float)TWO04;
+    }
+
+    //CUDA Buffer Allocation
+    gpuErrchk(cudaMalloc((void **)&d_x, sizeof(float) * n));
+    gpuErrchk(cudaMalloc((void **)&d_y, sizeof(float) * n));
+
+    start_timer();
+    int TILE = n / n_tiles;
+    //TODO Copy to device the first input Tile (i=0)
+
+    //TODO Loop over the Tiles
+    for (int i = 0; i < n; i += TILE)
+    {
+        //TODO Wait Tile i
+
+        //TODO Copy from the device the output tile i-1 (if i>0)
+
+        //TODO Launch Kernel over tile i
+
+        //TODO Copy to the device the input tile i+=TILE (if i+TILE < n)
+    }
+    //TODO Copy out the last tile n-TILE
+    //TODO Wait last tile
+    stop_timer();
+    // 2.0f * n keeps the FLOP count in floating point; 2 * n overflows int for n >= 2^30.
+    printf("saxpy (GPU): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2.0f * n / ((float) elapsed_ns()));
+
+    //Check Mathematical Consistency
+    start_timer();
+    host_saxpy(h_z, a, h_x, n);
+    stop_timer();
+    printf("saxpy (Host): %9.3f sec %9.1f GFLOPS\n", elapsed_ns() / 1.0e9, 2.0f * n / ((float) elapsed_ns()));
+    for (int i = 0; i < n; ++i)
+    {
+        // XOR of the raw bit patterns: zero only when both floats are bit-identical.
+        iret = *(int *)(h_y + i) ^ *(int *)(h_z + i);
+        assert(iret == 0);
+    }
+
+    //Release host and CUDA buffers
+    free(h_x);
+    gpuErrchk(cudaFree(d_x));
+    free(h_y);
+    gpuErrchk(cudaFree(d_y));
+    free(h_z);
+
+    // CUDA exit -- needed to flush printf write buffer
+    cudaDeviceReset();
+    return 0;
+}