HPC OpenMP Lab 3

2025-04-18 12:30:31 +00:00 · 2021-04-21 10:16:41 +02:00 · 2021-04-21 10:16:41 +02:00 · 6454fbf443
commit 6454fbf443
parent 26ca5f4a19
21 changed files with 3539 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -12,4 +12,4 @@ This repo contains the exercises and the tutorials used for Unimore's HPC class
 The exercises related to the OpenMP programming model can be found in the folder `openmp`. Here is the list of currently available classes:
 - `openmp\lab1`: OpenMP basics: *parallel*, *for-loop*, *sections*, and *tasking*.
 - `openmp\lab2`: OpenMP Advanced: *reduction*, *tasking*, *optimizations*.
-
+- `openmp\lab3`: OpenMP 4.x+: *Accelerator Model (targeting: Nvidia GP-GPU)*
--- a/openmp/lab3/.solutions/jacobi-omp1.c
+++ b/openmp/lab3/.solutions/jacobi-omp1.c
@ -0,0 +1,282 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @file jacobi.c
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief This code solves the steady state heat equation on a rectangular region.
+ * This code solves the steady state heat equation on a rectangular region.
+ *  The sequential version of this program needs approximately
+ *  18/epsilon iterations to complete. 
+ *  The physical region, and the boundary conditions, are suggested
+ *  by this diagram;
+ *                 W = 0
+ *           +------------------+
+ *           |                  |
+ *  W = 100  |                  | W = 100
+ *           |                  |
+ *           +------------------+
+ *                 W = 100
+ *  The region is covered with a grid of M by N nodes, and an N by N
+ *  array W is used to record the temperature.  The correspondence between
+ *  array indices and locations in the region is suggested by giving the
+ *  indices of the four corners:
+ *                I = 0
+ *        [0][0]-------------[0][N-1]
+ *           |                  |
+ *    J = 0  |                  |  J = N-1
+ *           |                  |
+ *      [M-1][0]-----------[M-1][N-1]
+ *                I = M-1
+ *  The steady state solution to the discrete heat equation satisfies the
+ *  following condition at an interior grid point:
+ *    W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  where "Central" is the index of the grid point, "North" is the index
+ *  of its immediate neighbor to the "north", and so on.
+ * 
+ *  Given an approximate solution of the steady state heat equation, a
+ *  "better" solution is given by replacing each interior point by the
+ *  average of its 4 neighbors - in other words, by using the condition
+ *  as an ASSIGNMENT statement:
+ *    W[Central]  <=  (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  If this process is repeated often enough, the difference between successive 
+ *  estimates of the solution will go to zero.
+ *  This program carries out such an iteration, using a tolerance specified by
+ *  the user, and writes the final estimate of the solution to a file that can
+ *  be used for graphic processing.
+ * Licensing:
+ *  This code is distributed under the GNU LGPL license.
+ * Modified:
+ *  18 October 2011
+ * Author:
+ *  Original C version by Michael Quinn.
+ *  This C version by John Burkardt.
+ * Reference:
+ *  Michael Quinn,
+ *  Parallel Programming in C with MPI and OpenMP,
+ *  McGraw-Hill, 2004,
+ *  ISBN13: 978-0071232654,
+ *  LC: QA76.73.C15.Q55.
+ * Local parameters:
+ *  Local, double DIFF, the norm of the change in the solution from one iteration
+ *  to the next.
+ *  Local, double MEAN, the average of the boundary values, used to initialize
+ *  the values of the solution in the interior.
+ *  Local, double U[M][N], the solution at the previous iteration.
+ *  Local, double W[M][N], the solution computed at the latest iteration.
+ * 
+ * 
+ * @see https://en.wikipedia.org/wiki/Jacobi_method
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "utils.h"
+
+// Solver configuration and input handle; all of them are set by parse_arguments().
+// Order of the square matrix (the grid holds N * N doubles).
+static int N;
+// Upper bound on the number of solver sweeps.
+static int MAX_ITERATIONS;
+// Assigned 0 by parse_arguments() but never read in this file.
+static int SEED;
+// The solver stops once the per-sweep error is no longer above this value.
+static double CONVERGENCE_THRESHOLD;
+// Binary input file with the initial matrix; opened by parse_arguments(), closed by main().
+static FILE *data;
+
+// Horizontal rule printed around the sections of the report.
+#define SEPARATOR "------------------------------------\n"
+
+// Return the current time in seconds since the Epoch
+// NOTE(review): only declared here; no definition or call is visible in this file -- confirm it is still needed.
+double get_timestamp();
+
+// Parse command line arguments to set solver parameters
+void parse_arguments(int argc, char *argv[]);
+
+// Run the Jacobi solver on the N x N row-major grid A, using xtmp as scratch.
+//
+// Every sweep offloads two target regions: the first stores the average of
+// the four neighbours of each interior point of A into xtmp and records the
+// largest absolute change in err; the second copies the new interior values
+// back into A. Sweeps repeat while err > CONVERGENCE_THRESHOLD and fewer
+// than MAX_ITERATIONS sweeps have been done.
+//
+// Only interior points are copied back, and A is mapped tofrom in the second
+// region: the border of xtmp is never written (and xtmp is mapped "from", so
+// its device copy starts uninitialised). Copying the whole array, or mapping
+// A as "from" only, would overwrite the fixed boundary values of A with
+// indeterminate data.
+//
+// Returns the number of iterations performed
+int run(double *restrict A, double *restrict xtmp)
+{
+    int iter = 0;
+#ifdef DEBUG
+    // Next sweep count at which progress is printed (doubles every time).
+    int iterations_print = 1;
+#endif
+    double err = 0.0;
+
+    do
+    {
+        err = 0.0;
+#pragma omp target map(to: A[0:N * N]) map(from: xtmp[0:N * N]) map(tofrom: err)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
+                err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
+            }
+        }
+
+        // Copy back the interior only; the boundary of A must stay untouched.
+#pragma omp target map(to: xtmp[0:N * N]) map(tofrom: A[0:N * N])
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                A[i * N + j] = xtmp[i * N + j];
+            }
+        }
+        iter++;
+
+#ifdef DEBUG
+        if (iter == iterations_print)
+        {
+            printf("  %8d  %f\n", iter, err);
+            iterations_print = 2 * iterations_print;
+        }
+#endif
+    } while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
+
+    return iter;
+}
+
+// Program entry point: parse the options, load the N x N input matrix from
+// the file opened by parse_arguments(), time the solver and print a report.
+//
+// Returns 0 on success, or EXIT_FAILURE when the input file is not open, the
+// buffers cannot be allocated, or the file holds fewer than N * N doubles.
+int main(int argc, char *argv[])
+{
+    parse_arguments(argc, argv);
+
+    if (data == NULL)
+    {
+        fprintf(stderr, "Input data file is not open\n");
+        return EXIT_FAILURE;
+    }
+
+    const size_t count = (size_t)N * (size_t)N;
+    double *A = malloc(count * sizeof *A);
+    double *xtmp = malloc(count * sizeof *xtmp);
+    if (A == NULL || xtmp == NULL)
+    {
+        fprintf(stderr, "Unable to allocate two %dx%d matrices\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return EXIT_FAILURE;
+    }
+
+    printf(SEPARATOR);
+    printf("Matrix size:            %dx%d\n", N, N);
+    printf("Maximum iterations:     %d\n", MAX_ITERATIONS);
+    printf("Convergence threshold:  %lf\n", CONVERGENCE_THRESHOLD);
+    printf(SEPARATOR);
+
+    // The file is consumed as consecutive doubles in row-major order, so one
+    // bulk read fills A; a short read means the file is truncated or corrupt.
+    if (fread(A, sizeof *A, count, data) != count)
+    {
+        fprintf(stderr, "Unable to read a %dx%d matrix from the input file\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return EXIT_FAILURE;
+    }
+
+    // Run Jacobi solver
+    start_timer();
+    int itr = run(A, xtmp);
+    stop_timer();
+
+    printf("Iterations     = %d\n", itr);
+    printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
+    if (itr == MAX_ITERATIONS)
+        printf("WARNING: solution did not converge\n");
+    printf(SEPARATOR);
+
+    free(A);
+    free(xtmp);
+    fclose(data);
+    return 0;
+}
+
+// Parse a non-negative decimal integer.
+//
+// Returns the value, or -1 when str contains no digits (e.g. an empty
+// argument), has trailing characters, is negative, or is larger than the
+// biggest int. Where long is no wider than int, strtol saturates at that
+// maximum instead of being rejected.
+int parse_int(const char *str)
+{
+    // Largest int value, computed here because <limits.h> is not included by this file.
+    const long int_max = (long)(~0u >> 1);
+    char *end;
+    long value = strtol(str, &end, 10);
+
+    // end == str means that no conversion took place.
+    if (end == str || *end != '\0')
+        return -1;
+    if (value < 0 || value > int_max)
+        return -1;
+    return (int)value;
+}
+
+// Parse a floating-point number.
+//
+// Returns the parsed value (possibly negative; parse_arguments() rejects
+// results below zero), or -1 when str contains no number (e.g. an empty
+// argument), has trailing characters, or parses to NaN.
+double parse_double(const char *str)
+{
+    char *end;
+    double value = strtod(str, &end);
+
+    // end == str means that no conversion took place.
+    if (end == str || *end != '\0')
+        return -1;
+    // NaN is the only value unequal to itself; it would pass a "< 0" check in the caller.
+    if (value != value)
+        return -1;
+    return value;
+}
+
+// Parse command line arguments to set solver parameters.
+//
+// Recognised options: -c/--convergence C, -i/--iterations I, -n/--norder N
+// and -h/--help. Defaults are N = 500, MAX_ITERATIONS = 2000 and
+// CONVERGENCE_THRESHOLD = 0.001. After parsing, the input file matching N
+// (500 or 1000) is opened for binary reading into `data`.
+//
+// Does not return on error: an invalid or unknown option, an unsupported
+// matrix order, or an input file that cannot be opened terminates the
+// program with exit status 1; --help prints the usage text and exits with 0.
+void parse_arguments(int argc, char *argv[])
+{
+    // Set default values
+    N = 500;
+    MAX_ITERATIONS = 2000;
+    CONVERGENCE_THRESHOLD = 0.001;
+    SEED = 0;
+
+    for (int i = 1; i < argc; i++)
+    {
+        if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
+        {
+            // ++i moves on to the option's value; a missing value is an error.
+            if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
+            {
+                printf("Invalid convergence threshold\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
+        {
+            if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid number of iterations\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
+        {
+            if (++i >= argc || (N = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid matrix order\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
+        {
+            printf("\n");
+            printf("Usage: ./jacobi [OPTIONS]\n\n");
+            printf("Options:\n");
+            printf("  -h  --help               Print this message\n");
+            printf("  -c  --convergence  C     Set convergence threshold\n");
+            printf("  -i  --iterations   I     Set maximum number of iterations\n");
+            printf("  -n  --norder       N     Set maxtrix order (500 or 1000)\n");
+            printf("\n");
+            exit(0);
+        }
+        else
+        {
+            printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
+            exit(1);
+        }
+    }
+
+    // Input data is only shipped for these two matrix orders.
+    const char *path = NULL;
+    if (N == 1000)
+        path = "data/jacobi-1000.bin";
+    else if (N == 500)
+        path = "data/jacobi-500.bin";
+    else
+    {
+        printf("Invalid matrix order\n");
+        exit(1);
+    }
+
+    // Fail here rather than let main() read from a NULL stream.
+    data = fopen(path, "rb");
+    if (data == NULL)
+    {
+        perror(path);
+        exit(1);
+    }
+}
--- a/openmp/lab3/.solutions/jacobi-omp2.c
+++ b/openmp/lab3/.solutions/jacobi-omp2.c
@ -0,0 +1,285 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @file jacobi.c
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief This code solves the steady state heat equation on a rectangular region.
+ * This code solves the steady state heat equation on a rectangular region.
+ *  The sequential version of this program needs approximately
+ *  18/epsilon iterations to complete. 
+ *  The physical region, and the boundary conditions, are suggested
+ *  by this diagram;
+ *                 W = 0
+ *           +------------------+
+ *           |                  |
+ *  W = 100  |                  | W = 100
+ *           |                  |
+ *           +------------------+
+ *                 W = 100
+ *  The region is covered with a grid of M by N nodes, and an N by N
+ *  array W is used to record the temperature.  The correspondence between
+ *  array indices and locations in the region is suggested by giving the
+ *  indices of the four corners:
+ *                I = 0
+ *        [0][0]-------------[0][N-1]
+ *           |                  |
+ *    J = 0  |                  |  J = N-1
+ *           |                  |
+ *      [M-1][0]-----------[M-1][N-1]
+ *                I = M-1
+ *  The steady state solution to the discrete heat equation satisfies the
+ *  following condition at an interior grid point:
+ *    W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  where "Central" is the index of the grid point, "North" is the index
+ *  of its immediate neighbor to the "north", and so on.
+ * 
+ *  Given an approximate solution of the steady state heat equation, a
+ *  "better" solution is given by replacing each interior point by the
+ *  average of its 4 neighbors - in other words, by using the condition
+ *  as an ASSIGNMENT statement:
+ *    W[Central]  <=  (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  If this process is repeated often enough, the difference between successive 
+ *  estimates of the solution will go to zero.
+ *  This program carries out such an iteration, using a tolerance specified by
+ *  the user, and writes the final estimate of the solution to a file that can
+ *  be used for graphic processing.
+ * Licensing:
+ *  This code is distributed under the GNU LGPL license.
+ * Modified:
+ *  18 October 2011
+ * Author:
+ *  Original C version by Michael Quinn.
+ *  This C version by John Burkardt.
+ * Reference:
+ *  Michael Quinn,
+ *  Parallel Programming in C with MPI and OpenMP,
+ *  McGraw-Hill, 2004,
+ *  ISBN13: 978-0071232654,
+ *  LC: QA76.73.C15.Q55.
+ * Local parameters:
+ *  Local, double DIFF, the norm of the change in the solution from one iteration
+ *  to the next.
+ *  Local, double MEAN, the average of the boundary values, used to initialize
+ *  the values of the solution in the interior.
+ *  Local, double U[M][N], the solution at the previous iteration.
+ *  Local, double W[M][N], the solution computed at the latest iteration.
+ * 
+ * 
+ * @see https://en.wikipedia.org/wiki/Jacobi_method
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "utils.h"
+
+// Solver configuration and input handle; all of them are set by parse_arguments().
+// Order of the square matrix (the grid holds N * N doubles).
+static int N;
+// Upper bound on the number of solver sweeps.
+static int MAX_ITERATIONS;
+// Assigned 0 by parse_arguments() but never read in this file.
+static int SEED;
+// The solver stops once the per-sweep error is no longer above this value.
+static double CONVERGENCE_THRESHOLD;
+// Binary input file with the initial matrix; opened by parse_arguments(), closed by main().
+static FILE *data;
+
+// Horizontal rule printed around the sections of the report.
+#define SEPARATOR "------------------------------------\n"
+
+// Return the current time in seconds since the Epoch
+// NOTE(review): only declared here; no definition or call is visible in this file -- confirm it is still needed.
+double get_timestamp();
+
+// Parse command line arguments to set solver parameters
+void parse_arguments(int argc, char *argv[]);
+
+// Run the Jacobi solver on the N x N row-major grid A, using xtmp as scratch.
+//
+// Every sweep offloads two target regions, each spread over teams and
+// threads with "teams distribute parallel for": the first stores the average
+// of the four neighbours of each interior point of A into xtmp and reduces
+// the largest absolute change into err; the second copies the new interior
+// values back into A. Sweeps repeat while err > CONVERGENCE_THRESHOLD and
+// fewer than MAX_ITERATIONS sweeps have been done.
+//
+// Only interior points are copied back, and A is mapped tofrom in the second
+// region: the border of xtmp is never written (and xtmp is mapped "from", so
+// its device copy starts uninitialised). Copying the whole array, or mapping
+// A as "from" only, would overwrite the fixed boundary values of A with
+// indeterminate data.
+//
+// Returns the number of iterations performed
+int run(double *restrict A, double *restrict xtmp)
+{
+    int iter = 0;
+#ifdef DEBUG
+    // Next sweep count at which progress is printed (doubles every time).
+    int iterations_print = 1;
+#endif
+    double err = 0.0;
+
+    do
+    {
+        err = 0.0;
+#pragma omp target map(to: A[0:N * N]) map(from: xtmp[0:N * N]) map(tofrom: err)
+#pragma omp teams distribute parallel for reduction(max: err)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
+                err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
+            }
+        }
+
+        // Copy back the interior only; the boundary of A must stay untouched.
+#pragma omp target map(to: xtmp[0:N * N]) map(tofrom: A[0:N * N])
+#pragma omp teams distribute parallel for
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                A[i * N + j] = xtmp[i * N + j];
+            }
+        }
+        iter++;
+
+#ifdef DEBUG
+        if (iter == iterations_print)
+        {
+            printf("  %8d  %f\n", iter, err);
+            iterations_print = 2 * iterations_print;
+        }
+#endif
+    } while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
+
+    return iter;
+}
+
+// Program entry point: parse the options, load the N x N input matrix from
+// the file opened by parse_arguments(), time the solver and print a report.
+//
+// Returns 0 on success, or EXIT_FAILURE when the input file is not open, the
+// buffers cannot be allocated, or the file holds fewer than N * N doubles.
+int main(int argc, char *argv[])
+{
+    parse_arguments(argc, argv);
+
+    if (data == NULL)
+    {
+        fprintf(stderr, "Input data file is not open\n");
+        return EXIT_FAILURE;
+    }
+
+    const size_t count = (size_t)N * (size_t)N;
+    double *A = malloc(count * sizeof *A);
+    double *xtmp = malloc(count * sizeof *xtmp);
+    if (A == NULL || xtmp == NULL)
+    {
+        fprintf(stderr, "Unable to allocate two %dx%d matrices\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return EXIT_FAILURE;
+    }
+
+    printf(SEPARATOR);
+    printf("Matrix size:            %dx%d\n", N, N);
+    printf("Maximum iterations:     %d\n", MAX_ITERATIONS);
+    printf("Convergence threshold:  %lf\n", CONVERGENCE_THRESHOLD);
+    printf(SEPARATOR);
+
+    // The file is consumed as consecutive doubles in row-major order, so one
+    // bulk read fills A; a short read means the file is truncated or corrupt.
+    if (fread(A, sizeof *A, count, data) != count)
+    {
+        fprintf(stderr, "Unable to read a %dx%d matrix from the input file\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return EXIT_FAILURE;
+    }
+
+    // Run Jacobi solver
+    start_timer();
+    int itr = run(A, xtmp);
+    stop_timer();
+
+    printf("Iterations     = %d\n", itr);
+    printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
+    if (itr == MAX_ITERATIONS)
+        printf("WARNING: solution did not converge\n");
+    printf(SEPARATOR);
+
+    free(A);
+    free(xtmp);
+    fclose(data);
+    return 0;
+}
+
+// Parse a non-negative decimal integer.
+//
+// Returns the value, or -1 when str contains no digits (e.g. an empty
+// argument), has trailing characters, is negative, or is larger than the
+// biggest int. Where long is no wider than int, strtol saturates at that
+// maximum instead of being rejected.
+int parse_int(const char *str)
+{
+    // Largest int value, computed here because <limits.h> is not included by this file.
+    const long int_max = (long)(~0u >> 1);
+    char *end;
+    long value = strtol(str, &end, 10);
+
+    // end == str means that no conversion took place.
+    if (end == str || *end != '\0')
+        return -1;
+    if (value < 0 || value > int_max)
+        return -1;
+    return (int)value;
+}
+
+// Parse a floating-point number.
+//
+// Returns the parsed value (possibly negative; parse_arguments() rejects
+// results below zero), or -1 when str contains no number (e.g. an empty
+// argument), has trailing characters, or parses to NaN.
+double parse_double(const char *str)
+{
+    char *end;
+    double value = strtod(str, &end);
+
+    // end == str means that no conversion took place.
+    if (end == str || *end != '\0')
+        return -1;
+    // NaN is the only value unequal to itself; it would pass a "< 0" check in the caller.
+    if (value != value)
+        return -1;
+    return value;
+}
+
+// Parse command line arguments to set solver parameters.
+//
+// Recognised options: -c/--convergence C, -i/--iterations I, -n/--norder N
+// and -h/--help. Defaults are N = 500, MAX_ITERATIONS = 2000 and
+// CONVERGENCE_THRESHOLD = 0.001. After parsing, the input file matching N
+// (500 or 1000) is opened for binary reading into `data`.
+//
+// Does not return on error: an invalid or unknown option, an unsupported
+// matrix order, or an input file that cannot be opened terminates the
+// program with exit status 1; --help prints the usage text and exits with 0.
+void parse_arguments(int argc, char *argv[])
+{
+    // Set default values
+    N = 500;
+    MAX_ITERATIONS = 2000;
+    CONVERGENCE_THRESHOLD = 0.001;
+    SEED = 0;
+
+    for (int i = 1; i < argc; i++)
+    {
+        if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
+        {
+            // ++i moves on to the option's value; a missing value is an error.
+            if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
+            {
+                printf("Invalid convergence threshold\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
+        {
+            if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid number of iterations\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
+        {
+            if (++i >= argc || (N = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid matrix order\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
+        {
+            printf("\n");
+            printf("Usage: ./jacobi [OPTIONS]\n\n");
+            printf("Options:\n");
+            printf("  -h  --help               Print this message\n");
+            printf("  -c  --convergence  C     Set convergence threshold\n");
+            printf("  -i  --iterations   I     Set maximum number of iterations\n");
+            printf("  -n  --norder       N     Set maxtrix order (500 or 1000)\n");
+            printf("\n");
+            exit(0);
+        }
+        else
+        {
+            printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
+            exit(1);
+        }
+    }
+
+    // Input data is only shipped for these two matrix orders.
+    const char *path = NULL;
+    if (N == 1000)
+        path = "data/jacobi-1000.bin";
+    else if (N == 500)
+        path = "data/jacobi-500.bin";
+    else
+    {
+        printf("Invalid matrix order\n");
+        exit(1);
+    }
+
+    // Fail here rather than let main() read from a NULL stream.
+    data = fopen(path, "rb");
+    if (data == NULL)
+    {
+        perror(path);
+        exit(1);
+    }
+}
--- a/openmp/lab3/.solutions/jacobi-omp3.c
+++ b/openmp/lab3/.solutions/jacobi-omp3.c
@ -0,0 +1,293 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @file jacobi.c
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief This code solves the steady state heat equation on a rectangular region.
+ * This code solves the steady state heat equation on a rectangular region.
+ *  The sequential version of this program needs approximately
+ *  18/epsilon iterations to complete. 
+ *  The physical region, and the boundary conditions, are suggested
+ *  by this diagram;
+ *                 W = 0
+ *           +------------------+
+ *           |                  |
+ *  W = 100  |                  | W = 100
+ *           |                  |
+ *           +------------------+
+ *                 W = 100
+ *  The region is covered with a grid of M by N nodes, and an N by N
+ *  array W is used to record the temperature.  The correspondence between
+ *  array indices and locations in the region is suggested by giving the
+ *  indices of the four corners:
+ *                I = 0
+ *        [0][0]-------------[0][N-1]
+ *           |                  |
+ *    J = 0  |                  |  J = N-1
+ *           |                  |
+ *      [M-1][0]-----------[M-1][N-1]
+ *                I = M-1
+ *  The steady state solution to the discrete heat equation satisfies the
+ *  following condition at an interior grid point:
+ *    W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  where "Central" is the index of the grid point, "North" is the index
+ *  of its immediate neighbor to the "north", and so on.
+ * 
+ *  Given an approximate solution of the steady state heat equation, a
+ *  "better" solution is given by replacing each interior point by the
+ *  average of its 4 neighbors - in other words, by using the condition
+ *  as an ASSIGNMENT statement:
+ *    W[Central]  <=  (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  If this process is repeated often enough, the difference between successive 
+ *  estimates of the solution will go to zero.
+ *  This program carries out such an iteration, using a tolerance specified by
+ *  the user, and writes the final estimate of the solution to a file that can
+ *  be used for graphic processing.
+ * Licensing:
+ *  This code is distributed under the GNU LGPL license.
+ * Modified:
+ *  18 October 2011
+ * Author:
+ *  Original C version by Michael Quinn.
+ *  This C version by John Burkardt.
+ * Reference:
+ *  Michael Quinn,
+ *  Parallel Programming in C with MPI and OpenMP,
+ *  McGraw-Hill, 2004,
+ *  ISBN13: 978-0071232654,
+ *  LC: QA76.73.C15.Q55.
+ * Local parameters:
+ *  Local, double DIFF, the norm of the change in the solution from one iteration
+ *  to the next.
+ *  Local, double MEAN, the average of the boundary values, used to initialize
+ *  the values of the solution in the interior.
+ *  Local, double U[M][N], the solution at the previous iteration.
+ *  Local, double W[M][N], the solution computed at the latest iteration.
+ * 
+ * 
+ * @see https://en.wikipedia.org/wiki/Jacobi_method
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "utils.h"
+
+// Solver configuration and input handle; all of them are set by parse_arguments().
+// Order of the square matrix (the grid holds N * N doubles).
+static int N;
+// Upper bound on the number of solver sweeps.
+static int MAX_ITERATIONS;
+// Assigned 0 by parse_arguments() but never read in this file.
+static int SEED;
+// The solver stops once the per-sweep error is no longer above this value.
+static double CONVERGENCE_THRESHOLD;
+// Binary input file with the initial matrix; opened by parse_arguments(), closed by main().
+static FILE *data;
+
+// Horizontal rule printed around the sections of the report.
+#define SEPARATOR "------------------------------------\n"
+
+// Return the current time in seconds since the Epoch
+// NOTE(review): only declared here; no definition or call is visible in this file -- confirm it is still needed.
+double get_timestamp();
+
+// Parse command line arguments to set solver parameters
+void parse_arguments(int argc, char *argv[]);
+
+// Run the Jacobi solver on the N x N row-major grid A, using xtmp as scratch.
+//
+// Every sweep offloads two "target teams" regions wrapped in "target data"
+// regions with the same mappings: the first stores the average of the four
+// neighbours of each interior point of A into xtmp and reduces the largest
+// absolute change into err; the second copies the new interior values back
+// into A. Sweeps repeat while err > CONVERGENCE_THRESHOLD and fewer than
+// MAX_ITERATIONS sweeps have been done.
+//
+// Only interior points are copied back, and A is mapped tofrom in the second
+// region: the border of xtmp is never written (and xtmp is mapped "from", so
+// its device copy starts uninitialised). Copying the whole array, or mapping
+// A as "from" only, would overwrite the fixed boundary values of A with
+// indeterminate data.
+//
+// NTHREADS_GPU is not defined in this file -- presumably it comes from
+// utils.h or the build flags; confirm.
+// NOTE(review): the reduction clause sits on "distribute parallel for", not
+// on "teams"; confirm that err is combined correctly across teams with the
+// compiler in use.
+//
+// Returns the number of iterations performed
+int run(double *restrict A, double *restrict xtmp)
+{
+    int iter = 0;
+#ifdef DEBUG
+    // Next sweep count at which progress is printed (doubles every time).
+    int iterations_print = 1;
+#endif
+    double err = 0.0;
+
+    do
+    {
+        err = 0.0;
+#pragma omp target data map(to: A[0:N * N]) map(from: xtmp[0:N * N]) map(tofrom: err)
+#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU) \
+    map(to: A[0:N * N]) map(from: xtmp[0:N * N]) map(tofrom: err)
+#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) \
+    dist_schedule(static, NTHREADS_GPU) reduction(max: err) schedule(static, 1)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
+                err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
+            }
+        }
+
+        // Copy back the interior only; the boundary of A must stay untouched.
+#pragma omp target data map(tofrom: A[0:N * N]) map(to: xtmp[0:N * N])
+#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU) \
+    map(tofrom: A[0:N * N]) map(to: xtmp[0:N * N])
+#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) \
+    dist_schedule(static, NTHREADS_GPU) schedule(static, 1)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                A[i * N + j] = xtmp[i * N + j];
+            }
+        }
+        iter++;
+
+#ifdef DEBUG
+        if (iter == iterations_print)
+        {
+            printf("  %8d  %f\n", iter, err);
+            iterations_print = 2 * iterations_print;
+        }
+#endif
+    } while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
+
+    return iter;
+}
+
+// Program entry point: parse the options, load the N x N input matrix from
+// the file opened by parse_arguments(), time the solver and print a report.
+//
+// Returns 0 on success, or EXIT_FAILURE when the input file is not open, the
+// buffers cannot be allocated, or the file holds fewer than N * N doubles.
+int main(int argc, char *argv[])
+{
+    parse_arguments(argc, argv);
+
+    if (data == NULL)
+    {
+        fprintf(stderr, "Input data file is not open\n");
+        return EXIT_FAILURE;
+    }
+
+    const size_t count = (size_t)N * (size_t)N;
+    double *A = malloc(count * sizeof *A);
+    double *xtmp = malloc(count * sizeof *xtmp);
+    if (A == NULL || xtmp == NULL)
+    {
+        fprintf(stderr, "Unable to allocate two %dx%d matrices\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return EXIT_FAILURE;
+    }
+
+    printf(SEPARATOR);
+    printf("Matrix size:            %dx%d\n", N, N);
+    printf("Maximum iterations:     %d\n", MAX_ITERATIONS);
+    printf("Convergence threshold:  %lf\n", CONVERGENCE_THRESHOLD);
+    printf(SEPARATOR);
+
+    // The file is consumed as consecutive doubles in row-major order, so one
+    // bulk read fills A; a short read means the file is truncated or corrupt.
+    if (fread(A, sizeof *A, count, data) != count)
+    {
+        fprintf(stderr, "Unable to read a %dx%d matrix from the input file\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return EXIT_FAILURE;
+    }
+
+    // Run Jacobi solver
+    start_timer();
+    int itr = run(A, xtmp);
+    stop_timer();
+
+    printf("Iterations     = %d\n", itr);
+    printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
+    if (itr == MAX_ITERATIONS)
+        printf("WARNING: solution did not converge\n");
+    printf(SEPARATOR);
+
+    free(A);
+    free(xtmp);
+    fclose(data);
+    return 0;
+}
+
+// Parse a non-negative decimal integer.
+//
+// Returns the value, or -1 when str contains no digits (e.g. an empty
+// argument), has trailing characters, is negative, or is larger than the
+// biggest int. Where long is no wider than int, strtol saturates at that
+// maximum instead of being rejected.
+int parse_int(const char *str)
+{
+    // Largest int value, computed here because <limits.h> is not included by this file.
+    const long int_max = (long)(~0u >> 1);
+    char *end;
+    long value = strtol(str, &end, 10);
+
+    // end == str means that no conversion took place.
+    if (end == str || *end != '\0')
+        return -1;
+    if (value < 0 || value > int_max)
+        return -1;
+    return (int)value;
+}
+
+// Parse a floating-point number.
+//
+// Returns the parsed value (possibly negative; parse_arguments() rejects
+// results below zero), or -1 when str contains no number (e.g. an empty
+// argument), has trailing characters, or parses to NaN.
+double parse_double(const char *str)
+{
+    char *end;
+    double value = strtod(str, &end);
+
+    // end == str means that no conversion took place.
+    if (end == str || *end != '\0')
+        return -1;
+    // NaN is the only value unequal to itself; it would pass a "< 0" check in the caller.
+    if (value != value)
+        return -1;
+    return value;
+}
+
+// Parse command line arguments to set solver parameters.
+//
+// Recognised options: -c/--convergence C, -i/--iterations I, -n/--norder N
+// and -h/--help. Defaults are N = 500, MAX_ITERATIONS = 2000 and
+// CONVERGENCE_THRESHOLD = 0.001. After parsing, the input file matching N
+// (500 or 1000) is opened for binary reading into `data`.
+//
+// Does not return on error: an invalid or unknown option, an unsupported
+// matrix order, or an input file that cannot be opened terminates the
+// program with exit status 1; --help prints the usage text and exits with 0.
+void parse_arguments(int argc, char *argv[])
+{
+    // Set default values
+    N = 500;
+    MAX_ITERATIONS = 2000;
+    CONVERGENCE_THRESHOLD = 0.001;
+    SEED = 0;
+
+    for (int i = 1; i < argc; i++)
+    {
+        if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
+        {
+            // ++i moves on to the option's value; a missing value is an error.
+            if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
+            {
+                printf("Invalid convergence threshold\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
+        {
+            if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid number of iterations\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
+        {
+            if (++i >= argc || (N = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid matrix order\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
+        {
+            printf("\n");
+            printf("Usage: ./jacobi [OPTIONS]\n\n");
+            printf("Options:\n");
+            printf("  -h  --help               Print this message\n");
+            printf("  -c  --convergence  C     Set convergence threshold\n");
+            printf("  -i  --iterations   I     Set maximum number of iterations\n");
+            printf("  -n  --norder       N     Set maxtrix order (500 or 1000)\n");
+            printf("\n");
+            exit(0);
+        }
+        else
+        {
+            printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
+            exit(1);
+        }
+    }
+
+    // Input data is only shipped for these two matrix orders.
+    const char *path = NULL;
+    if (N == 1000)
+        path = "data/jacobi-1000.bin";
+    else if (N == 500)
+        path = "data/jacobi-500.bin";
+    else
+    {
+        printf("Invalid matrix order\n");
+        exit(1);
+    }
+
+    // Fail here rather than let main() read from a NULL stream.
+    data = fopen(path, "rb");
+    if (data == NULL)
+    {
+        perror(path);
+        exit(1);
+    }
+}
--- a/openmp/lab3/.solutions/jacobi-omp4.c
+++ b/openmp/lab3/.solutions/jacobi-omp4.c
@ -0,0 +1,292 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @file jacobi.c
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief This code solves the steady state heat equation on a rectangular region.
+ * This code solves the steady state heat equation on a rectangular region.
+ *  The sequential version of this program needs approximately
+ *  18/epsilon iterations to complete. 
+ *  The physical region, and the boundary conditions, are suggested
+ *  by this diagram;
+ *                 W = 0
+ *           +------------------+
+ *           |                  |
+ *  W = 100  |                  | W = 100
+ *           |                  |
+ *           +------------------+
+ *                 W = 100
+ *  The region is covered with a grid of M by N nodes, and an N by N
+ *  array W is used to record the temperature.  The correspondence between
+ *  array indices and locations in the region is suggested by giving the
+ *  indices of the four corners:
+ *                I = 0
+ *        [0][0]-------------[0][N-1]
+ *           |                  |
+ *    J = 0  |                  |  J = N-1
+ *           |                  |
+ *      [M-1][0]-----------[M-1][N-1]
+ *                I = M-1
+ *  The steady state solution to the discrete heat equation satisfies the
+ *  following condition at an interior grid point:
+ *    W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  where "Central" is the index of the grid point, "North" is the index
+ *  of its immediate neighbor to the "north", and so on.
+ * 
+ *  Given an approximate solution of the steady state heat equation, a
+ *  "better" solution is given by replacing each interior point by the
+ *  average of its 4 neighbors - in other words, by using the condition
+ *  as an ASSIGNMENT statement:
+ *    W[Central]  <=  (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  If this process is repeated often enough, the difference between successive 
+ *  estimates of the solution will go to zero.
+ *  This program carries out such an iteration, using a tolerance specified by
+ *  the user, and writes the final estimate of the solution to a file that can
+ *  be used for graphic processing.
+ * Licensing:
+ *  This code is distributed under the GNU LGPL license. 
+ * Modified:
+ *  18 October 2011
+ * Author:
+ *  Original C version by Michael Quinn.
+ *  This C version by John Burkardt.
+ * Reference:
+ *  Michael Quinn,
+ *  Parallel Programming in C with MPI and OpenMP,
+ *  McGraw-Hill, 2004,
+ *  ISBN13: 978-0071232654,
+ *  LC: QA76.73.C15.Q55.
+ * Local parameters:
+ *  Local, double DIFF, the norm of the change in the solution from one iteration
+ *  to the next.
+ *  Local, double MEAN, the average of the boundary values, used to initialize
+ *  the values of the solution in the interior.
+ *  Local, double U[M][N], the solution at the previous iteration.
+ *  Local, double W[M][N], the solution computed at the latest iteration.
+ * 
+ * 
+ * @see https://en.wikipedia.org/wiki/Jacobi_method
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "utils.h"
+
+static int N;
+static int MAX_ITERATIONS;
+static int SEED;
+static double CONVERGENCE_THRESHOLD;
+static FILE *data;
+
+#define SEPARATOR "------------------------------------\n"
+
+// Return the current time in seconds since the Epoch
+double get_timestamp();
+
+// Parse command line arguments to set solver parameters
+void parse_arguments(int argc, char *argv[]);
+
+/**
+ * Run the Jacobi solver using OpenMP target offloading.
+ *
+ * A is copied to the device once, iterated on there, and copied back when the
+ * loop ends. xtmp is only allocated on the device (map(alloc)/map(release)),
+ * so its host contents are neither read nor written.
+ *
+ * @param A     N x N grid stored row by row (indexed as A[i * N + j]); holds
+ *              the initial state on entry and the final estimate on return.
+ * @param xtmp  N x N scratch buffer for the freshly computed interior values.
+ * @return      Number of sweeps performed.
+ *
+ * NOTE(review): NTHREADS_GPU is not defined in this file (presumably it comes
+ * from utils.h); num_teams(N / NTHREADS_GPU) assumes N >= NTHREADS_GPU.
+ */
+int run(double *restrict A, double *restrict xtmp)
+{
+    int iter = 0;
+    double err = 0.0;
+#ifdef DEBUG
+    int iterations_print = 1;
+#endif
+
+#pragma omp target enter data map(to : A[0:N * N]) map(alloc : xtmp[0:N * N])
+    do
+    {
+        err = 0.0;
+
+        // The reduction is repeated on the teams level: without it every team
+        // would combine its partial maximum into the same shared `err`.
+#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU) \
+    map(tofrom : err) reduction(max : err)
+#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) \
+    dist_schedule(static, NTHREADS_GPU) reduction(max : err)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
+                double diff = fabs(xtmp[i * N + j] - A[i * N + j]);
+                // Branch-free max: select by multiplication instead of an if
+                int swap = diff > err;
+                err = diff * swap + err * !swap;
+            }
+        }
+
+        // Copy back the interior only. The boundary cells of xtmp are never
+        // written (the buffer is merely allocated on the device), so copying
+        // them would replace the fixed boundary values of A with
+        // indeterminate data.
+#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU)
+#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) \
+    dist_schedule(static, NTHREADS_GPU)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                A[i * N + j] = xtmp[i * N + j];
+            }
+        }
+        iter++;
+
+#ifdef DEBUG
+        // Report the error at iterations 1, 2, 4, 8, ...
+        if (iter == iterations_print)
+        {
+            printf("  %8d  %f\n", iter, err);
+            iterations_print = 2 * iterations_print;
+        }
+#endif
+    } while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
+
+    // Release the same array section that was mapped on entry; naming only
+    // the pointer would not refer to the device buffer.
+#pragma omp target exit data map(from : A[0:N * N]) map(release : xtmp[0:N * N])
+
+    return iter;
+}
+
+/**
+ * Program entry point: parse the options, load the N x N input grid from the
+ * file opened by parse_arguments(), run the solver and print its timing.
+ *
+ * start_timer(), stop_timer() and elapsed_ns() are not defined in this file;
+ * presumably they are provided by utils.h.
+ *
+ * @return 0 on success, 1 if the input cannot be opened, allocated or read.
+ */
+int main(int argc, char *argv[])
+{
+    parse_arguments(argc, argv);
+
+    if (data == NULL)
+    {
+        printf("Unable to open the input data file\n");
+        return 1;
+    }
+
+    const size_t count = (size_t)N * (size_t)N;
+    double *A = malloc(count * sizeof *A);
+    double *xtmp = malloc(count * sizeof *xtmp);
+    if (A == NULL || xtmp == NULL)
+    {
+        printf("Unable to allocate memory for the %dx%d grid\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return 1;
+    }
+
+    printf(SEPARATOR);
+    printf("Matrix size:            %dx%d\n", N, N);
+    printf("Maximum iterations:     %d\n", MAX_ITERATIONS);
+    printf("Convergence threshold:  %lf\n", CONVERGENCE_THRESHOLD);
+    printf(SEPARATOR);
+
+    // The file stores the grid as consecutive doubles in the same order as A,
+    // so it can be read in one call; a short read means a truncated file.
+    if (fread(A, sizeof *A, count, data) != count)
+    {
+        printf("Unable to read the %dx%d input grid\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return 1;
+    }
+
+    // Run Jacobi solver
+    start_timer();
+    int itr = run(A, xtmp);
+    stop_timer();
+
+    printf("Iterations     = %d\n", itr);
+    printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
+    if (itr == MAX_ITERATIONS)
+        printf("WARNING: solution did not converge\n");
+    printf(SEPARATOR);
+
+    free(A);
+    free(xtmp);
+    fclose(data);
+    return 0;
+}
+
+/**
+ * Parse a non-negative decimal integer.
+ *
+ * @param str  NUL-terminated text to convert.
+ * @return     The parsed value, or -1 if str is empty, has trailing
+ *             characters, is negative, or does not fit in an int.
+ */
+int parse_int(const char *str)
+{
+    char *next;
+    long value = strtol(str, &next, 10);
+
+    // No digits consumed, or junk after the number
+    if (next == str || *next != '\0')
+        return -1;
+    // Negative, or would be truncated by the conversion to int
+    if (value < 0 || value != (long)(int)value)
+        return -1;
+    return (int)value;
+}
+
+/**
+ * Parse a floating-point number.
+ *
+ * @param str  NUL-terminated text to convert.
+ * @return     The parsed value, or -1 if str is empty, has trailing
+ *             characters, or is NaN. Callers treat any negative result as
+ *             invalid.
+ */
+double parse_double(const char *str)
+{
+    char *next;
+    double value = strtod(str, &next);
+
+    // No characters consumed, or junk after the number
+    if (next == str || *next != '\0')
+        return -1;
+    // NaN would pass the callers' "< 0" validity test, so reject it here
+    if (isnan(value))
+        return -1;
+    return value;
+}
+
+/**
+ * Parse the command line into the solver's file-scope settings and open the
+ * input data file.
+ *
+ * Defaults: N = 500, MAX_ITERATIONS = 2000, CONVERGENCE_THRESHOLD = 0.001,
+ * SEED = 0. An invalid or unknown option prints a message and exits with
+ * status 1; --help prints the usage text and exits with status 0.
+ *
+ * On return `data` is an open stream on data/jacobi-<N>.bin. The path is
+ * relative, so the program has to be started from the directory that
+ * contains data/.
+ *
+ * @param argc  Argument count as passed to main().
+ * @param argv  Argument vector as passed to main().
+ */
+void parse_arguments(int argc, char *argv[])
+{
+    // Set default values
+    N = 500;
+    MAX_ITERATIONS = 2000;
+    CONVERGENCE_THRESHOLD = 0.001;
+    SEED = 0;
+
+    for (int i = 1; i < argc; i++)
+    {
+        if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
+        {
+            // ++i moves on to the option's value; it must exist and be valid
+            if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
+            {
+                printf("Invalid convergence threshold\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
+        {
+            if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid number of iterations\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
+        {
+            if (++i >= argc || (N = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid matrix order\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
+        {
+            printf("\n");
+            printf("Usage: ./jacobi [OPTIONS]\n\n");
+            printf("Options:\n");
+            printf("  -h  --help               Print this message\n");
+            printf("  -c  --convergence  C     Set convergence threshold\n");
+            printf("  -i  --iterations   I     Set maximum number of iterations\n");
+            printf("  -n  --norder       N     Set maxtrix order (500 or 1000)\n");
+            printf("\n");
+            exit(0);
+        }
+        else
+        {
+            printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
+            exit(1);
+        }
+    }
+
+    // Only the two orders that have a matching input file are accepted
+    const char *path;
+    if (N == 1000)
+        path = "data/jacobi-1000.bin";
+    else if (N == 500)
+        path = "data/jacobi-500.bin";
+    else
+    {
+        printf("Invalid matrix order\n");
+        exit(1);
+    }
+
+    data = fopen(path, "rb");
+    if (data == NULL)
+    {
+        // Fail here instead of letting main() read from a NULL stream
+        perror(path);
+        exit(1);
+    }
+}
--- a/openmp/lab3/.solutions/jacobi-omp5.c
+++ b/openmp/lab3/.solutions/jacobi-omp5.c
@ -0,0 +1,291 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @file jacobi.c
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief This code solves the steady state heat equation on a rectangular region.
+ * This code solves the steady state heat equation on a rectangular region.
+ *  The sequential version of this program needs approximately
+ *  18/epsilon iterations to complete. 
+ *  The physical region, and the boundary conditions, are suggested
+ *  by this diagram;
+ *                 W = 0
+ *           +------------------+
+ *           |                  |
+ *  W = 100  |                  | W = 100
+ *           |                  |
+ *           +------------------+
+ *                 W = 100
+ *  The region is covered with a grid of M by N nodes, and an N by N
+ *  array W is used to record the temperature.  The correspondence between
+ *  array indices and locations in the region is suggested by giving the
+ *  indices of the four corners:
+ *                I = 0
+ *        [0][0]-------------[0][N-1]
+ *           |                  |
+ *    J = 0  |                  |  J = N-1
+ *           |                  |
+ *      [M-1][0]-----------[M-1][N-1]
+ *                I = M-1
+ *  The steady state solution to the discrete heat equation satisfies the
+ *  following condition at an interior grid point:
+ *    W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  where "Central" is the index of the grid point, "North" is the index
+ *  of its immediate neighbor to the "north", and so on.
+ * 
+ *  Given an approximate solution of the steady state heat equation, a
+ *  "better" solution is given by replacing each interior point by the
+ *  average of its 4 neighbors - in other words, by using the condition
+ *  as an ASSIGNMENT statement:
+ *    W[Central]  <=  (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  If this process is repeated often enough, the difference between successive 
+ *  estimates of the solution will go to zero.
+ *  This program carries out such an iteration, using a tolerance specified by
+ *  the user, and writes the final estimate of the solution to a file that can
+ *  be used for graphic processing.
+ * Licensing:
+ *  This code is distributed under the GNU LGPL license. 
+ * Modified:
+ *  18 October 2011
+ * Author:
+ *  Original C version by Michael Quinn.
+ *  This C version by John Burkardt.
+ * Reference:
+ *  Michael Quinn,
+ *  Parallel Programming in C with MPI and OpenMP,
+ *  McGraw-Hill, 2004,
+ *  ISBN13: 978-0071232654,
+ *  LC: QA76.73.C15.Q55.
+ * Local parameters:
+ *  Local, double DIFF, the norm of the change in the solution from one iteration
+ *  to the next.
+ *  Local, double MEAN, the average of the boundary values, used to initialize
+ *  the values of the solution in the interior.
+ *  Local, double U[M][N], the solution at the previous iteration.
+ *  Local, double W[M][N], the solution computed at the latest iteration.
+ * 
+ * 
+ * @see https://en.wikipedia.org/wiki/Jacobi_method
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "utils.h"
+
+static int N;
+static int MAX_ITERATIONS;
+static int SEED;
+static double CONVERGENCE_THRESHOLD;
+static FILE *data;
+
+#define SEPARATOR "------------------------------------\n"
+
+// Return the current time in seconds since the Epoch
+double get_timestamp();
+
+// Parse command line arguments to set solver parameters
+void parse_arguments(int argc, char *argv[]);
+
+/**
+ * Run the Jacobi solver using OpenMP target offloading.
+ *
+ * A is copied to the device once, iterated on there, and copied back when the
+ * loop ends. xtmp is only allocated on the device (map(alloc)/map(release)),
+ * so its host contents are neither read nor written.
+ *
+ * @param A     N x N grid stored row by row (indexed as A[i * N + j]); holds
+ *              the initial state on entry and the final estimate on return.
+ * @param xtmp  N x N scratch buffer for the freshly computed interior values.
+ * @return      Number of sweeps performed.
+ *
+ * NOTE(review): NTHREADS_GPU is not defined in this file (presumably it comes
+ * from utils.h); num_teams(N / NTHREADS_GPU) assumes N >= NTHREADS_GPU.
+ */
+int run(double *restrict A, double *restrict xtmp)
+{
+    int iter = 0;
+    double err = 0.0;
+#ifdef DEBUG
+    int iterations_print = 1;
+#endif
+
+#pragma omp target enter data map(to : A[0:N * N]) map(alloc : xtmp[0:N * N])
+    do
+    {
+        err = 0.0;
+
+        // The reduction is repeated on the teams level: without it every team
+        // would combine its partial maximum into the same shared `err`.
+#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU) \
+    map(tofrom : err) reduction(max : err)
+#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) \
+    dist_schedule(static, NTHREADS_GPU) reduction(max : err)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
+                err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
+            }
+        }
+
+        // Copy back the interior only. The boundary cells of xtmp are never
+        // written (the buffer is merely allocated on the device), so copying
+        // them would replace the fixed boundary values of A with
+        // indeterminate data.
+#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU)
+#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) \
+    dist_schedule(static, NTHREADS_GPU)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                A[i * N + j] = xtmp[i * N + j];
+            }
+        }
+        iter++;
+
+#ifdef DEBUG
+        // Report the error at iterations 1, 2, 4, 8, ...
+        if (iter == iterations_print)
+        {
+            printf("  %8d  %f\n", iter, err);
+            iterations_print = 2 * iterations_print;
+        }
+#endif
+    } while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
+
+    // Release the same array section that was mapped on entry; naming only
+    // the pointer would not refer to the device buffer.
+#pragma omp target exit data map(from : A[0:N * N]) map(release : xtmp[0:N * N])
+
+    return iter;
+}
+
+/**
+ * Program entry point: parse the options, load the N x N input grid from the
+ * file opened by parse_arguments(), run the solver and print its timing.
+ *
+ * start_timer(), stop_timer() and elapsed_ns() are not defined in this file;
+ * presumably they are provided by utils.h.
+ *
+ * @return 0 on success, 1 if the input cannot be opened, allocated or read.
+ */
+int main(int argc, char *argv[])
+{
+    parse_arguments(argc, argv);
+
+    if (data == NULL)
+    {
+        printf("Unable to open the input data file\n");
+        return 1;
+    }
+
+    const size_t count = (size_t)N * (size_t)N;
+    double *A = malloc(count * sizeof *A);
+    double *xtmp = malloc(count * sizeof *xtmp);
+    if (A == NULL || xtmp == NULL)
+    {
+        printf("Unable to allocate memory for the %dx%d grid\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return 1;
+    }
+
+    printf(SEPARATOR);
+    printf("Matrix size:            %dx%d\n", N, N);
+    printf("Maximum iterations:     %d\n", MAX_ITERATIONS);
+    printf("Convergence threshold:  %lf\n", CONVERGENCE_THRESHOLD);
+    printf(SEPARATOR);
+
+    // The file stores the grid as consecutive doubles in the same order as A,
+    // so it can be read in one call; a short read means a truncated file.
+    if (fread(A, sizeof *A, count, data) != count)
+    {
+        printf("Unable to read the %dx%d input grid\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return 1;
+    }
+
+    // Run Jacobi solver
+    start_timer();
+    int itr = run(A, xtmp);
+    stop_timer();
+
+    printf("Iterations     = %d\n", itr);
+    printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
+    if (itr == MAX_ITERATIONS)
+        printf("WARNING: solution did not converge\n");
+    printf(SEPARATOR);
+
+    free(A);
+    free(xtmp);
+    fclose(data);
+    return 0;
+}
+
+/**
+ * Parse a non-negative decimal integer.
+ *
+ * @param str  NUL-terminated text to convert.
+ * @return     The parsed value, or -1 if str is empty, has trailing
+ *             characters, is negative, or does not fit in an int.
+ */
+int parse_int(const char *str)
+{
+    char *next;
+    long value = strtol(str, &next, 10);
+
+    // No digits consumed, or junk after the number
+    if (next == str || *next != '\0')
+        return -1;
+    // Negative, or would be truncated by the conversion to int
+    if (value < 0 || value != (long)(int)value)
+        return -1;
+    return (int)value;
+}
+
+/**
+ * Parse a floating-point number.
+ *
+ * @param str  NUL-terminated text to convert.
+ * @return     The parsed value, or -1 if str is empty, has trailing
+ *             characters, or is NaN. Callers treat any negative result as
+ *             invalid.
+ */
+double parse_double(const char *str)
+{
+    char *next;
+    double value = strtod(str, &next);
+
+    // No characters consumed, or junk after the number
+    if (next == str || *next != '\0')
+        return -1;
+    // NaN would pass the callers' "< 0" validity test, so reject it here
+    if (isnan(value))
+        return -1;
+    return value;
+}
+
+/**
+ * Parse the command line into the solver's file-scope settings and open the
+ * input data file.
+ *
+ * Defaults: N = 500, MAX_ITERATIONS = 2000, CONVERGENCE_THRESHOLD = 0.001,
+ * SEED = 0. An invalid or unknown option prints a message and exits with
+ * status 1; --help prints the usage text and exits with status 0.
+ *
+ * On return `data` is an open stream on data/jacobi-<N>.bin. The path is
+ * relative, so the program has to be started from the directory that
+ * contains data/.
+ *
+ * @param argc  Argument count as passed to main().
+ * @param argv  Argument vector as passed to main().
+ */
+void parse_arguments(int argc, char *argv[])
+{
+    // Set default values
+    N = 500;
+    MAX_ITERATIONS = 2000;
+    CONVERGENCE_THRESHOLD = 0.001;
+    SEED = 0;
+
+    for (int i = 1; i < argc; i++)
+    {
+        if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
+        {
+            // ++i moves on to the option's value; it must exist and be valid
+            if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
+            {
+                printf("Invalid convergence threshold\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
+        {
+            if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid number of iterations\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
+        {
+            if (++i >= argc || (N = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid matrix order\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
+        {
+            printf("\n");
+            printf("Usage: ./jacobi [OPTIONS]\n\n");
+            printf("Options:\n");
+            printf("  -h  --help               Print this message\n");
+            printf("  -c  --convergence  C     Set convergence threshold\n");
+            printf("  -i  --iterations   I     Set maximum number of iterations\n");
+            printf("  -n  --norder       N     Set maxtrix order (500 or 1000)\n");
+            printf("\n");
+            exit(0);
+        }
+        else
+        {
+            printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
+            exit(1);
+        }
+    }
+
+    // Only the two orders that have a matching input file are accepted
+    const char *path;
+    if (N == 1000)
+        path = "data/jacobi-1000.bin";
+    else if (N == 500)
+        path = "data/jacobi-500.bin";
+    else
+    {
+        printf("Invalid matrix order\n");
+        exit(1);
+    }
+
+    data = fopen(path, "rb");
+    if (data == NULL)
+    {
+        // Fail here instead of letting main() read from a NULL stream
+        perror(path);
+        exit(1);
+    }
+}
--- a/openmp/lab3/.solutions/matmul-omp1.c
+++ b/openmp/lab3/.solutions/matmul-omp1.c
@ -0,0 +1,175 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#include "utils.h"
+
+#ifndef N
+#define N (1 << 11)
+#endif
+
+#pragma omp declare target
+#define SM 64
+
+/*
+ * Pack one SM x SM tile into a dense buffer.
+ *
+ * a: top-left element of the tile inside a matrix whose rows are n floats apart
+ * b: destination, SM * SM floats, rows stored back to back
+ * n: row stride of the source matrix
+ */
+static void reorder2(float *restrict a, float *restrict b, int n)
+{
+    for (int row = 0; row < SM; row++)
+    {
+        const float *src = a + row * n;
+        float *dst = b + row * SM;
+        for (int col = 0; col < SM; col++)
+        {
+            dst[col] = src[col];
+        }
+    }
+}
+
+/*
+ * Multiply-accumulate one SM x SM tile: c += a * b.
+ *
+ * a: tile of the left operand, rows n floats apart
+ * b: tile of the right operand, packed densely (rows SM floats apart)
+ * c: tile of the result, rows n floats apart; updated in place
+ * n: row stride of a and c
+ */
+static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    for (int row = 0; row < SM; row++)
+    {
+        const float *a_row = a + row * n;
+        float *c_row = c + row * n;
+        for (int k = 0; k < SM; k++)
+        {
+            const float *b_row = b + k * SM;
+            for (int col = 0; col < SM; col++)
+            {
+                c_row[col] += a_row[k] * b_row[col];
+            }
+        }
+    }
+}
+
+/*
+ * Blocked matrix multiply-accumulate on the offload device: c += a * b.
+ *
+ * a, b, c: n x n matrices stored row by row
+ * n:       matrix order; only the first (n / SM) * SM rows and columns are
+ *          processed, so n is expected to be a multiple of SM
+ *
+ * Each (i, j) pair owns one SM x SM tile of c. Only these two loops are
+ * distributed: the k loop accumulates into that tile and therefore has to run
+ * sequentially inside one thread, otherwise several threads would update the
+ * same elements of c at the same time.
+ *
+ * NOTE(review): NTHREADS_GPU is not defined in this file (presumably it comes
+ * from utils.h). Every thread keeps a private SM * SM float tile (b2).
+ */
+void gemm_acc(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    int bk = n / SM;
+    // One dist_schedule chunk of NTHREADS_GPU tiles per team, rounded up so
+    // that num_teams() never evaluates to 0.
+    int nteams = (bk * bk + NTHREADS_GPU - 1) / NTHREADS_GPU;
+
+    if (nteams < 1)
+        return; // n < SM: there is no complete tile to compute
+
+    // c is mapped tofrom because the kernel accumulates into it; with a plain
+    // "from" the device copy would start out uninitialised.
+#pragma omp target teams num_teams(nteams) thread_limit(NTHREADS_GPU) \
+    map(to : n, bk, a[0:n * n], b[0:n * n]) map(tofrom : c[0:n * n])
+#pragma omp distribute parallel for num_threads(NTHREADS_GPU) collapse(2) \
+    dist_schedule(static, NTHREADS_GPU)
+    for (int i = 0; i < bk; i++)
+    {
+        for (int j = 0; j < bk; j++)
+        {
+            float b2[SM * SM];
+            for (int k = 0; k < bk; k++)
+            {
+                reorder2(&b[SM * (k * n + j)], b2, n);
+                kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
+            }
+        }
+    }
+}
+
+#pragma omp end declare target
+
+/*
+ * Blocked matrix multiply-accumulate on the host: c += a * b.
+ *
+ * a, b, c: n x n matrices stored row by row
+ * n:       matrix order; tiles are SM x SM, so only the first (n / SM) * SM
+ *          rows and columns take part
+ */
+void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    float tile[SM * SM]; /* densely packed copy of the current tile of b */
+    int nblocks = n / SM;
+
+    for (int bi = 0; bi < nblocks; bi++)
+    {
+        for (int bj = 0; bj < nblocks; bj++)
+        {
+            float *c_blk = &c[SM * (bi * n + bj)];
+            for (int bx = 0; bx < nblocks; bx++)
+            {
+                float *a_blk = &a[SM * (bi * n + bx)];
+                float *b_blk = &b[SM * (bx * n + bj)];
+
+                reorder2(b_blk, tile, n);
+                kernel(a_blk, tile, c_blk, n);
+            }
+        }
+    }
+}
+
+/*
+ * Naive matrix multiply-accumulate on the host: c += a * b.
+ *
+ * a, b, c: n x n matrices stored row by row, the same layout gemm_opt() and
+ *          gemm_acc() use
+ * n:       matrix order
+ */
+void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    for (int i = 0; i < n; ++i)
+    {
+        for (int j = 0; j < n; ++j)
+        {
+            float sum = 0.0;
+            for (int k = 0; k < n; ++k)
+            {
+                // Row i of a times column j of b. The previous indexing
+                // (a[i + k * n], b[k + j * n]) read both operands transposed
+                // while c was written row by row.
+                sum += a[i * n + k] * b[k * n + j];
+            }
+            c[i * n + j] += sum;
+        }
+    }
+}
+
+/* Wall-clock seconds elapsed between two clock_gettime() samples. */
+static double wall_seconds(const struct timespec *t0, const struct timespec *t1)
+{
+    return (t1->tv_sec - t0->tv_sec) + 1.0e-9 * (t1->tv_nsec - t0->tv_nsec);
+}
+
+/*
+ * Benchmark the three matrix-multiply variants and check the accelerator
+ * result against the blocked host result.
+ *
+ * argv[1] (optional): matrix order n, default N. It has to be a positive
+ * multiple of SM because the blocked kernels only process whole tiles.
+ *
+ *   g := a * b   on the host (reference)
+ *   c := a * b   on the accelerator
+ *
+ * Returns 0 on success and EXIT_FAILURE on bad input, allocation failure or
+ * a mismatch between c and g.
+ */
+int main(int argc, char *argv[])
+{
+    int n = N, iret = 0;
+    float *a, *b, *c, *g;
+    struct timespec rt[2];
+    double wt; // walltime
+
+    if (argc > 1)
+        n = atoi(argv[1]);
+    if (n <= 0 || n % SM != 0)
+    {
+        printf("error: matrix order must be a positive multiple of %d\n", SM);
+        return EXIT_FAILURE;
+    }
+
+    size_t count = (size_t)n * (size_t)n;
+
+    /*
+     * 0. prepare a, b, c and g
+     *
+     * c and g are accumulated into, so they have to start at zero.
+     */
+    a = malloc(count * sizeof *a);
+    b = malloc(count * sizeof *b);
+    c = calloc(count, sizeof *c);
+    g = calloc(count, sizeof *g);
+    if (NULL == a)
+        printf("error: memory allocation for 'a'\n");
+    if (NULL == b)
+        printf("error: memory allocation for 'b'\n");
+    if (NULL == c)
+        printf("error: memory allocation for 'c'\n");
+    if (NULL == g)
+        printf("error: memory allocation for 'g'\n");
+    if (NULL == a || NULL == b || NULL == c || NULL == g)
+    {
+        free(a);
+        free(b);
+        free(c);
+        free(g);
+        exit(EXIT_FAILURE);
+    }
+
+    // Small integer inputs: for the checked sizes (n <= 4096) every partial
+    // sum stays below 2^24, so float arithmetic is exact and host and device
+    // results can be compared for equality regardless of rounding or FMA use.
+    for (size_t i = 0; i < count; i++)
+    {
+        a[i] = (float)(i % 3);
+        b[i] = (float)(i % 5) - 2.0f;
+    }
+
+    if (n <= 1024)
+    {
+        clock_gettime(CLOCK_REALTIME, rt + 0);
+        gemm(a, b, g, n);
+        clock_gettime(CLOCK_REALTIME, rt + 1);
+        wt = wall_seconds(rt + 0, rt + 1);
+        printf("gemm on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+
+        // This run is timed only; clear g again so that the reference below
+        // comes from gemm_opt() alone.
+        for (size_t i = 0; i < count; i++)
+            g[i] = 0.0f;
+    }
+
+    if (n <= 4096)
+    {
+        clock_gettime(CLOCK_REALTIME, rt + 0);
+        gemm_opt(a, b, g, n);
+        clock_gettime(CLOCK_REALTIME, rt + 1);
+        wt = wall_seconds(rt + 0, rt + 1);
+        printf("gemm_opt on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+    }
+
+    clock_gettime(CLOCK_REALTIME, rt + 0);
+    gemm_acc(a, b, c, n);
+    clock_gettime(CLOCK_REALTIME, rt + 1);
+    wt = wall_seconds(rt + 0, rt + 1);
+    printf("gemm_acc : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+
+    // The reference g only exists when gemm_opt() ran (n <= 4096)
+    if (n <= 4096)
+    {
+        for (size_t i = 0; i < count; i++)
+        {
+            if (g[i] != c[i])
+            {
+                printf("error: mismatch at element %zu: host %f, accel %f\n", i, g[i], c[i]);
+                iret = -1;
+                break;
+            }
+        }
+    }
+
+    free(a);
+    free(b);
+    free(c);
+    free(g);
+    return 0 != iret ? EXIT_FAILURE : 0;
+}
--- a/openmp/lab3/.solutions/matmul-omp2.c
+++ b/openmp/lab3/.solutions/matmul-omp2.c
@ -0,0 +1,500 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+
+#ifndef N
+#define N (1 << 10)
+#endif
+
+#pragma omp declare target
+#define SM 64
+
+#define NTHRDS7 (1 << 0x7) /* 2^{7}  */
+#define NTHRDS8 (1 << 0x8) /* 2^{8}  */
+#define NTHRDS9 (1 << 0x9) /* 2^{9}  */
+
+#define LTEAMSD (1 << 0xD) /* 2^{13} */
+#define LTEAMSE (1 << 0xE) /* 2^{14} */
+#define LTEAMSF (1 << 0xF) /* 2^{15} */
+#define LTEAMSG (1 << 020) /* 2^{16} */
+
+#define BLKROW (512) /* 4x number of threads in each team */
+#define BLKDIM (16)
+
+void gemm_accel_opt2(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+/*
+ * Accumulates c += a * b for n x n matrices inside an OpenMP target region.
+ * All three arrays are indexed as [column * n + row]; the update applied is
+ *   c[j * n + r] += a[k * n + r] * b[j * n + k]   summed over k.
+ *
+ * Work done by one (j, iblk, i) iteration of the collapsed loop nest:
+ *   - columns j .. j + 3 of c and b;
+ *   - rows iblk * BLKROW + i + {0, 1, 2, 3} * NTHRDS7 of c and a;
+ *   - k advances in steps of 4: each step loads 16 values of b (rb0 .. rbf)
+ *     and 16 values of a (ra0 .. raf) and folds them into the 16
+ *     accumulators rc0 .. rcf, which are written back to c after the k-loop.
+ *
+ * Size requirements visible from the loop bounds:
+ *   - the iblk loop runs n / BLKROW times (integer division), so rows at or
+ *     beyond (n / BLKROW) * BLKROW are never updated, and nothing at all is
+ *     computed when n < BLKROW;
+ *   - j and k step by 4 and index up to j + 3 and k + 3, so n is expected
+ *     to be a multiple of 4.
+ *
+ * NOTE(review): the map clauses on "target teams" repeat those of the
+ * enclosing "target data" region; presumably redundant -- confirm.
+ *
+ * Tuning notes:
+ * - jik-loop
+ * - 2^7 threads per team and 2^13 teams
+ * - collapse(3)
+ * - 4x j-loop unrolling (stride of 1   col )
+ * - 4x i-loop unrolling (stride of 2^7 rows)
+ * - 4x k-loop unrolling
+ * - rb: 4x data re-use
+ * - ra: 4x data re-use
+ * - register blocking
+ */
+#pragma omp target data                           \
+    map(to                                        \
+        : n, a [0:n * n], b [0:n * n]) map(tofrom \
+                                           : c [0:n * n])
+    {
+#pragma omp target teams num_teams(LTEAMSD) thread_limit(NTHRDS7) \
+    map(to                                                        \
+        : n, a [0:n * n], b [0:n * n]) map(tofrom                 \
+                                           : c [0:n * n]) default(none) shared(a, b, c, n)
+#pragma omp distribute parallel for num_threads(NTHRDS7) \
+    dist_schedule(static, NTHRDS7) collapse(3) default(none) shared(a, b, c, n)
+        for (int j = 0; j < n; j += 4)
+        { /* 4x unrolling: columns j .. j + 3 */
+            for (int iblk = 0; iblk < n / BLKROW; ++iblk)
+            {
+                for (int i = 0; i < NTHRDS7; ++i)
+                { /* 4x unrolling: rows i, i + NTHRDS7, i + 2 * NTHRDS7, i + 3 * NTHRDS7 of block iblk */
+                    /* register for c: 4x j-loop * 4x i-loop */
+                    float rc0, rc1, rc2, rc3,
+                        rc4, rc5, rc6, rc7,
+                        rc8, rc9, rca, rcb,
+                        rcc, rcd, rce, rcf;
+                    rc0 = c[j * n + iblk * BLKROW + i];
+                    rc1 = c[j * n + iblk * BLKROW + i + NTHRDS7];
+                    rc2 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 2];
+                    rc3 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 3];
+                    rc4 = c[(j + 1) * n + iblk * BLKROW + i];
+                    rc5 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7];
+                    rc6 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2];
+                    rc7 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3];
+                    rc8 = c[(j + 2) * n + iblk * BLKROW + i];
+                    rc9 = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7];
+                    rca = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2];
+                    rcb = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3];
+                    rcc = c[(j + 3) * n + iblk * BLKROW + i];
+                    rcd = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7];
+                    rce = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2];
+                    rcf = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3];
+                    for (int k = 0; k < n; k += 4)
+                    { /* 4x unrolling: k .. k + 3 */
+                        /* register for b: 4x j-loop * 4x k-loop */
+                        float rb0, rb1, rb2, rb3,
+                            rb4, rb5, rb6, rb7,
+                            rb8, rb9, rba, rbb,
+                            rbc, rbd, rbe, rbf;
+                        rb0 = b[j * n + k];
+                        rb1 = b[j * n + k + 1];
+                        rb2 = b[j * n + k + 2];
+                        rb3 = b[j * n + k + 3];
+                        rb4 = b[(j + 1) * n + k];
+                        rb5 = b[(j + 1) * n + k + 1];
+                        rb6 = b[(j + 1) * n + k + 2];
+                        rb7 = b[(j + 1) * n + k + 3];
+                        rb8 = b[(j + 2) * n + k];
+                        rb9 = b[(j + 2) * n + k + 1];
+                        rba = b[(j + 2) * n + k + 2];
+                        rbb = b[(j + 2) * n + k + 3];
+                        rbc = b[(j + 3) * n + k];
+                        rbd = b[(j + 3) * n + k + 1];
+                        rbe = b[(j + 3) * n + k + 2];
+                        rbf = b[(j + 3) * n + k + 3];
+                        /* register for a: 4x i-loop * 4x k-loop */
+                        float ra0, ra1, ra2, ra3,
+                            ra4, ra5, ra6, ra7,
+                            ra8, ra9, raa, rab,
+                            rac, rad, rae, raf;
+                        ra0 = a[k * n + iblk * BLKROW + i];
+                        ra1 = a[k * n + iblk * BLKROW + i + NTHRDS7];
+                        ra2 = a[k * n + iblk * BLKROW + i + NTHRDS7 * 2];
+                        ra3 = a[k * n + iblk * BLKROW + i + NTHRDS7 * 3];
+                        ra4 = a[(k + 1) * n + iblk * BLKROW + i];
+                        ra5 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7];
+                        ra6 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2];
+                        ra7 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3];
+                        ra8 = a[(k + 2) * n + iblk * BLKROW + i];
+                        ra9 = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7];
+                        raa = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2];
+                        rab = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3];
+                        rac = a[(k + 3) * n + iblk * BLKROW + i];
+                        rad = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7];
+                        rae = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2];
+                        raf = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3];
+                        /*
+     * register blocking: every accumulator receives the four products
+     * of its row of a (k .. k + 3) with its column of b (k .. k + 3)
+     */
+                        // col 1 of c:
+                        rc0 += ra0 * rb0;
+                        rc0 += ra4 * rb1;
+                        rc0 += ra8 * rb2;
+                        rc0 += rac * rb3;
+                        rc1 += ra1 * rb0;
+                        rc1 += ra5 * rb1;
+                        rc1 += ra9 * rb2;
+                        rc1 += rad * rb3;
+                        rc2 += ra2 * rb0;
+                        rc2 += ra6 * rb1;
+                        rc2 += raa * rb2;
+                        rc2 += rae * rb3;
+                        rc3 += ra3 * rb0;
+                        rc3 += ra7 * rb1;
+                        rc3 += rab * rb2;
+                        rc3 += raf * rb3;
+                        // col 2 of c:
+                        rc4 += ra0 * rb4;
+                        rc4 += ra4 * rb5;
+                        rc4 += ra8 * rb6;
+                        rc4 += rac * rb7;
+                        rc5 += ra1 * rb4;
+                        rc5 += ra5 * rb5;
+                        rc5 += ra9 * rb6;
+                        rc5 += rad * rb7;
+                        rc6 += ra2 * rb4;
+                        rc6 += ra6 * rb5;
+                        rc6 += raa * rb6;
+                        rc6 += rae * rb7;
+                        rc7 += ra3 * rb4;
+                        rc7 += ra7 * rb5;
+                        rc7 += rab * rb6;
+                        rc7 += raf * rb7;
+                        // col 3 of c:
+                        rc8 += ra0 * rb8;
+                        rc8 += ra4 * rb9;
+                        rc8 += ra8 * rba;
+                        rc8 += rac * rbb;
+                        rc9 += ra1 * rb8;
+                        rc9 += ra5 * rb9;
+                        rc9 += ra9 * rba;
+                        rc9 += rad * rbb;
+                        rca += ra2 * rb8;
+                        rca += ra6 * rb9;
+                        rca += raa * rba;
+                        rca += rae * rbb;
+                        rcb += ra3 * rb8;
+                        rcb += ra7 * rb9;
+                        rcb += rab * rba;
+                        rcb += raf * rbb;
+                        // col 4 of c:
+                        rcc += ra0 * rbc;
+                        rcc += ra4 * rbd;
+                        rcc += ra8 * rbe;
+                        rcc += rac * rbf;
+                        rcd += ra1 * rbc;
+                        rcd += ra5 * rbd;
+                        rcd += ra9 * rbe;
+                        rcd += rad * rbf;
+                        rce += ra2 * rbc;
+                        rce += ra6 * rbd;
+                        rce += raa * rbe;
+                        rce += rae * rbf;
+                        rcf += ra3 * rbc;
+                        rcf += ra7 * rbd;
+                        rcf += rab * rbe;
+                        rcf += raf * rbf;
+                    }
+                    c[j * n + iblk * BLKROW + i] = rc0;
+                    c[j * n + iblk * BLKROW + i + NTHRDS7] = rc1;
+                    c[j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2;
+                    c[j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3;
+                    c[(j + 1) * n + iblk * BLKROW + i] = rc4;
+                    c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7] = rc5;
+                    c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc6;
+                    c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc7;
+                    c[(j + 2) * n + iblk * BLKROW + i] = rc8;
+                    c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7] = rc9;
+                    c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rca;
+                    c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcb;
+                    c[(j + 3) * n + iblk * BLKROW + i] = rcc;
+                    c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7] = rcd;
+                    c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rce;
+                    c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcf;
+                } /* end i-loop */
+            }     /* end iblk-loop */
+        }         /* end j-loop */
+    }
+}
+
+/**
+ * @brief Computes c := a * b + c on the GPU with cuBLAS (cublasSgemm).
+ *
+ * The three n x n matrices are copied into freshly allocated device buffers,
+ * multiplied with alpha = beta = 1 and no transposition (leading dimension
+ * n), and the result is copied back into @p c. If any step fails, a message
+ * is printed, the device resources are released and the process terminates
+ * with EXIT_FAILURE.
+ *
+ * @param a host matrix a, n * n floats
+ * @param b host matrix b, n * n floats
+ * @param c host matrix c, n * n floats; used as accumulator and overwritten
+ *          with the result
+ * @param n matrix order
+ */
+void gemm_cublas(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    cublasHandle_t handle;
+    float alfa = 1.0f,
+          beta = 1.0f,
+          *a_dev = NULL,
+          *b_dev = NULL,
+          *c_dev = NULL;
+    const char *errmsg = NULL; /* message of the first step that failed */
+
+    if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle))
+    {
+        /* The handle was never created, so it must not be passed to cublasDestroy. */
+        printf("error: initialization (CUBLAS)\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /*
+     * The else-if chain runs each step only when every previous one
+     * succeeded; the first failure records its message and skips the rest.
+     */
+    if (cudaSuccess != cudaMalloc((void **)&a_dev, sizeof(*a) * n * n) ||
+        cudaSuccess != cudaMalloc((void **)&b_dev, sizeof(*b) * n * n) ||
+        cudaSuccess != cudaMalloc((void **)&c_dev, sizeof(*c) * n * n))
+    {
+        errmsg = "error: memory allocation (CUDA)\n";
+    }
+    else if (CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*a), a, n, a_dev, n) ||
+             CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*b), b, n, b_dev, n) ||
+             CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*c), c, n, c_dev, n))
+    {
+        errmsg = "error: host --> accl (CUBLAS)\n";
+    }
+    else if (CUBLAS_STATUS_SUCCESS != cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                                                  n, n, n, &alfa, a_dev, n, b_dev, n, &beta, c_dev, n))
+    {
+        errmsg = "error: cublasSgemm (CUBLAS)\n";
+    }
+    else if (cudaSuccess != cudaDeviceSynchronize())
+    {
+        errmsg = "error: device synchronization (CUDA)\n";
+    }
+    else if (CUBLAS_STATUS_SUCCESS != cublasGetMatrix(n, n, sizeof(*c), c_dev, n, c, n))
+    {
+        errmsg = "error: accl --> host (CUBLAS)\n";
+    }
+
+    if (NULL != errmsg)
+        printf("%s", errmsg);
+
+    /* Single cleanup path; buffers that were never allocated are still NULL. */
+    cudaFree(a_dev);
+    cudaFree(b_dev);
+    cudaFree(c_dev);
+    cublasDestroy(handle);
+
+    if (NULL != errmsg)
+        exit(EXIT_FAILURE);
+}
+
+/*
+ * Packs one SM x SM tile: row r of the tile is read from a + r * n (row
+ * stride n) and stored contiguously at b + r * SM.
+ */
+static void reorder2(float *restrict a, float *restrict b, int n)
+{
+    for (int row = 0; row < SM; row++)
+    {
+        const float *src = a + row * n;
+        float *dst = b + row * SM;
+
+        for (int col = 0; col < SM; col++)
+            dst[col] = src[col];
+    }
+}
+
+/*
+ * Multiply-accumulate of one SM x SM tile: c_tile += a_tile * b_tile.
+ * a and c point at tiles stored with row stride n; b is a packed tile with
+ * row stride SM.
+ */
+static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    for (int row = 0; row < SM; row++)
+    {
+        const float *a_row = a + row * n;
+        float *c_row = c + row * n;
+
+        for (int inner = 0; inner < SM; inner++)
+        {
+            const float a_val = a_row[inner];
+            const float *b_row = b + inner * SM;
+
+            for (int col = 0; col < SM; col++)
+                c_row[col] += a_val * b_row[col];
+        }
+    }
+}
+
+/**
+ * @brief Tiled c += a * b on the accelerator, using SM x SM tiles.
+ *
+ * For every tile (i, j) of c the k-loop packs the matching tile of b with
+ * reorder2() and accumulates a_tile * b_tile into the c tile with kernel().
+ * Only n / SM full tiles per dimension are processed (integer division), so
+ * n is expected to be a multiple of SM.
+ *
+ * @param a input matrix, n * n floats
+ * @param b input matrix, n * n floats
+ * @param c accumulator / result matrix, n * n floats
+ * @param n matrix order
+ */
+void gemm_accel_opt(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    /*
+     * Only the i and j tile loops are collapsed and distributed. The k-loop
+     * stays sequential inside each (i, j) iteration because every k adds
+     * into the same tile of c; sharing it among threads would be a data race.
+     * c is mapped tofrom because kernel() reads it before writing it.
+     */
+#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1) map(to : n, a [0:n * n], b [0:n * n]) map(tofrom : c [0:n * n])
+    for (int i = 0; i < n / SM; i++)
+    {
+        for (int j = 0; j < n / SM; j++)
+        {
+            for (int k = 0; k < n / SM; k++)
+            {
+                float b2[SM * SM]; /* packed copy of tile (k, j) of b */
+                reorder2(&b[SM * (k * n + j)], b2, n);
+                kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
+            }
+        }
+    }
+}
+
+#pragma omp end declare target
+
+/**
+ * @brief Tiled c += a * b on the host, using SM x SM tiles and OpenMP threads.
+ *
+ * Each thread owns a private packing buffer b2. For every tile (i, j) of c
+ * the k-loop packs tile (k, j) of b with reorder2() and accumulates
+ * a_tile * b_tile into the c tile with kernel(). Only n / SM full tiles per
+ * dimension are processed (integer division), so n is expected to be a
+ * multiple of SM.
+ *
+ * @param a input matrix, n * n floats
+ * @param b input matrix, n * n floats
+ * @param c accumulator / result matrix, n * n floats
+ * @param n matrix order
+ */
+void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    int bk = n / SM; /* number of full tiles per dimension */
+#pragma omp parallel
+    {
+        float b2[SM * SM]; /* per-thread packed tile of b */
+        /*
+         * Only i and j are collapsed: all k iterations of one (i, j) pair
+         * accumulate into the same tile of c and must run on one thread.
+         */
+#pragma omp for collapse(2)
+        for (int i = 0; i < bk; i++)
+        {
+            for (int j = 0; j < bk; j++)
+            {
+                for (int k = 0; k < bk; k++)
+                {
+                    reorder2(&b[SM * (k * n + j)], b2, n);
+                    kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * @brief Reference matrix multiply-accumulate on the host.
+ *
+ * For every (i, j) the dot product of a[i + k * n] and b[k + j * n] over k
+ * is added to c[i * n + j].
+ *
+ * NOTE(review): a and b are read as [col * n + row] while c is written as
+ * [row * n + col]; the other variants in this file write c[j * n + i] --
+ * confirm which layout is intended for c.
+ *
+ * @param a input matrix, n * n floats
+ * @param b input matrix, n * n floats
+ * @param c accumulator / result matrix, n * n floats
+ * @param n matrix order
+ */
+void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+#pragma omp parallel for simd collapse(2) schedule(simd : static)
+    for (int i = 0; i < n; ++i)
+    {
+        for (int j = 0; j < n; ++j)
+        {
+            float sum = 0.0f;
+            for (int k = 0; k < n; ++k)
+            {
+                sum += a[i + k * n] * b[k + j * n];
+            }
+            c[i * n + j] += sum;
+        }
+    }
+}
+
+/* Raw bit pattern of a float, read through a union rather than a pointer cast. */
+static int float_bits(float v)
+{
+    union
+    {
+        float f;
+        int i;
+    } pun;
+
+    pun.f = v;
+    return pun.i;
+}
+
+/* Seconds elapsed between two clock_gettime() samples. */
+static double elapsed_sec(const struct timespec *t0, const struct timespec *t1)
+{
+    return (t1->tv_sec - t0->tv_sec) + 1.0e-9 * (t1->tv_nsec - t0->tv_nsec);
+}
+
+/*
+ * Asserts that the first n floats of g and c are bitwise identical.
+ * NOTE(review): only n of the n * n elements are compared -- confirm whether
+ * the full matrix was intended.
+ */
+static void check_first_n(const float *g, const float *c, int n)
+{
+    for (int i = 0; i < n; i++)
+    {
+        int diff = float_bits(g[i]) ^ float_bits(c[i]);
+        assert(diff == 0);
+        (void)diff; /* keeps NDEBUG builds free of unused-variable warnings */
+    }
+}
+
+/**
+ * Benchmark driver: times the host, OpenMP-offload and cuBLAS matrix
+ * multiplications on n x n matrices and compares c with g after the
+ * accelerator runs.
+ *
+ * Usage: prog [n]   (n defaults to N)
+ */
+int main(int argc, char *argv[])
+{
+    int n = N,
+        iret = 0;
+    float *a, *b, *c, *g;
+    struct timespec rt[2];
+    double wt; // walltime
+
+    if (argc > 1)
+        n = atoi(argv[1]);
+
+    /*
+     * gemm_accel_opt2 indexes columns j .. j + 3 and k .. k + 3 as soon as
+     * at least one BLKROW block exists, so such sizes must be multiples of 4.
+     */
+    if (n <= 0 || (n >= BLKROW && n % 4 != 0))
+    {
+        printf("error: unsupported matrix order %d\n", n);
+        exit(EXIT_FAILURE);
+    }
+    if (n % BLKROW != 0)
+        printf("warning: n = %d is not a multiple of %d, the blocked kernels skip the remainder\n", n, BLKROW);
+
+    /*
+     * 0. prepare a, b, c and g
+     *
+     * calloc gives all four matrices a defined (all-zero) content: none of
+     * them is filled in afterwards, and g is only ever read by the checks.
+     * NOTE(review): with all-zero inputs the checks are trivially satisfied;
+     * real input data and a reference result in g are presumably intended.
+     */
+    const size_t count = (size_t)n * (size_t)n;
+
+    if (NULL == (a = (float *)calloc(count, sizeof(*a))))
+    {
+        printf("error: memory allocation for 'a'\n");
+        iret = -1;
+    }
+    if (NULL == (b = (float *)calloc(count, sizeof(*b))))
+    {
+        printf("error: memory allocation for 'b'\n");
+        iret = -1;
+    }
+    if (NULL == (c = (float *)calloc(count, sizeof(*c))))
+    {
+        printf("error: memory allocation for 'c'\n");
+        iret = -1;
+    }
+    if (NULL == (g = (float *)calloc(count, sizeof(*g))))
+    {
+        printf("error: memory allocation for 'g'\n");
+        iret = -1;
+    }
+
+    if (0 != iret)
+    {
+        free(a);
+        free(b);
+        free(c);
+        free(g);
+        exit(EXIT_FAILURE);
+    }
+
+    /* 1. plain host version (only for small problems) */
+    if (n <= 1024)
+    {
+        clock_gettime(CLOCK_REALTIME, rt + 0);
+        gemm(a, b, c, n);
+        clock_gettime(CLOCK_REALTIME, rt + 1);
+        wt = elapsed_sec(rt + 0, rt + 1);
+        printf("gemm on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+    }
+
+    /* 2. tiled host version */
+    if (n <= 4096)
+    {
+        clock_gettime(CLOCK_REALTIME, rt + 0);
+        gemm_opt(a, b, c, n);
+        clock_gettime(CLOCK_REALTIME, rt + 1);
+        wt = elapsed_sec(rt + 0, rt + 1);
+        printf("gemm_opt on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+    }
+
+#if 0
+#pragma omp target teams distribute parallel for map(to                                   \
+                                                     : a [0:n * n], b [0:n * n]) map(from \
+                                                                                     : c [0:n * n]) collapse(2)
+        for(int i = 0; i < n; ++i){
+            for(int j = 0; j < n; ++j){
+                float sum = 0.0;
+                for(int k = 0; k < n; ++k){
+
+                    sum += a[i+k*n]*b[k+j*n];
+                }
+                c[i*n+j] += sum;
+            }
+        }
+#endif
+
+    /* 3. tiled accelerator version */
+    if (n <= 4096)
+    {
+        clock_gettime(CLOCK_REALTIME, rt + 0);
+        gemm_accel_opt(a, b, c, n);
+        clock_gettime(CLOCK_REALTIME, rt + 1);
+        wt = elapsed_sec(rt + 0, rt + 1);
+        printf("GEMM-opt1 on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+
+        check_first_n(g, c, n);
+    }
+
+    /* 4. register-blocked accelerator version */
+    clock_gettime(CLOCK_REALTIME, rt + 0);
+    gemm_accel_opt2(a, b, c, n);
+    clock_gettime(CLOCK_REALTIME, rt + 1);
+    wt = elapsed_sec(rt + 0, rt + 1);
+    printf("GEMM-opt2 on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+
+    if (n <= 4096)
+        check_first_n(g, c, n);
+
+    /* 5. cuBLAS */
+    clock_gettime(CLOCK_REALTIME, rt + 0);
+    gemm_cublas(a, b, c, n);
+    clock_gettime(CLOCK_REALTIME, rt + 1);
+    wt = elapsed_sec(rt + 0, rt + 1);
+    printf("CUBLAS on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+
+    if (n <= 4096)
+        check_first_n(g, c, n);
+
+    free(a);
+    free(b);
+    free(c);
+    free(g);
+
+    return 0;
+}
--- a/openmp/lab3/.solutions/saxpy-omp1.c
+++ b/openmp/lab3/.solutions/saxpy-omp1.c
@ -0,0 +1,122 @@
+/**
+ * @file saxpy.c
+ *
+ * @brief saxpy performs the \c axpy computation in single-precision on both
+ * host and accelerator. The performance (in MFLOPS) on host and accelerator is
+ * compared and the numerical results are also verified for consistency.
+ *
+ * The \c axpy computation is defined as:
+ *
+ * y := a * x + y
+ *
+ * where:
+ *
+ * - a is a scalar.
+ * - x and y are vectors each with n elements.
+ *
+ * Please note that in this version only <em>one GPU thread</em> is used.
+ *
+ * Offload to GPU:
+ *
+ * gcc -fopenmp -foffload=nvptx-none saxpy.c
+ *
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#include "utils.h"
+
+#define TWO02 (1 << 2)
+#define TWO04 (1 << 4)
+#define TWO08 (1 << 8)
+#ifndef N
+#define N (1 << 20)
+#endif
+
+/**
+ * saxpy driver: runs y := a * x + y on the host and z := a * x + z inside a
+ * plain "omp target" region, prints the timing of both and asserts that the
+ * two results are bitwise identical.
+ *
+ * Usage: prog [n]   (n defaults to N)
+ */
+int main(int argc, char *argv[])
+{
+  int i, n = N,
+         iret = 0;
+  float a = 101.0f / TWO02,
+        b, c,
+        *x, *y, *z;
+  struct timespec rt[2];
+  double wt; // walltime
+
+  if (argc > 1)
+    n = atoi(argv[1]);
+  if (n <= 0)
+  {
+    printf("error: the vector length must be positive\n");
+    exit(EXIT_FAILURE);
+  }
+
+  /*
+   * 0. prepare x, y, and z
+   *
+   * y := a * x + y (on host)
+   * z := a * x + z (on accel)
+   */
+  if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
+  {
+    printf("error: memory allocation for 'x'\n");
+    iret = -1;
+  }
+  if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
+  {
+    printf("error: memory allocation for 'y'\n");
+    iret = -1;
+  }
+  if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
+  {
+    printf("error: memory allocation for 'z'\n");
+    iret = -1;
+  }
+  if (0 != iret)
+  {
+    free(x);
+    free(y);
+    free(z);
+    exit(EXIT_FAILURE);
+  }
+  /* every element of x gets the same value, and y and z start out equal */
+  b = rand() % TWO04;
+  c = rand() % TWO08;
+  for (i = 0; i < n; i++)
+  {
+    x[i] = b / (float)TWO02;
+    y[i] = z[i] = c / (float)TWO04;
+  }
+  /*
+   * 1. saxpy on host
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+  for (i = 0; i < n; i++)
+  {
+    y[i] = a * x[i] + y[i];
+  }
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+  /*
+   * 2. saxpy on accel
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+#pragma omp target map(to : a, n, x [0:n]) map(tofrom : z [0:n])
+  for (int i = 0; i < n; i++)
+  {
+    z[i] = a * x[i] + z[i];
+  }
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+  /*
+   * 3. verify numerical consistency
+   */
+  for (i = 0; i < n; i++)
+  {
+    /* compare raw bit patterns through a union, not through an int* cast */
+    union
+    {
+      float f;
+      int bits;
+    } host = {.f = y[i]}, accel = {.f = z[i]};
+
+    iret = host.bits ^ accel.bits;
+    assert(iret == 0);
+  }
+  free(x);
+  free(y);
+  free(z);
+  return 0;
+}
--- a/openmp/lab3/.solutions/saxpy-omp2.c
+++ b/openmp/lab3/.solutions/saxpy-omp2.c
@ -0,0 +1,122 @@
+/**
+ * @file saxpy.c
+ *
+ * @brief saxpy performs the \c axpy computation in single-precision on both
+ * host and accelerator. The performance (in MFLOPS) on host and accelerator is
+ * compared and the numerical results are also verified for consistency.
+ *
+ * The \c axpy computation is defined as:
+ *
+ * y := a * x + y
+ *
+ * where:
+ *
+ * - a is a scalar.
+ * - x and y are vectors each with n elements.
+ *
+ * Please note that in this version the loop is shared among the threads of
+ * a single parallel region on the accelerator (<em>target parallel for</em>).
+ *
+ * Offload to GPU:
+ *
+ * gcc -fopenmp -foffload=nvptx-none saxpy.c
+ *
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#include "utils.h"
+
+#define TWO02 (1 << 2)
+#define TWO04 (1 << 4)
+#define TWO08 (1 << 8)
+#ifndef N
+#define N (1 << 20)
+#endif
+
+/**
+ * saxpy driver: runs y := a * x + y on the host and z := a * x + z inside an
+ * "omp target parallel for" region, prints the timing of both and asserts
+ * that the two results are bitwise identical.
+ *
+ * Usage: prog [n]   (n defaults to N)
+ */
+int main(int argc, char *argv[])
+{
+  int i, n = N,
+         iret = 0;
+  float a = 101.0f / TWO02,
+        b, c,
+        *x, *y, *z;
+  struct timespec rt[2];
+  double wt; // walltime
+
+  if (argc > 1)
+    n = atoi(argv[1]);
+  if (n <= 0)
+  {
+    printf("error: the vector length must be positive\n");
+    exit(EXIT_FAILURE);
+  }
+
+  /*
+   * 0. prepare x, y, and z
+   *
+   * y := a * x + y (on host)
+   * z := a * x + z (on accel)
+   */
+  if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
+  {
+    printf("error: memory allocation for 'x'\n");
+    iret = -1;
+  }
+  if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
+  {
+    printf("error: memory allocation for 'y'\n");
+    iret = -1;
+  }
+  if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
+  {
+    printf("error: memory allocation for 'z'\n");
+    iret = -1;
+  }
+  if (0 != iret)
+  {
+    free(x);
+    free(y);
+    free(z);
+    exit(EXIT_FAILURE);
+  }
+  /* every element of x gets the same value, and y and z start out equal */
+  b = rand() % TWO04;
+  c = rand() % TWO08;
+  for (i = 0; i < n; i++)
+  {
+    x[i] = b / (float)TWO02;
+    y[i] = z[i] = c / (float)TWO04;
+  }
+  /*
+   * 1. saxpy on host
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+  for (i = 0; i < n; i++)
+  {
+    y[i] = a * x[i] + y[i];
+  }
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+  /*
+   * 2. saxpy on accel
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+#pragma omp target parallel for map(to : a, n, x [0:n]) map(tofrom : z [0:n])
+  for (int i = 0; i < n; i++)
+  {
+    z[i] = a * x[i] + z[i];
+  }
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+  /*
+   * 3. verify numerical consistency
+   */
+  for (i = 0; i < n; i++)
+  {
+    /* compare raw bit patterns through a union, not through an int* cast */
+    union
+    {
+      float f;
+      int bits;
+    } host = {.f = y[i]}, accel = {.f = z[i]};
+
+    iret = host.bits ^ accel.bits;
+    assert(iret == 0);
+  }
+  free(x);
+  free(y);
+  free(z);
+  return 0;
+}
--- a/openmp/lab3/.solutions/saxpy-omp3.c
+++ b/openmp/lab3/.solutions/saxpy-omp3.c
@ -0,0 +1,129 @@
+/**
+ * @file saxpy.c
+ *
+ * @brief saxpy performs the \c axpy computation in single-precision on both
+ * host and accelerator. The performance (in MFLOPS) on host and accelerator is
+ * compared and the numerical results are also verified for consistency.
+ *
+ * The \c axpy computation is defined as:
+ *
+ * y := a * x + y
+ *
+ * where:
+ *
+ * - a is a scalar.
+ * - x and y are vectors each with n elements.
+ *
+ * Please note that in this version the loop is distributed across several
+ * teams of GPU threads (<em>target teams</em> with
+ * <em>distribute parallel for</em>).
+ *
+ * Offload to GPU:
+ *
+ * gcc -fopenmp -foffload=nvptx-none saxpy.c
+ *
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#include "utils.h"
+
+#define TWO02 (1 << 2)
+#define TWO04 (1 << 4)
+#define TWO08 (1 << 8)
+#ifndef N
+#define N (1 << 27)
+#endif
+
+/**
+ * saxpy driver: runs y := a * x + y on the host and z := a * x + z on the
+ * accelerator with "target teams" + "distribute parallel for", prints the
+ * timing of both and asserts that the two results are bitwise identical.
+ *
+ * Usage: prog [n]   (n defaults to N)
+ */
+int main(int argc, char *argv[])
+{
+  int i, n = N,
+         iret = 0;
+  float a = 101.0f / TWO02,
+        b, c,
+        *x, *y, *z;
+  struct timespec rt[2];
+  double wt; // walltime
+
+  if (argc > 1)
+    n = atoi(argv[1]);
+  if (n <= 0)
+  {
+    printf("error: the vector length must be positive\n");
+    exit(EXIT_FAILURE);
+  }
+
+  /*
+   * 0. prepare x, y, and z
+   *
+   * y := a * x + y (on host)
+   * z := a * x + z (on accel)
+   */
+  if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
+  {
+    printf("error: memory allocation for 'x'\n");
+    iret = -1;
+  }
+  if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
+  {
+    printf("error: memory allocation for 'y'\n");
+    iret = -1;
+  }
+  if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
+  {
+    printf("error: memory allocation for 'z'\n");
+    iret = -1;
+  }
+  if (0 != iret)
+  {
+    free(x);
+    free(y);
+    free(z);
+    exit(EXIT_FAILURE);
+  }
+  /* every element of x gets the same value, and y and z start out equal */
+  b = rand() % TWO04;
+  c = rand() % TWO08;
+  for (i = 0; i < n; i++)
+  {
+    x[i] = b / (float)TWO02;
+    y[i] = z[i] = c / (float)TWO04;
+  }
+  /*
+   * 1. saxpy on host
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+  for (i = 0; i < n; i++)
+  {
+    y[i] = a * x[i] + y[i];
+  }
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+  /*
+   * 2. saxpy on accel
+   *
+   * The team count is rounded up so that it stays positive when n is
+   * smaller than NTHREADS_GPU and so that a partial last chunk gets a team.
+   * NTHREADS_GPU is not defined in this file; presumably it comes from
+   * utils.h -- confirm.
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+#pragma omp target data map(to : a, n, x [0:n]) map(tofrom : z [0:n])
+#pragma omp target teams num_teams((n + (NTHREADS_GPU) - 1) / (NTHREADS_GPU)) thread_limit(NTHREADS_GPU) map(to : a, n, x [0:n]) map(tofrom : z [0:n])
+#pragma omp distribute parallel for num_threads(NTHREADS_GPU) dist_schedule(static, NTHREADS_GPU)
+  for (int i = 0; i < n; i++)
+  {
+    z[i] = a * x[i] + z[i];
+  }
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+  /*
+   * 3. verify numerical consistency
+   */
+  for (i = 0; i < n; i++)
+  {
+    /* compare raw bit patterns through a union, not through an int* cast */
+    union
+    {
+      float f;
+      int bits;
+    } host = {.f = y[i]}, accel = {.f = z[i]};
+
+    iret = host.bits ^ accel.bits;
+    assert(iret == 0);
+  }
+  free(x);
+  free(y);
+  free(z);
+  return 0;
+}
--- a/openmp/lab3/.solutions/saxpy-omp4.c
+++ b/openmp/lab3/.solutions/saxpy-omp4.c
@ -0,0 +1,128 @@
+/**
+ * @file saxpy.c
+ *
+ * @brief saxpy performs the \c axpy computation in single-precision on both
+ * host and accelerator. The performance (in MFLOPS) on host and accelerator is
+ * compared and the numerical results are also verified for consistency.
+ *
+ * The \c axpy computation is defined as:
+ *
+ * y := a * x + y
+ *
+ * where:
+ *
+ * - a is a scalar.
+ * - x and y are vectors each with n elements.
+ *
+ * Please note that in this version the vectors are split into blocks, each
+ * offloaded as an asynchronous (<em>nowait</em>) target region.
+ *
+ * Offload to GPU:
+ *
+ * gcc -fopenmp -foffload=nvptx-none saxpy.c
+ *
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#include "utils.h"
+
+#define TWO02 (1 << 2)
+#define TWO04 (1 << 4)
+#define TWO08 (1 << 8)
+#ifndef N
+#define N (1 << 27)
+#endif
+
+/**
+ * saxpy driver: runs y := a * x + y on the host and z := a * x + z on the
+ * accelerator, split into blocks that are offloaded as "nowait" target
+ * regions and joined with a taskwait. Prints the timing of both and asserts
+ * that the two results are bitwise identical.
+ *
+ * Usage: prog [n]   (n defaults to N)
+ */
+int main(int argc, char *argv[])
+{
+  int i, n = N,
+         iret = 0;
+  float a = 101.0f / TWO02,
+        b, c,
+        *x, *y, *z;
+  struct timespec rt[2];
+  double wt; // walltime
+
+  if (argc > 1)
+    n = atoi(argv[1]);
+  if (n <= 0)
+  {
+    printf("error: the vector length must be positive\n");
+    exit(EXIT_FAILURE);
+  }
+
+  /*
+   * 0. prepare x, y, and z
+   *
+   * y := a * x + y (on host)
+   * z := a * x + z (on accel)
+   */
+  if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
+  {
+    printf("error: memory allocation for 'x'\n");
+    iret = -1;
+  }
+  if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
+  {
+    printf("error: memory allocation for 'y'\n");
+    iret = -1;
+  }
+  if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
+  {
+    printf("error: memory allocation for 'z'\n");
+    iret = -1;
+  }
+  if (0 != iret)
+  {
+    free(x);
+    free(y);
+    free(z);
+    exit(EXIT_FAILURE);
+  }
+  /* every element of x gets the same value, and y and z start out equal */
+  b = rand() % TWO04;
+  c = rand() % TWO08;
+  for (i = 0; i < n; i++)
+  {
+    x[i] = b / (float)TWO02;
+    y[i] = z[i] = c / (float)TWO04;
+  }
+  /*
+   * 1. saxpy on host
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+  for (i = 0; i < n; i++)
+  {
+    y[i] = a * x[i] + y[i];
+  }
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+
+  /*
+   * 2. saxpy on accel
+   *
+   * The vectors are cut into at most 8 blocks. The block size is rounded up
+   * so that it is never 0 (the stride of the outer loop), and the last
+   * block is shortened so that it never reaches past element n - 1.
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+  int BLOCK = (n + 7) / 8;
+
+  for (int i = 0; i < n; i += BLOCK)
+  {
+    int len = (n - i < BLOCK) ? (n - i) : BLOCK;
+#pragma omp target teams distribute parallel for map(to : a, x [i:len]) map(tofrom : z [i:len]) nowait
+    for (int ii = 0; ii < len; ii++)
+    {
+      z[i + ii] = a * x[i + ii] + z[i + ii];
+    }
+  }
+#pragma omp taskwait
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+
+  /*
+   * 3. verify numerical consistency
+   */
+  for (i = 0; i < n; i++)
+  {
+    /* compare raw bit patterns through a union, not through an int* cast */
+    union
+    {
+      float f;
+      int bits;
+    } host = {.f = y[i]}, accel = {.f = z[i]};
+
+    iret = host.bits ^ accel.bits;
+    assert(iret == 0);
+  }
+  free(x);
+  free(y);
+  free(z);
+  return 0;
+}
--- a/openmp/lab3/Makefile
+++ b/openmp/lab3/Makefile
@ -0,0 +1,32 @@
+# Builds one exercise (selected with EXERCISE=<file>.c) together with utils.c
+# into <file>.exe, using clang with OpenMP offloading to Nvidia GPUs.
+ifndef EXERCISE
+EXERCISE=exercise1.c
+endif
+
+CC=clang
+LD=ld
+OBJDUMP=objdump
+
+OPT=-O3 -g
+OMP=-fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
+# EXT_CFLAGS / EXT_LDFLAGS can be set on the command line to add extra flags.
+CFLAGS=$(OPT) $(OMP) -I. $(EXT_CFLAGS)
+LDFLAGS=-lm $(EXT_LDFLAGS)
+
+SRCS=utils.c
+OBJS=$(SRCS:.c=.o) $(EXERCISE:.c=.o)
+EXE=$(EXERCISE:.c=.exe)
+
+$(EXE):	$(OBJS)
+	$(CC) $(CFLAGS) $(OBJS) -o $@ $(LDFLAGS)
+
+all: $(EXE)
+
+# None of these targets produces a file of the same name.
+.PHONY: all run profile clean
+run: $(EXE)
+	./$(EXE)
+
+profile: $(EXE)
+	sudo LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/ext/lib:${LD_LIBRARY_PATH} LIBRARY_PATH=/usr/ext/lib:${LIBRARY_PATH} nvprof ./$(EXE)
+
+clean:
+	rm -f $(OBJS) *.o *.exe *.out *~
+
+
--- a/openmp/lab3/data/jacobi-1000.bin
+++ b/openmp/lab3/data/jacobi-1000.bin
--- a/openmp/lab3/data/jacobi-500.bin
+++ b/openmp/lab3/data/jacobi-500.bin
--- a/openmp/lab3/jacobi.c
+++ b/openmp/lab3/jacobi.c
@ -0,0 +1,279 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @file jacobi.c
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief This code solves the steady state heat equation on a rectangular region.
+ * This code solves the steady state heat equation on a rectangular region.
+ *  The sequential version of this program needs approximately
+ *  18/epsilon iterations to complete. 
+ *  The physical region, and the boundary conditions, are suggested
+ *  by this diagram;
+ *                 W = 0
+ *           +------------------+
+ *           |                  |
+ *  W = 100  |                  | W = 100
+ *           |                  |
+ *           +------------------+
+ *                 W = 100
+ *  The region is covered with a grid of M by N nodes, and an N by N
+ *  array W is used to record the temperature.  The correspondence between
+ *  array indices and locations in the region is suggested by giving the
+ *  indices of the four corners:
+ *                I = 0
+ *        [0][0]-------------[0][N-1]
+ *           |                  |
+ *    J = 0  |                  |  J = N-1
+ *           |                  |
+ *      [M-1][0]-----------[M-1][N-1]
+ *                I = M-1
+ *  The steady state solution to the discrete heat equation satisfies the
+ *  following condition at an interior grid point:
+ *    W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  where "Central" is the index of the grid point, "North" is the index
+ *  of its immediate neighbor to the "north", and so on.
+ * 
+ *  Given an approximate solution of the steady state heat equation, a
+ *  "better" solution is given by replacing each interior point by the
+ *  average of its 4 neighbors - in other words, by using the condition
+ *  as an ASSIGNMENT statement:
+ *    W[Central]  <=  (1/4) * ( W[North] + W[South] + W[East] + W[West] )
+ *  If this process is repeated often enough, the difference between successive 
+ *  estimates of the solution will go to zero.
+ *  This program carries out such an iteration, using a tolerance specified by
+ *  the user, and writes the final estimate of the solution to a file that can
+ *  be used for graphic processing.
+ * Licensing:
+ *  This code is distributed under the GNU LGPL license. 
+ * Modified:
+ *  18 October 2011
+ * Author:
+ *  Original C version by Michael Quinn.
+ *  This C version by John Burkardt.
+ * Reference:
+ *  Michael Quinn,
+ *  Parallel Programming in C with MPI and OpenMP,
+ *  McGraw-Hill, 2004,
+ *  ISBN13: 978-0071232654,
+ *  LC: QA76.73.C15.Q55.
+ * Local parameters:
+ *  Local, double DIFF, the norm of the change in the solution from one iteration
+ *  to the next.
+ *  Local, double MEAN, the average of the boundary values, used to initialize
+ *  the values of the solution in the interior.
+ *  Local, double U[M][N], the solution at the previous iteration.
+ *  Local, double W[M][N], the solution computed at the latest iteration.
+ * 
+ * 
+ * @see https://en.wikipedia.org/wiki/Jacobi_method
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+
+#include <errno.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "utils.h"
+
+static int N;
+static int MAX_ITERATIONS;
+static int SEED;
+static double CONVERGENCE_THRESHOLD;
+static FILE *data;
+
+#define SEPARATOR "------------------------------------\n"
+
+// Return the current time in seconds since the Epoch
+double get_timestamp();
+
+// Parse command line arguments to set solver parameters
+void parse_arguments(int argc, char *argv[]);
+
+// Run the Jacobi solver on the N x N grid stored row-major in A.
+//
+// Each sweep writes the 4-neighbour average of every interior point of A into
+// xtmp, records the largest absolute change in err, then copies the interior
+// of xtmp back into A. Sweeps repeat while err > CONVERGENCE_THRESHOLD and
+// fewer than MAX_ITERATIONS sweeps have been done.
+//
+// A     grid values, updated in place; boundary rows/columns are only read
+// xtmp  scratch buffer of N * N doubles; only interior entries are touched
+//
+// Returns the number of iterations performed
+int run(double *A, double *xtmp)
+{
+    int iter = 0;
+#ifdef DEBUG
+    // Next iteration count at which progress is printed (doubles each time).
+    int iterations_print = 1;
+#endif
+    double err = 0.0;
+
+    do
+    {
+        err = 0.0;
+#pragma omp parallel for reduction(max : err) num_threads(NTHREADS)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
+                err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
+            }
+        }
+
+        // Copy back the interior only. The sweep above never writes the
+        // boundary entries of xtmp, so copying the full grid would overwrite
+        // the boundary conditions stored in A with whatever xtmp contained.
+#pragma omp parallel for num_threads(NTHREADS)
+        for (int i = 1; i < N - 1; i++)
+        {
+            for (int j = 1; j < N - 1; j++)
+            {
+                A[i * N + j] = xtmp[i * N + j];
+            }
+        }
+        iter++;
+
+#ifdef DEBUG
+        if (iter == iterations_print)
+        {
+            printf("  %8d  %f\n", iter, err);
+            iterations_print = 2 * iterations_print;
+        }
+#endif
+    } while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
+
+    return iter;
+}
+
+// Entry point: parse the options, load the N x N input grid from the data
+// file selected by parse_arguments(), run the solver and report iteration
+// count and runtime. Returns 0 on success, 1 on any setup failure.
+int main(int argc, char *argv[])
+{
+    parse_arguments(argc, argv);
+
+    // parse_arguments() stores the fopen() result in `data`; stop here rather
+    // than handing a NULL stream to fread().
+    if (data == NULL)
+    {
+        printf("Unable to open the input data file\n");
+        return 1;
+    }
+
+    // Compute the element count in size_t so N * N cannot overflow int.
+    const size_t count = (size_t)N * (size_t)N;
+    double *A = malloc(count * sizeof(*A));
+    double *xtmp = malloc(count * sizeof(*xtmp));
+    if (A == NULL || xtmp == NULL)
+    {
+        printf("Unable to allocate the %dx%d matrices\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return 1;
+    }
+
+    printf(SEPARATOR);
+    printf("Matrix size:            %dx%d\n", N, N);
+    printf("Maximum iterations:     %d\n", MAX_ITERATIONS);
+    printf("Convergence threshold:  %lf\n", CONVERGENCE_THRESHOLD);
+    printf(SEPARATOR);
+
+    // The file is consumed as N * N consecutive doubles, filling A row by row.
+    if (fread(A, sizeof(*A), count, data) != count)
+    {
+        printf("Unable to read %dx%d values from the input data file\n", N, N);
+        free(A);
+        free(xtmp);
+        fclose(data);
+        return 1;
+    }
+
+    // Seed the scratch buffer with the input so that none of its entries is
+    // left indeterminate when the solver reads it back.
+    memcpy(xtmp, A, count * sizeof(*A));
+
+    // Run Jacobi solver
+    start_timer();
+    int itr = run(A, xtmp);
+    stop_timer();
+
+    printf("Iterations     = %d\n", itr);
+    printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
+    if (itr == MAX_ITERATIONS)
+        printf("WARNING: solution did not converge\n");
+    printf(SEPARATOR);
+
+    free(A);
+    free(xtmp);
+    fclose(data);
+    return 0;
+}
+
+// Parse a non-negative decimal integer.
+//
+// Returns the parsed value, or -1 when str is empty, contains characters
+// that are not part of the number, is negative, or does not fit in an int.
+int parse_int(const char *str)
+{
+    char *next;
+
+    errno = 0;
+    long value = strtol(str, &next, 10);
+
+    // next == str means no digits were consumed (e.g. empty string).
+    if (next == str || *next != '\0' || errno == ERANGE)
+        return -1;
+    if (value < 0 || value > INT_MAX)
+        return -1;
+    return (int)value;
+}
+
+// Parse a floating-point number with strtod().
+//
+// Returns the parsed value, or -1 when str is empty or is followed by
+// characters that are not part of the number.
+double parse_double(const char *str)
+{
+    char *next;
+    double value = strtod(str, &next);
+
+    // next == str means nothing was converted (e.g. empty string).
+    if (next == str || *next != '\0')
+        return -1;
+    return value;
+}
+
+// Parse the command line into the file-scope solver parameters (N,
+// MAX_ITERATIONS, CONVERGENCE_THRESHOLD) and open the input data file that
+// matches N into `data`.
+//
+// Exits with status 1 on an invalid option, an unsupported matrix order or
+// a data file that cannot be opened; exits with status 0 after --help.
+void parse_arguments(int argc, char *argv[])
+{
+    // Set default values
+    N = 500;
+    MAX_ITERATIONS = 2000;
+    CONVERGENCE_THRESHOLD = 0.001;
+    SEED = 0;
+
+    for (int i = 1; i < argc; i++)
+    {
+        if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
+        {
+            // ++i consumes the option's value; a missing value is an error.
+            if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
+            {
+                printf("Invalid convergence threshold\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
+        {
+            if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid number of iterations\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
+        {
+            if (++i >= argc || (N = parse_int(argv[i])) < 0)
+            {
+                printf("Invalid matrix order\n");
+                exit(1);
+            }
+        }
+        else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
+        {
+            printf("\n");
+            printf("Usage: ./jacobi [OPTIONS]\n\n");
+            printf("Options:\n");
+            printf("  -h  --help               Print this message\n");
+            printf("  -c  --convergence  C     Set convergence threshold\n");
+            printf("  -i  --iterations   I     Set maximum number of iterations\n");
+            printf("  -n  --norder       N     Set maxtrix order (500 or 1000)\n");
+            printf("\n");
+            exit(0);
+        }
+        else
+        {
+            printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
+            exit(1);
+        }
+    }
+
+    // Only the two orders that have a data file are accepted.
+    const char *path;
+    if (N == 1000)
+        path = "data/jacobi-1000.bin";
+    else if (N == 500)
+        path = "data/jacobi-500.bin";
+    else
+    {
+        printf("Invalid matrix order\n");
+        exit(1);
+    }
+
+    data = fopen(path, "rb");
+    if (data == NULL)
+    {
+        // Report the failing path together with the errno description.
+        perror(path);
+        exit(1);
+    }
+}
--- a/openmp/lab3/matmul.c
+++ b/openmp/lab3/matmul.c
@ -0,0 +1,174 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#ifndef N
+#define N (1 << 10)
+#endif
+
+#pragma omp declare target
+#define SM 64
+
+/* Copy the SM x SM tile that starts at `a` (rows n elements apart) into the
+ * dense SM x SM buffer `b` (rows SM elements apart). */
+static void reorder2(float *restrict a, float *restrict b, int n)
+{
+    for (int row = 0; row < SM; row++)
+    {
+        const float *src = a + row * n;
+        float *dst = b + row * SM;
+
+        for (int col = 0; col < SM; col++)
+        {
+            dst[col] = src[col];
+        }
+    }
+}
+
+/* Multiply-accumulate one SM x SM tile: c += a * b, where `a` and `c` are
+ * tiles inside matrices with row stride n and `b` is a dense SM x SM tile. */
+static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    for (int row = 0; row < SM; row++)
+    {
+        float *c_row = c + row * n;
+
+        for (int inner = 0; inner < SM; inner++)
+        {
+            const float a_val = a[row * n + inner];
+            const float *b_row = b + inner * SM;
+
+            for (int col = 0; col < SM; col++)
+            {
+                c_row[col] += a_val * b_row[col];
+            }
+        }
+    }
+}
+
+/* Blocked matrix multiply c += a * b on n x n row-major matrices, processed
+ * as (n / SM) x (n / SM) tiles of SM x SM elements. Each tile of b is first
+ * packed into a dense local buffer by reorder2() and then consumed by
+ * kernel(). Declared inside the `omp declare target` region. */
+void gemm_accel(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    const int tiles = n / SM;
+    float packed_b[SM * SM];
+
+    for (int ti = 0; ti < tiles; ti++)
+    {
+        for (int tj = 0; tj < tiles; tj++)
+        {
+            float *c_tile = &c[SM * (ti * n + tj)];
+
+            for (int tk = 0; tk < tiles; tk++)
+            {
+                float *a_tile = &a[SM * (ti * n + tk)];
+                float *b_tile = &b[SM * (tk * n + tj)];
+
+                reorder2(b_tile, packed_b, n);
+                kernel(a_tile, packed_b, c_tile, n);
+            }
+        }
+    }
+}
+
+#pragma omp end declare target
+
+/* Host version of the blocked matrix multiply c += a * b on n x n row-major
+ * matrices: walks the (n / SM) x (n / SM) tile grid, packs each tile of b
+ * with reorder2() and accumulates it into c with kernel(). */
+void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    const int tiles = n / SM;
+    float packed_b[SM * SM];
+
+    for (int ti = 0; ti < tiles; ti++)
+    {
+        for (int tj = 0; tj < tiles; tj++)
+        {
+            float *c_tile = &c[SM * (ti * n + tj)];
+
+            for (int tk = 0; tk < tiles; tk++)
+            {
+                float *a_tile = &a[SM * (ti * n + tk)];
+                float *b_tile = &b[SM * (tk * n + tj)];
+
+                reorder2(b_tile, packed_b, n);
+                kernel(a_tile, packed_b, c_tile, n);
+            }
+        }
+    }
+}
+
+/* Naive reference matrix multiply: c += a * b for n x n matrices.
+ *
+ * All three matrices are indexed row-major (element (i, j) at [i * n + j]),
+ * the same layout used by the blocked kernel()/gemm_opt() path, so the two
+ * implementations compute the same product.
+ *
+ * a, b  input matrices, n * n floats each
+ * c     accumulator, n * n floats; the product is added to its contents
+ * n     matrix order
+ */
+void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    for (int i = 0; i < n; ++i)
+    {
+        for (int j = 0; j < n; ++j)
+        {
+            float sum = 0.0f;
+            for (int k = 0; k < n; ++k)
+            {
+                sum += a[i * n + k] * b[k * n + j];
+            }
+            c[i * n + j] += sum;
+        }
+    }
+}
+
+/* Benchmark driver: times gemm(), gemm_opt() and gemm_accel() on n x n
+ * matrices (n from argv[1], default N) and checks the gemm_accel() result
+ * against the gemm_opt() result when the latter was run. */
+int main(int argc, char *argv[])
+{
+    int n = N,
+        iret = 0;
+    int have_reference = 0;
+    float *a, *b, *c, *g;
+    struct timespec rt[2];
+    double wt; // walltime
+
+    if (argc > 1)
+        n = atoi(argv[1]);
+
+    /* gemm_opt() and gemm_accel() process n / SM whole tiles per dimension,
+     * so rows/columns beyond a multiple of SM would never be computed. */
+    if (n <= 0 || n % SM != 0)
+    {
+        printf("error: matrix order must be a positive multiple of %d\n", SM);
+        exit(EXIT_FAILURE);
+    }
+    const size_t count = (size_t)n * (size_t)n;
+
+    /*
+     * 0. prepare a, b, c, and g
+     *
+     * g := a * b (reference, on host)
+     * c := a * b (on accel)
+     */
+    if (NULL == (a = (float *)malloc(sizeof(*a) * count)))
+    {
+        printf("error: memory allocation for 'a'\n");
+        iret = -1;
+    }
+    if (NULL == (b = (float *)malloc(sizeof(*b) * count)))
+    {
+        printf("error: memory allocation for 'b'\n");
+        iret = -1;
+    }
+    if (NULL == (c = (float *)malloc(sizeof(*c) * count)))
+    {
+        printf("error: memory allocation for 'c'\n");
+        iret = -1;
+    }
+    if (NULL == (g = (float *)malloc(sizeof(*g) * count)))
+    {
+        printf("error: memory allocation for 'g'\n");
+        iret = -1;
+    }
+
+    if (0 != iret)
+    {
+        free(a);
+        free(b);
+        free(c);
+        free(g);
+        exit(EXIT_FAILURE);
+    }
+
+    /* Fill the inputs with small integers: every product and partial sum is
+     * then exactly representable in a float, so results can be compared for
+     * equality regardless of the order in which terms are accumulated. */
+    for (int row = 0; row < n; row++)
+    {
+        for (int col = 0; col < n; col++)
+        {
+            const size_t idx = (size_t)row * n + col;
+            a[idx] = (float)((row + col) % 4);
+            b[idx] = (float)((row + 2 * col) % 8);
+            c[idx] = 0.0f;
+            g[idx] = 0.0f;
+        }
+    }
+
+    if (n <= 1024)
+    {
+        clock_gettime(CLOCK_REALTIME, rt + 0);
+        gemm(a, b, c, n);
+        clock_gettime(CLOCK_REALTIME, rt + 1);
+        wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+        printf("gemm on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+
+        /* Timing run only: every gemm variant accumulates into its output,
+         * so clear c before it is reused for the accelerator result. */
+        for (size_t idx = 0; idx < count; idx++)
+            c[idx] = 0.0f;
+    }
+
+    if (n <= 4096)
+    {
+        clock_gettime(CLOCK_REALTIME, rt + 0);
+        gemm_opt(a, b, g, n);
+        clock_gettime(CLOCK_REALTIME, rt + 1);
+        wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+        printf("gemm_opt on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+        have_reference = 1;
+    }
+
+    clock_gettime(CLOCK_REALTIME, rt + 0);
+    gemm_accel(a, b, c, n);
+    clock_gettime(CLOCK_REALTIME, rt + 1);
+    wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+    printf("GEMM-opt1 on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+
+    /* Verify all n * n elements against the host reference, when one exists. */
+    if (have_reference)
+    {
+        for (size_t idx = 0; idx < count; idx++)
+        {
+            assert(g[idx] == c[idx]);
+        }
+    }
+    else
+    {
+        printf("verification skipped: no host reference for n > 4096\n");
+    }
+
+    free(a);
+    free(b);
+    free(c);
+    free(g);
+
+    return 0;
+}
--- a/openmp/lab3/saxpy.c
+++ b/openmp/lab3/saxpy.c
@ -0,0 +1,120 @@
+/**
+ * @file saxpy.c
+ *
+ * @brief saxpy performs the \c axpy computation in single-precision on both
+ * host and accelerator. The performance (in MFLOPS) on host and accelerator is
+ * compared and the numerical results are also verified for consistency.
+ *
+ * The \c axpy computation is defined as:
+ *
+ * y := a * x + y
+ *
+ * where:
+ *
+ * - a is a scalar.
+ * - x and y are vectors each with n elements.
+ *
+ * Please note that in this version only <em>one GPU thread</em> is used.
+ *
+ * Offload to GPU:
+ *
+ * gcc -fopenmp -foffload=nvptx-none saxpy.c
+ *
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+
+#include "utils.h"
+
+#define TWO02 (1 << 2)
+#define TWO04 (1 << 4)
+#define TWO08 (1 << 8)
+#ifndef N
+#define N (1 << 26)
+#endif
+
+/* Benchmark driver: runs y := a * x + y and z := a * x + z on n-element
+ * vectors (n from argv[1], default N), prints the time and MFLOPS of each
+ * run and asserts that y and z are bitwise identical afterwards. */
+int main(int argc, char *argv[])
+{
+  int i, n = N,
+         iret = 0;
+  float a = 101.0f / TWO02,
+        b, c,
+        *x, *y, *z;
+  struct timespec rt[2];
+  double wt; // walltime
+
+  if (argc > 1)
+    n = atoi(argv[1]);
+
+  /* atoi() yields 0 for non-numeric input and may yield a negative value; a
+   * negative n would be converted to a huge size in the malloc() calls. */
+  if (n <= 0)
+  {
+    printf("error: vector length must be a positive integer\n");
+    exit(EXIT_FAILURE);
+  }
+
+  /*
+   * 0. prepare x, y, and z
+   *
+   * y := a * x + y (on host)
+   * z := a * x + z (on accel)
+   */
+  if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
+  {
+    printf("error: memory allocation for 'x'\n");
+    iret = -1;
+  }
+  if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
+  {
+    printf("error: memory allocation for 'y'\n");
+    iret = -1;
+  }
+  if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
+  {
+    printf("error: memory allocation for 'z'\n");
+    iret = -1;
+  }
+  if (0 != iret)
+  {
+    free(x);
+    free(y);
+    free(z);
+    exit(EXIT_FAILURE);
+  }
+  b = rand() % TWO04;
+  c = rand() % TWO08;
+  for (i = 0; i < n; i++)
+  {
+    x[i] = b / (float)TWO02;
+    y[i] = z[i] = c / (float)TWO04;
+  }
+  /*
+   * 1. saxpy on host
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+  for (i = 0; i < n; i++)
+  {
+    y[i] = a * x[i] + y[i];
+  }
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+  /*
+   * 2. saxpy on accel
+   *
+   * NOTE(review): this loop carries no OpenMP target directive, so as written
+   * it runs on the host; presumably the offload pragmas are the exercise.
+   */
+  clock_gettime(CLOCK_REALTIME, rt + 0);
+
+  for (i = 0; i < n; i++)
+  {
+    z[i] = a * x[i] + z[i];
+  }
+  clock_gettime(CLOCK_REALTIME, rt + 1);
+  wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+  printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
+  /*
+   * 3. verify numerical consistency
+   *
+   * The bit patterns are compared through a union instead of casting the
+   * float pointers to int pointers, which would break the aliasing rules.
+   */
+  for (i = 0; i < n; i++)
+  {
+    union
+    {
+      float f;
+      unsigned int u;
+    } ybits, zbits;
+
+    ybits.f = y[i];
+    zbits.f = z[i];
+    iret = (ybits.u != zbits.u);
+    assert(iret == 0);
+  }
+
+  free(x);
+  free(y);
+  free(z);
+  return 0;
+}
--- a/openmp/lab3/setup_clang.sh
+++ b/openmp/lab3/setup_clang.sh
@ -0,0 +1,2 @@
+#!/bin/bash
+# Load the clang 11.0.0 and cuda 10.0 environment modules.
+# NOTE(review): presumably meant to be sourced (`source setup_clang.sh`) so
+# that the loaded modules affect the calling shell - confirm.
+module load clang/11.0.0 cuda/10.0
--- a/openmp/lab3/utils.c
+++ b/openmp/lab3/utils.c
@ -0,0 +1,150 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @file utils.c
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief File containing utilities functions for HPC Unimore Class
+ *
+ * Utilities for OpenMP lab.
+ * 
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+
+#define _POSIX_C_SOURCE 199309L
+#include <time.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "utils.h"
+
+#define MAX_ITERATIONS 100
+static struct timespec timestampA, timestampB;
+static unsigned long long statistics[MAX_ITERATIONS];
+static int iterations = 0;
+
+/* Return the number of nanoseconds from `start` to `end`. */
+static unsigned long long __diff_ns(struct timespec start, struct timespec end)
+{
+    struct timespec delta;
+
+    delta.tv_sec = end.tv_sec - start.tv_sec;
+    delta.tv_nsec = end.tv_nsec - start.tv_nsec;
+
+    /* Borrow one second when the nanosecond field went negative. */
+    if (delta.tv_nsec < 0)
+    {
+        delta.tv_sec -= 1;
+        delta.tv_nsec += 1000000000L;
+    }
+
+    return delta.tv_nsec + delta.tv_sec * 1000000000ULL;
+}
+
+/* Store the current CLOCK_MONOTONIC_RAW time in timestampA. */
+void start_timer()
+{
+    /* The empty asm statements with a "memory" clobber keep the compiler
+     * from moving memory accesses across the clock read. */
+    asm volatile("" ::: "memory");
+    clock_gettime(CLOCK_MONOTONIC_RAW, &timestampA);
+    asm volatile("" ::: "memory");
+}
+
+/* Store the current CLOCK_MONOTONIC_RAW time in timestampB. The elapsed time
+ * is not computed here; callers obtain it from elapsed_ns(). */
+void stop_timer()
+{
+    /* The empty asm statements with a "memory" clobber keep the compiler
+     * from moving memory accesses across the clock read. */
+    asm volatile("" ::
+                     : "memory");
+    clock_gettime(CLOCK_MONOTONIC_RAW, &timestampB);
+    asm volatile("" ::
+                     : "memory");
+}
+
+/* Return the nanoseconds between timestampA (written by start_timer()) and
+ * timestampB (written by stop_timer()). */
+unsigned long long elapsed_ns()
+{
+    return __diff_ns(timestampA, timestampB);
+}
+
+/* Begin one statistics sample; simply forwards to start_timer(). */
+void start_stats()
+{
+    start_timer();
+}
+
+/* End the current sample: stop the timer and append the elapsed nanoseconds
+ * to statistics[]. At most MAX_ITERATIONS samples fit in the array. */
+void collect_stats()
+{
+    /* NOTE(review): this bound is enforced only by assert(); with NDEBUG
+     * defined the store below would not be range-checked. */
+    assert(iterations < MAX_ITERATIONS);
+    stop_timer();
+    statistics[iterations++] = elapsed_ns();
+}
+
+/* Print average, minimum, maximum (in milliseconds) and standard deviation
+ * of the samples stored by collect_stats(). Prints a notice instead when no
+ * sample has been collected. */
+void print_stats()
+{
+    unsigned long long min = ULLONG_MAX;
+    unsigned long long max = 0LL;
+    double average = 0.0;
+    double std_deviation = 0.0;
+    double sum = 0.0;
+    double squared_dev = 0.0;
+
+    /* Without samples the average would be a division by zero. */
+    if (iterations <= 0)
+    {
+        printf("No statistics collected\n");
+        return;
+    }
+
+    /*  Compute the sum of all elements (in ms) and track min/max (in ns) */
+    for (int i = 0; i < iterations; i++)
+    {
+        if (statistics[i] > max)
+            max = statistics[i];
+        if (statistics[i] < min)
+            min = statistics[i];
+        sum = sum + statistics[i] / 1E6;
+    }
+    average = sum / (double)iterations;
+
+    /*  Compute  variance  and standard deviation. The squared deviations go
+     *  into their own accumulator: adding them onto the sum of the samples
+     *  would inflate the result. */
+    for (int i = 0; i < iterations; i++)
+    {
+        squared_dev = squared_dev + pow((statistics[i] / 1E6 - average), 2);
+    }
+    std_deviation = sqrt(squared_dev / (double)iterations);
+
+    printf("AvgTime\tMinTime\tMaxTime\tStdDev\n");
+    printf("%.4f ms\t%.4f ms\t%.4f ms\t%.4f\n", (double)average, (double)min / 1E6, (double)max / 1E6, (double)std_deviation);
+}
+
+/* Dummy workload: runs `num` iterations of a loop that updates a volatile
+ * counter, so the loop cannot be removed by the optimizer.
+ *
+ * clang also defines __GNUC__ but does not implement the GCC optimize
+ * pragmas, so it is checked first and given the optnone attribute; the GCC
+ * pragmas are used for real GCC only. */
+#if defined(__clang__)
+__attribute__((optnone)) void work(unsigned long num)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC optimize("O0")
+void work(unsigned long num)
+#else
+void work(unsigned long num)
+#endif
+{
+    volatile int cnt = 0;
+    /* The index has the same type as num: an int index would overflow
+     * before reaching a num larger than INT_MAX. */
+    for (unsigned long i = 0; i < num; i++)
+        cnt += i;
+}
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
--- a/openmp/lab3/utils.h
+++ b/openmp/lab3/utils.h
@ -0,0 +1,162 @@
+/*
+ * BSD 2-Clause License
+ * 
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file utils.h
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief File containing utilities functions for HPC Unimore Class
+ *
+ * The header define time functions and dummy workload used on the example tests.
+ * 
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+#ifndef __UTILS_H__
+#define __UTILS_H__
+
+#include <stdarg.h>
+
+#if defined(VERBOSE)
+#define DEBUG_PRINT(x, ...) printf((x), ##__VA_ARGS__)
+#else
+#define DEBUG_PRINT(x, ...)
+#endif
+
+#if !defined(NTHREADS)
+#define NTHREADS (4)
+#endif
+
+#if !defined(NTHREADS_GPU)
+#define NTHREADS_GPU (1024)
+#endif
+
+/**
+ * @brief The function sets the start timestamp (timestampA)
+ *
+ * The function is used to measure elapsed time between two execution points.
+ * The function start_timer() sets the starting point timestamp, while the function
+ * stop_timer() sets the termination timestamp. The elapsed time, expressed in nanoseconds,
+ * between the two points can be retrieved using the function elapsed_ns().
+ * 
+ * Example usage:
+ * @code
+ * start_timer(); // Point A
+ * //SOME CODE HERE
+ * stop_timer(); // Point B
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and B
+ * //SOME OTHER CODE HERE
+ * stop_timer(); // Point C
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and C
+ * @endcode
+ * 
+ * @return void
+ * @see start_timer()
+ * @see stop_timer()
+ * @see elapsed_ns()
+ */
+void start_timer();
+
+/**
+ * @brief The function sets the second timestamp
+ *
+ * The function is used to measure elapsed time between two execution points.
+ * The function start_timer() sets the starting point timestamp, while the function
+ * stop_timer() sets the termination timestamp. It does not return a value: the elapsed
+ * time, expressed in nanoseconds, between the last call of start_timer() and the last
+ * call of stop_timer() is retrieved using the function elapsed_ns().
+ * 
+ * Example usage:
+ * @code
+ * start_timer(); // Point A
+ * //SOME CODE HERE
+ * stop_timer(); // Point B
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and B
+ * //SOME OTHER CODE HERE
+ * stop_timer(); // Point C
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and C
+ * @endcode
+ * 
+ * @return void
+ * @see start_timer()
+ * @see stop_timer()
+ * @see elapsed_ns()
+ */
+void stop_timer();
+
+/**
+ * @brief Elapsed nano seconds between start_timer() and stop_timer().
+ *
+ * @return Elapsed nano seconds
+ * @see start_timer()
+ * @see stop_timer()
+ */
+unsigned long long elapsed_ns();
+
+/**
+ * @brief The function init the starting point of stat measurement.
+ *
+ * The function is similar to start_timer().
+ * 
+ * @return void
+ * @see start_timer
+ */
+void start_stats();
+
+/**
+ * @brief The function collects the elapsed time between the current execution point and the 
+ * last call of start_stats().
+ * 
+ * @return void
+ */
+void collect_stats();
+
+/**
+ * @brief The function displays the collected statistics.
+ * @return void
+ */
+void print_stats();
+
+/**
+ * @brief The dummy work function
+ *
+ * The function is used to emulate some useful workload.
+ * 
+ * @param num work duration in terms of loop iterations.
+ * @return void
+ */
+#if defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC optimize("O0")
+void work(unsigned long num);
+#else
+void work __attribute__((optnone)) (unsigned long num);
+#endif
+#if defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+
+#endif /*__UTILS_H__*/