mirror of https://github.com/Steffo99/unimore-hpc-assignments.git synced 2024-11-25 01:24:22 +00:00

HPC OpenMP Lab 3

Alessandro Capotondi 2021-04-21 10:16:41 +02:00
parent 26ca5f4a19
commit 6454fbf443
21 changed files with 3539 additions and 1 deletion


@ -12,4 +12,4 @@ This repo contains the exercises and the tutorials used for Unimore's HPC class
The exercises related to the OpenMP programming model can be found in the folder `openmp`. Here is the list of currently available classes:
- `openmp/lab1`: OpenMP basics: *parallel*, *for-loop*, *sections*, and *tasking*.
- `openmp/lab2`: OpenMP Advanced: *reduction*, *tasking*, *optimizations*.
- `openmp/lab3`: OpenMP 4.x+: *Accelerator Model (targeting: Nvidia GP-GPU)* (see the sketch below).
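Lab 3 covers the OpenMP accelerator model. For orientation only (this sketch is not part of the repository), a minimal host-to-GPU offload, built with flags along the lines of the `gcc -fopenmp -foffload=nvptx-none` invocation documented in `saxpy.c` later in this commit, looks like:

#include <stdio.h>

int main(void)
{
    const int n = 1024;
    double x[1024];

    /* Offload the loop to the default device; x is copied to the device
       and copied back when the target region ends. */
    #pragma omp target teams distribute parallel for map(tofrom: x[0:n])
    for (int i = 0; i < n; i++)
        x[i] = 2.0 * i;

    printf("x[42] = %f\n", x[42]);
    return 0;
}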


@ -0,0 +1,282 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file jacobi.c
* @author Alessandro Capotondi
* @date 27 Mar 2020
* @brief This code solves the steady state heat equation on a rectangular region.
* This code solves the steady state heat equation on a rectangular region.
* The sequential version of this program needs approximately
* 18/epsilon iterations to complete.
* The physical region, and the boundary conditions, are suggested
* by this diagram:
* W = 0
* +------------------+
* | |
* W = 100 | | W = 100
* | |
* +------------------+
* W = 100
* The region is covered with a grid of M by N nodes, and an N by N
* array W is used to record the temperature. The correspondence between
* array indices and locations in the region is suggested by giving the
* indices of the four corners:
* I = 0
* [0][0]-------------[0][N-1]
* | |
* J = 0 | | J = N-1
* | |
* [M-1][0]-----------[M-1][N-1]
* I = M-1
* The steady state solution to the discrete heat equation satisfies the
* following condition at an interior grid point:
* W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* where "Central" is the index of the grid point, "North" is the index
* of its immediate neighbor to the "north", and so on.
*
* Given an approximate solution of the steady state heat equation, a
* "better" solution is given by replacing each interior point by the
* average of its 4 neighbors - in other words, by using the condition
* as an ASSIGNMENT statement:
* W[Central] <= (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* If this process is repeated often enough, the difference between successive
* estimates of the solution will go to zero.
* This program carries out such an iteration, using a tolerance specified by
* the user, and writes the final estimate of the solution to a file that can
* be used for graphic processing.
* Licensing:
* This code is distributed under the GNU LGPL license.
* Modified:
* 18 October 2011
* Author:
* Original C version by Michael Quinn.
* This C version by John Burkardt.
* Reference:
* Michael Quinn,
* Parallel Programming in C with MPI and OpenMP,
* McGraw-Hill, 2004,
* ISBN13: 978-0071232654,
* LC: QA76.73.C15.Q55.
* Local parameters:
* Local, double DIFF, the norm of the change in the solution from one iteration
* to the next.
* Local, double MEAN, the average of the boundary values, used to initialize
* the values of the solution in the interior.
* Local, double U[M][N], the solution at the previous iteration.
* Local, double W[M][N], the solution computed at the latest iteration.
*
*
* @see https://en.wikipedia.org/wiki/Jacobi_method
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include "utils.h"
static int N;
static int MAX_ITERATIONS;
static int SEED;
static double CONVERGENCE_THRESHOLD;
static FILE *data;
#define SEPARATOR "------------------------------------\n"
// Return the current time in seconds since the Epoch
double get_timestamp();
// Parse command line arguments to set solver parameters
void parse_arguments(int argc, char *argv[]);
// Run the Jacobi solver
// Returns the number of iterations performed
int run(double *restrict A, double *restrict xtmp)
{
int iter = 0, iterations_print = 1;
double err = 0.0;
do
{
err = 0.0;
#pragma omp target map(to \
: A [0:N * N]) map(from \
: xtmp [0:N * N]) map(tofrom \
: err)
for (int i = 1; i < N - 1; i++)
{
for (int j = 1; j < N - 1; j++)
{
xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
}
}
#pragma omp target map(to \
: xtmp [0:N * N]) map(from \
: A [0:N * N])
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
A[i * N + j] = xtmp[i * N + j];
}
}
iter++;
#ifdef DEBUG
if (iter == iterations_print)
{
printf(" %8d %f\n", iter, err);
iterations_print = 2 * iterations_print;
}
#endif
} while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
return iter;
}
int main(int argc, char *argv[])
{
parse_arguments(argc, argv);
double *A = malloc(N * N * sizeof(double));
double *xtmp = malloc(N * N * sizeof(double));
printf(SEPARATOR);
printf("Matrix size: %dx%d\n", N, N);
printf("Maximum iterations: %d\n", MAX_ITERATIONS);
printf("Convergence threshold: %lf\n", CONVERGENCE_THRESHOLD);
printf(SEPARATOR);
for (int ii = 0; ii < N; ii++)
{
for (int jj = 0; jj < N; jj++)
{
double f;
fread(&f, sizeof(double), 1, data);
A[ii * N + jj] = f;
}
}
// Run Jacobi solver
start_timer();
int itr = run(A, xtmp);
stop_timer();
printf("Iterations = %d\n", itr);
printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
if (itr == MAX_ITERATIONS)
printf("WARNING: solution did not converge\n");
printf(SEPARATOR);
free(A);
free(xtmp);
fclose(data);
return 0;
}
int parse_int(const char *str)
{
char *next;
int value = strtoul(str, &next, 10);
return strlen(next) ? -1 : value;
}
double parse_double(const char *str)
{
char *next;
double value = strtod(str, &next);
return strlen(next) ? -1 : value;
}
void parse_arguments(int argc, char *argv[])
{
// Set default values
N = 500;
MAX_ITERATIONS = 2000;
CONVERGENCE_THRESHOLD = 0.001;
SEED = 0;
for (int i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
{
if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
{
printf("Invalid convergence threshold\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
{
if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
{
printf("Invalid number of iterations\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
{
if (++i >= argc || (N = parse_int(argv[i])) < 0)
{
printf("Invalid matrix order\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{
printf("\n");
printf("Usage: ./jacobi [OPTIONS]\n\n");
printf("Options:\n");
printf(" -h --help Print this message\n");
printf(" -c --convergence C Set convergence threshold\n");
printf(" -i --iterations I Set maximum number of iterations\n");
printf(" -n --norder N Set maxtrix order (500 or 1000)\n");
printf("\n");
exit(0);
}
else
{
printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
exit(1);
}
}
if (N == 1000)
data = fopen("data/jacobi-1000.bin", "rb");
else if (N == 500)
data = fopen("data/jacobi-500.bin", "rb");
else
{
printf("Invalid matrix order\n");
exit(1);
}
}
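Note: in this first version the `omp target` region in `run()` offloads the stencil but contains no worksharing construct, so the loop nest executes on a single device thread. The next file in this commit parallelizes it; the relevant change, shown here as a sketch that mirrors that file, is a teams-level worksharing directive with a max reduction on `err`:

#pragma omp target map(to: A[0:N * N]) map(from: xtmp[0:N * N]) map(tofrom: err)
#pragma omp teams distribute parallel for reduction(max: err)
for (int i = 1; i < N - 1; i++)
{
    for (int j = 1; j < N - 1; j++)
    {
        /* 4-point stencil update and per-element error, as in the loop above */
        xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] +
                                  A[i * N + j - 1] + A[i * N + j + 1]);
        err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
    }
}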


@ -0,0 +1,285 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file jacobi.c
* @author Alessandro Capotondi
* @date 27 Mar 2020
* @brief This code solves the steady state heat equation on a rectangular region.
* This code solves the steady state heat equation on a rectangular region.
* The sequential version of this program needs approximately
* 18/epsilon iterations to complete.
* The physical region, and the boundary conditions, are suggested
* by this diagram:
* W = 0
* +------------------+
* | |
* W = 100 | | W = 100
* | |
* +------------------+
* W = 100
* The region is covered with a grid of M by N nodes, and an N by N
* array W is used to record the temperature. The correspondence between
* array indices and locations in the region is suggested by giving the
* indices of the four corners:
* I = 0
* [0][0]-------------[0][N-1]
* | |
* J = 0 | | J = N-1
* | |
* [M-1][0]-----------[M-1][N-1]
* I = M-1
* The steady state solution to the discrete heat equation satisfies the
* following condition at an interior grid point:
* W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* where "Central" is the index of the grid point, "North" is the index
* of its immediate neighbor to the "north", and so on.
*
* Given an approximate solution of the steady state heat equation, a
* "better" solution is given by replacing each interior point by the
* average of its 4 neighbors - in other words, by using the condition
* as an ASSIGNMENT statement:
* W[Central] <= (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* If this process is repeated often enough, the difference between successive
* estimates of the solution will go to zero.
* This program carries out such an iteration, using a tolerance specified by
* the user, and writes the final estimate of the solution to a file that can
* be used for graphic processing.
* Licensing:
* This code is distributed under the GNU LGPL license.
* Modified:
* 18 October 2011
* Author:
* Original C version by Michael Quinn.
* This C version by John Burkardt.
* Reference:
* Michael Quinn,
* Parallel Programming in C with MPI and OpenMP,
* McGraw-Hill, 2004,
* ISBN13: 978-0071232654,
* LC: QA76.73.C15.Q55.
* Local parameters:
* Local, double DIFF, the norm of the change in the solution from one iteration
* to the next.
* Local, double MEAN, the average of the boundary values, used to initialize
* the values of the solution in the interior.
* Local, double U[M][N], the solution at the previous iteration.
* Local, double W[M][N], the solution computed at the latest iteration.
*
*
* @see https://en.wikipedia.org/wiki/Jacobi_method
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include "utils.h"
static int N;
static int MAX_ITERATIONS;
static int SEED;
static double CONVERGENCE_THRESHOLD;
static FILE *data;
#define SEPARATOR "------------------------------------\n"
// Return the current time in seconds since the Epoch
double get_timestamp();
// Parse command line arguments to set solver parameters
void parse_arguments(int argc, char *argv[]);
// Run the Jacobi solver
// Returns the number of iterations performed
int run(double *restrict A, double *restrict xtmp)
{
int iter = 0, iterations_print = 1;
double err = 0.0;
do
{
err = 0.0;
#pragma omp target map(to \
: A [0:N * N]) map(from \
: xtmp [0:N * N]) map(tofrom \
: err)
#pragma omp teams distribute parallel for reduction(max \
: err)
for (int i = 1; i < N - 1; i++)
{
for (int j = 1; j < N - 1; j++)
{
xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
}
}
#pragma omp target map(to \
: xtmp [0:N * N]) map(from \
: A [0:N * N])
#pragma omp teams distribute parallel for
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
A[i * N + j] = xtmp[i * N + j];
}
}
iter++;
#ifdef DEBUG
if (iter == iterations_print)
{
printf(" %8d %f\n", iter, err);
iterations_print = 2 * iterations_print;
}
#endif
} while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
return iter;
}
int main(int argc, char *argv[])
{
parse_arguments(argc, argv);
double *A = malloc(N * N * sizeof(double));
double *xtmp = malloc(N * N * sizeof(double));
printf(SEPARATOR);
printf("Matrix size: %dx%d\n", N, N);
printf("Maximum iterations: %d\n", MAX_ITERATIONS);
printf("Convergence threshold: %lf\n", CONVERGENCE_THRESHOLD);
printf(SEPARATOR);
for (int ii = 0; ii < N; ii++)
{
for (int jj = 0; jj < N; jj++)
{
double f;
fread(&f, sizeof(double), 1, data);
A[ii * N + jj] = f;
}
}
// Run Jacobi solver
start_timer();
int itr = run(A, xtmp);
stop_timer();
printf("Iterations = %d\n", itr);
printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
if (itr == MAX_ITERATIONS)
printf("WARNING: solution did not converge\n");
printf(SEPARATOR);
free(A);
free(xtmp);
fclose(data);
return 0;
}
int parse_int(const char *str)
{
char *next;
int value = strtoul(str, &next, 10);
return strlen(next) ? -1 : value;
}
double parse_double(const char *str)
{
char *next;
double value = strtod(str, &next);
return strlen(next) ? -1 : value;
}
void parse_arguments(int argc, char *argv[])
{
// Set default values
N = 500;
MAX_ITERATIONS = 2000;
CONVERGENCE_THRESHOLD = 0.001;
SEED = 0;
for (int i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
{
if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
{
printf("Invalid convergence threshold\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
{
if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
{
printf("Invalid number of iterations\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
{
if (++i >= argc || (N = parse_int(argv[i])) < 0)
{
printf("Invalid matrix order\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{
printf("\n");
printf("Usage: ./jacobi [OPTIONS]\n\n");
printf("Options:\n");
printf(" -h --help Print this message\n");
printf(" -c --convergence C Set convergence threshold\n");
printf(" -i --iterations I Set maximum number of iterations\n");
printf(" -n --norder N Set maxtrix order (500 or 1000)\n");
printf("\n");
exit(0);
}
else
{
printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
exit(1);
}
}
if (N == 1000)
data = fopen("data/jacobi-1000.bin", "rb");
else if (N == 500)
data = fopen("data/jacobi-500.bin", "rb");
else
{
printf("Invalid matrix order\n");
exit(1);
}
}
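This variant maps A and xtmp to and from the device on every iteration of the do/while loop. The later variants in this commit hoist those transfers out of the loop with unstructured data mapping; in outline (a structural sketch mirroring the `target enter data`/`target exit data` versions below):

/* Copy A in and allocate xtmp on the device once, before the iteration loop. */
#pragma omp target enter data map(to: A[0:N * N]) map(alloc: xtmp[0:N * N])

/* ... the do/while loop then launches target teams kernels that operate on the
   device copies, mapping only the scalar err back on each iteration ... */

/* Copy the converged solution back and release the device buffer at the end. */
#pragma omp target exit data map(from: A[0:N * N]) map(release: xtmp)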


@ -0,0 +1,293 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file jacobi.c
* @author Alessandro Capotondi
* @date 27 Mar 2020
* @brief This code solves the steady state heat equation on a rectangular region.
* This code solves the steady state heat equation on a rectangular region.
* The sequential version of this program needs approximately
* 18/epsilon iterations to complete.
* The physical region, and the boundary conditions, are suggested
* by this diagram:
* W = 0
* +------------------+
* | |
* W = 100 | | W = 100
* | |
* +------------------+
* W = 100
* The region is covered with a grid of M by N nodes, and an N by N
* array W is used to record the temperature. The correspondence between
* array indices and locations in the region is suggested by giving the
* indices of the four corners:
* I = 0
* [0][0]-------------[0][N-1]
* | |
* J = 0 | | J = N-1
* | |
* [M-1][0]-----------[M-1][N-1]
* I = M-1
* The steady state solution to the discrete heat equation satisfies the
* following condition at an interior grid point:
* W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* where "Central" is the index of the grid point, "North" is the index
* of its immediate neighbor to the "north", and so on.
*
* Given an approximate solution of the steady state heat equation, a
* "better" solution is given by replacing each interior point by the
* average of its 4 neighbors - in other words, by using the condition
* as an ASSIGNMENT statement:
* W[Central] <= (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* If this process is repeated often enough, the difference between successive
* estimates of the solution will go to zero.
* This program carries out such an iteration, using a tolerance specified by
* the user, and writes the final estimate of the solution to a file that can
* be used for graphic processing.
* Licensing:
* This code is distributed under the GNU LGPL license.
* Modified:
* 18 October 2011
* Author:
* Original C version by Michael Quinn.
* This C version by John Burkardt.
* Reference:
* Michael Quinn,
* Parallel Programming in C with MPI and OpenMP,
* McGraw-Hill, 2004,
* ISBN13: 978-0071232654,
* LC: QA76.73.C15.Q55.
* Local parameters:
* Local, double DIFF, the norm of the change in the solution from one iteration
* to the next.
* Local, double MEAN, the average of the boundary values, used to initialize
* the values of the solution in the interior.
* Local, double U[M][N], the solution at the previous iteration.
* Local, double W[M][N], the solution computed at the latest iteration.
*
*
* @see https://en.wikipedia.org/wiki/Jacobi_method
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include "utils.h"
static int N;
static int MAX_ITERATIONS;
static int SEED;
static double CONVERGENCE_THRESHOLD;
static FILE *data;
#define SEPARATOR "------------------------------------\n"
// Return the current time in seconds since the Epoch
double get_timestamp();
// Parse command line arguments to set solver parameters
void parse_arguments(int argc, char *argv[]);
// Run the Jacobi solver
// Returns the number of iterations performed
int run(double *restrict A, double *restrict xtmp)
{
int iter = 0, iterations_print = 1;
double err = 0.0;
do
{
err = 0.0;
#pragma omp target data map(to \
: A [0:N * N]) map(from \
: xtmp [0:N * N]) map(tofrom \
: err)
#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU) map(to \
: A [0:N * N]) map(from \
: xtmp [0:N * N]) map(tofrom \
: err)
#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) dist_schedule(static, NTHREADS_GPU) reduction(max \
: err) schedule(static, 1)
for (int i = 1; i < N - 1; i++)
{
for (int j = 1; j < N - 1; j++)
{
xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
}
}
#pragma omp target data map(from \
: A [0:N * N]) map(to \
: xtmp [0:N * N])
#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU) map(from \
: A [0:N * N]) map(to \
: xtmp [0:N * N])
#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) dist_schedule(static, NTHREADS_GPU) schedule(static, 1)
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
A[i * N + j] = xtmp[i * N + j];
}
}
iter++;
#ifdef DEBUG
if (iter == iterations_print)
{
printf(" %8d %f\n", iter, err);
iterations_print = 2 * iterations_print;
}
#endif
} while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
return iter;
}
int main(int argc, char *argv[])
{
parse_arguments(argc, argv);
double *A = malloc(N * N * sizeof(double));
double *xtmp = malloc(N * N * sizeof(double));
printf(SEPARATOR);
printf("Matrix size: %dx%d\n", N, N);
printf("Maximum iterations: %d\n", MAX_ITERATIONS);
printf("Convergence threshold: %lf\n", CONVERGENCE_THRESHOLD);
printf(SEPARATOR);
for (int ii = 0; ii < N; ii++)
{
for (int jj = 0; jj < N; jj++)
{
double f;
fread(&f, sizeof(double), 1, data);
A[ii * N + jj] = f;
}
}
// Run Jacobi solver
start_timer();
int itr = run(A, xtmp);
stop_timer();
printf("Iterations = %d\n", itr);
printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
if (itr == MAX_ITERATIONS)
printf("WARNING: solution did not converge\n");
printf(SEPARATOR);
free(A);
free(xtmp);
fclose(data);
return 0;
}
int parse_int(const char *str)
{
char *next;
int value = strtoul(str, &next, 10);
return strlen(next) ? -1 : value;
}
double parse_double(const char *str)
{
char *next;
double value = strtod(str, &next);
return strlen(next) ? -1 : value;
}
void parse_arguments(int argc, char *argv[])
{
// Set default values
N = 500;
MAX_ITERATIONS = 2000;
CONVERGENCE_THRESHOLD = 0.001;
SEED = 0;
for (int i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
{
if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
{
printf("Invalid convergence threshold\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
{
if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
{
printf("Invalid number of iterations\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
{
if (++i >= argc || (N = parse_int(argv[i])) < 0)
{
printf("Invalid matrix order\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{
printf("\n");
printf("Usage: ./jacobi [OPTIONS]\n\n");
printf("Options:\n");
printf(" -h --help Print this message\n");
printf(" -c --convergence C Set convergence threshold\n");
printf(" -i --iterations I Set maximum number of iterations\n");
printf(" -n --norder N Set maxtrix order (500 or 1000)\n");
printf("\n");
exit(0);
}
else
{
printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
exit(1);
}
}
if (N == 1000)
data = fopen("data/jacobi-1000.bin", "rb");
else if (N == 500)
data = fopen("data/jacobi-500.bin", "rb");
else
{
printf("Invalid matrix order\n");
exit(1);
}
}
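The `num_teams(N / NTHREADS_GPU)` and `thread_limit(NTHREADS_GPU)` clauses in this and the following variants use an `NTHREADS_GPU` macro that is not defined in the file shown; presumably it comes from `utils.h` or a `-D` compiler flag. A guarded fallback (an assumption, not taken from this commit; the value 128 is arbitrary) would be:

#ifndef NTHREADS_GPU
#define NTHREADS_GPU 128 /* assumed build-time parameter: threads per team */
#endif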


@ -0,0 +1,292 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file jacobi.c
* @author Alessandro Capotondi
* @date 27 Mar 2020
* @brief This code solves the steady state heat equation on a rectangular region.
* This code solves the steady state heat equation on a rectangular region.
* The sequential version of this program needs approximately
* 18/epsilon iterations to complete.
* The physical region, and the boundary conditions, are suggested
* by this diagram:
* W = 0
* +------------------+
* | |
* W = 100 | | W = 100
* | |
* +------------------+
* W = 100
* The region is covered with a grid of M by N nodes, and an N by N
* array W is used to record the temperature. The correspondence between
* array indices and locations in the region is suggested by giving the
* indices of the four corners:
* I = 0
* [0][0]-------------[0][N-1]
* | |
* J = 0 | | J = N-1
* | |
* [M-1][0]-----------[M-1][N-1]
* I = M-1
* The steady state solution to the discrete heat equation satisfies the
* following condition at an interior grid point:
* W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* where "Central" is the index of the grid point, "North" is the index
* of its immediate neighbor to the "north", and so on.
*
* Given an approximate solution of the steady state heat equation, a
* "better" solution is given by replacing each interior point by the
* average of its 4 neighbors - in other words, by using the condition
* as an ASSIGNMENT statement:
* W[Central] <= (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* If this process is repeated often enough, the difference between successive
* estimates of the solution will go to zero.
* This program carries out such an iteration, using a tolerance specified by
* the user, and writes the final estimate of the solution to a file that can
* be used for graphic processing.
* Licensing:
* This code is distributed under the GNU LGPL license.
* Modified:
* 18 October 2011
* Author:
* Original C version by Michael Quinn.
* This C version by John Burkardt.
* Reference:
* Michael Quinn,
* Parallel Programming in C with MPI and OpenMP,
* McGraw-Hill, 2004,
* ISBN13: 978-0071232654,
* LC: QA76.73.C15.Q55.
* Local parameters:
* Local, double DIFF, the norm of the change in the solution from one iteration
* to the next.
* Local, double MEAN, the average of the boundary values, used to initialize
* the values of the solution in the interior.
* Local, double U[M][N], the solution at the previous iteration.
* Local, double W[M][N], the solution computed at the latest iteration.
*
*
* @see https://en.wikipedia.org/wiki/Jacobi_method
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include "utils.h"
static int N;
static int MAX_ITERATIONS;
static int SEED;
static double CONVERGENCE_THRESHOLD;
static FILE *data;
#define SEPARATOR "------------------------------------\n"
// Return the current time in seconds since the Epoch
double get_timestamp();
// Parse command line arguments to set solver parameters
void parse_arguments(int argc, char *argv[]);
// Run the Jacobi solver
// Returns the number of iterations performed
int run(double *restrict A, double *restrict xtmp)
{
int iter = 0, iterations_print = 1;
double err = 0.0;
#pragma omp target enter data map(to \
: A [0:N * N]) map(alloc \
: xtmp [0:N * N])
do
{
err = 0.0;
#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU) map(tofrom \
: err)
#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) dist_schedule(static, NTHREADS_GPU) reduction(max \
: err)
for (int i = 1; i < N - 1; i++)
{
for (int j = 1; j < N - 1; j++)
{
xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
double diff = fabs(xtmp[i * N + j] - A[i * N + j]);
int swap = diff > err;
err = diff * swap + err * !swap;
}
}
#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU)
#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) dist_schedule(static, NTHREADS_GPU)
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
A[i * N + j] = xtmp[i * N + j];
}
}
iter++;
#ifdef DEBUG
if (iter == iterations_print)
{
printf(" %8d %f\n", iter, err);
iterations_print = 2 * iterations_print;
}
#endif
} while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
#pragma omp target exit data map(from \
: A [0:N * N]) map(release \
: xtmp)
return iter;
}
int main(int argc, char *argv[])
{
parse_arguments(argc, argv);
double *A = malloc(N * N * sizeof(double));
double *xtmp = malloc(N * N * sizeof(double));
printf(SEPARATOR);
printf("Matrix size: %dx%d\n", N, N);
printf("Maximum iterations: %d\n", MAX_ITERATIONS);
printf("Convergence threshold: %lf\n", CONVERGENCE_THRESHOLD);
printf(SEPARATOR);
for (int ii = 0; ii < N; ii++)
{
for (int jj = 0; jj < N; jj++)
{
double f;
fread(&f, sizeof(double), 1, data);
A[ii * N + jj] = f;
}
}
// Run Jacobi solver
start_timer();
int itr = run(A, xtmp);
stop_timer();
printf("Iterations = %d\n", itr);
printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
if (itr == MAX_ITERATIONS)
printf("WARNING: solution did not converge\n");
printf(SEPARATOR);
free(A);
free(xtmp);
fclose(data);
return 0;
}
int parse_int(const char *str)
{
char *next;
int value = strtoul(str, &next, 10);
return strlen(next) ? -1 : value;
}
double parse_double(const char *str)
{
char *next;
double value = strtod(str, &next);
return strlen(next) ? -1 : value;
}
void parse_arguments(int argc, char *argv[])
{
// Set default values
N = 500;
MAX_ITERATIONS = 2000;
CONVERGENCE_THRESHOLD = 0.001;
SEED = 0;
for (int i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
{
if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
{
printf("Invalid convergence threshold\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
{
if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
{
printf("Invalid number of iterations\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
{
if (++i >= argc || (N = parse_int(argv[i])) < 0)
{
printf("Invalid matrix order\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{
printf("\n");
printf("Usage: ./jacobi [OPTIONS]\n\n");
printf("Options:\n");
printf(" -h --help Print this message\n");
printf(" -c --convergence C Set convergence threshold\n");
printf(" -i --iterations I Set maximum number of iterations\n");
printf(" -n --norder N Set maxtrix order (500 or 1000)\n");
printf("\n");
exit(0);
}
else
{
printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
exit(1);
}
}
if (N == 1000)
data = fopen("data/jacobi-1000.bin", "rb");
else if (N == 500)
data = fopen("data/jacobi-500.bin", "rb");
else
{
printf("Invalid matrix order\n");
exit(1);
}
}


@ -0,0 +1,291 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file jacobi.c
* @author Alessandro Capotondi
* @date 27 Mar 2020
* @brief This code solves the steady state heat equation on a rectangular region.
* This code solves the steady state heat equation on a rectangular region.
* The sequential version of this program needs approximately
* 18/epsilon iterations to complete.
* The physical region, and the boundary conditions, are suggested
* by this diagram:
* W = 0
* +------------------+
* | |
* W = 100 | | W = 100
* | |
* +------------------+
* W = 100
* The region is covered with a grid of M by N nodes, and an N by N
* array W is used to record the temperature. The correspondence between
* array indices and locations in the region is suggested by giving the
* indices of the four corners:
* I = 0
* [0][0]-------------[0][N-1]
* | |
* J = 0 | | J = N-1
* | |
* [M-1][0]-----------[M-1][N-1]
* I = M-1
* The steady state solution to the discrete heat equation satisfies the
* following condition at an interior grid point:
* W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* where "Central" is the index of the grid point, "North" is the index
* of its immediate neighbor to the "north", and so on.
*
* Given an approximate solution of the steady state heat equation, a
* "better" solution is given by replacing each interior point by the
* average of its 4 neighbors - in other words, by using the condition
* as an ASSIGNMENT statement:
* W[Central] <= (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* If this process is repeated often enough, the difference between successive
* estimates of the solution will go to zero.
* This program carries out such an iteration, using a tolerance specified by
* the user, and writes the final estimate of the solution to a file that can
* be used for graphic processing.
* Licensing:
* This code is distributed under the GNU LGPL license.
* Modified:
* 18 October 2011
* Author:
* Original C version by Michael Quinn.
* This C version by John Burkardt.
* Reference:
* Michael Quinn,
* Parallel Programming in C with MPI and OpenMP,
* McGraw-Hill, 2004,
* ISBN13: 978-0071232654,
* LC: QA76.73.C15.Q55.
* Local parameters:
* Local, double DIFF, the norm of the change in the solution from one iteration
* to the next.
* Local, double MEAN, the average of the boundary values, used to initialize
* the values of the solution in the interior.
* Local, double U[M][N], the solution at the previous iteration.
* Local, double W[M][N], the solution computed at the latest iteration.
*
*
* @see https://en.wikipedia.org/wiki/Jacobi_method
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include "utils.h"
static int N;
static int MAX_ITERATIONS;
static int SEED;
static double CONVERGENCE_THRESHOLD;
static FILE *data;
#define SEPARATOR "------------------------------------\n"
// Return the current time in seconds since the Epoch
double get_timestamp();
// Parse command line arguments to set solver parameters
void parse_arguments(int argc, char *argv[]);
// Run the Jacobi solver
// Returns the number of iterations performed
int run(double *restrict A, double *restrict xtmp)
{
int iter = 0, iterations_print = 1;
double err = 0.0;
#pragma omp target enter data map(to \
: A [0:N * N]) map(alloc \
: xtmp [0:N * N])
do
{
err = 0.0;
#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU) map(tofrom \
: err)
#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) dist_schedule(static, NTHREADS_GPU) reduction(max \
: err)
for (int i = 1; i < N - 1; i++)
{
for (int j = 1; j < N - 1; j++)
{
xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
}
}
//#pragma omp target update from(xtmp[0:N*N])
#pragma omp target teams num_teams(N / NTHREADS_GPU) thread_limit(NTHREADS_GPU)
#pragma omp distribute parallel for collapse(2) num_threads(NTHREADS_GPU) dist_schedule(static, NTHREADS_GPU)
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
A[i * N + j] = xtmp[i * N + j];
}
}
iter++;
#ifdef DEBUG
if (iter == iterations_print)
{
printf(" %8d %f\n", iter, err);
iterations_print = 2 * iterations_print;
}
#endif
} while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
#pragma omp target exit data map(from \
: A [0:N * N]) map(release \
: xtmp)
return iter;
}
int main(int argc, char *argv[])
{
parse_arguments(argc, argv);
double *A = malloc(N * N * sizeof(double));
double *xtmp = malloc(N * N * sizeof(double));
printf(SEPARATOR);
printf("Matrix size: %dx%d\n", N, N);
printf("Maximum iterations: %d\n", MAX_ITERATIONS);
printf("Convergence threshold: %lf\n", CONVERGENCE_THRESHOLD);
printf(SEPARATOR);
for (int ii = 0; ii < N; ii++)
{
for (int jj = 0; jj < N; jj++)
{
double f;
fread(&f, sizeof(double), 1, data);
A[ii * N + jj] = f;
}
}
// Run Jacobi solver
start_timer();
int itr = run(A, xtmp);
stop_timer();
printf("Iterations = %d\n", itr);
printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
if (itr == MAX_ITERATIONS)
printf("WARNING: solution did not converge\n");
printf(SEPARATOR);
free(A);
free(xtmp);
fclose(data);
return 0;
}
int parse_int(const char *str)
{
char *next;
int value = strtoul(str, &next, 10);
return strlen(next) ? -1 : value;
}
double parse_double(const char *str)
{
char *next;
double value = strtod(str, &next);
return strlen(next) ? -1 : value;
}
void parse_arguments(int argc, char *argv[])
{
// Set default values
N = 500;
MAX_ITERATIONS = 2000;
CONVERGENCE_THRESHOLD = 0.001;
SEED = 0;
for (int i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
{
if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
{
printf("Invalid convergence threshold\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
{
if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
{
printf("Invalid number of iterations\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
{
if (++i >= argc || (N = parse_int(argv[i])) < 0)
{
printf("Invalid matrix order\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{
printf("\n");
printf("Usage: ./jacobi [OPTIONS]\n\n");
printf("Options:\n");
printf(" -h --help Print this message\n");
printf(" -c --convergence C Set convergence threshold\n");
printf(" -i --iterations I Set maximum number of iterations\n");
printf(" -n --norder N Set maxtrix order (500 or 1000)\n");
printf("\n");
exit(0);
}
else
{
printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
exit(1);
}
}
if (N == 1000)
data = fopen("data/jacobi-1000.bin", "rb");
else if (N == 500)
data = fopen("data/jacobi-500.bin", "rb");
else
{
printf("Invalid matrix order\n");
exit(1);
}
}


@ -0,0 +1,175 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "utils.h"
#ifndef N
#define N (1 << 11)
#endif
#pragma omp declare target
#define SM 64
static void reorder2(float *restrict a, float *restrict b, int n)
{
for (int i = 0; i < SM; i++)
for (int j = 0; j < SM; j++)
b[i * SM + j] = a[i * n + j];
}
static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
{
for (int i = 0; i < SM; i++)
{
for (int k = 0; k < SM; k++)
{
for (int j = 0; j < SM; j++)
{
c[i * n + j] += a[i * n + k] * b[k * SM + j];
}
}
}
}
void gemm_acc(float *restrict a, float *restrict b, float *restrict c, int n)
{
int bk = n / SM;
#pragma omp target data map(to \
: n, bk, a [0:n * n], b [0:n * n]) map(from \
: c[:n * n])
#pragma omp target teams num_teams(bk / NTHREADS_GPU) thread_limit(NTHREADS_GPU) map(to \
: n, bk, a [0:n * n], b [0:n * n]) map(from \
: c[:n * n])
#pragma omp distribute parallel for num_threads(NTHREADS_GPU) collapse(3) dist_schedule(static, NTHREADS_GPU)
for (int i = 0; i < bk; i++)
{
for (int j = 0; j < bk; j++)
{
for (int k = 0; k < bk; k++)
{
float b2[SM * SM];
reorder2(&b[SM * (k * n + j)], b2, n);
kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
}
}
}
}
#pragma omp end declare target
void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
{
int bk = n / SM;
float b2[SM * SM];
for (int i = 0; i < bk; i++)
{
for (int j = 0; j < bk; j++)
{
for (int k = 0; k < bk; k++)
{
reorder2(&b[SM * (k * n + j)], b2, n);
kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
}
}
}
}
void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
{
int i, j, k;
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < n; ++j)
{
float sum = 0.0;
for (int k = 0; k < n; ++k)
{
sum += a[i + k * n] * b[k + j * n];
}
c[i * n + j] += sum;
}
}
}
int main(int argc, char *argv[])
{
int i, n = N,
iret = 0;
float *a, *b, *c, *g;
struct timespec rt[2];
double wt; // walltime
if (argc > 1)
n = atoi(argv[1]);
/*
* 0. prepare the matrices a, b, c and the reference result g
*
* c := a * b + c (on host)
* c := a * b + c (on accel)
*/
if (NULL == (a = (float *)malloc(sizeof(*a) * n * n)))
{
printf("error: memory allocation for 'a'\n");
iret = -1;
}
if (NULL == (b = (float *)malloc(sizeof(*b) * n * n)))
{
printf("error: memory allocation for 'b'\n");
iret = -1;
}
if (NULL == (c = (float *)malloc(sizeof(*c) * n * n)))
{
printf("error: memory allocation for 'c'\n");
iret = -1;
}
if (NULL == (g = (float *)malloc(sizeof(*g) * n * n)))
{
printf("error: memory allocation for 'g'\n");
iret = -1;
}
if (0 != iret)
{
free(a);
free(b);
free(c);
free(g);
exit(EXIT_FAILURE);
}
if (n <= 1024)
{
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("gemm on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
}
if (n <= 4096)
{
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm_opt(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("gemm_opt on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
}
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm_acc(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("gemm_acc : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
if (n <= 4096)
for (i = 0; i < n; i++)
{
iret = *(int *)(g + i) ^ *(int *)(c + i);
assert(iret == 0);
}
return 0;
}


@ -0,0 +1,500 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#ifndef N
#define N (1 << 10)
#endif
#pragma omp declare target
#define SM 64
#define NTHRDS7 (1 << 0x7) /* 2^{7} */
#define NTHRDS8 (1 << 0x8) /* 2^{8} */
#define NTHRDS9 (1 << 0x9) /* 2^{9} */
#define LTEAMSD (1 << 0xD) /* 2^{13} */
#define LTEAMSE (1 << 0xE) /* 2^{14} */
#define LTEAMSF (1 << 0xF) /* 2^{15} */
#define LTEAMSG (1 << 020) /* 2^{16} */
#define BLKROW (512) /* 4x number of threads in each team */
#define BLKDIM (16)
void gemm_accel_opt2(float *restrict a, float *restrict b, float *restrict c, int n)
{
/*
* - jik-loop
* - 2^7 threads per team and 2^13 teams
* - collapse(3)
* - 4x j-loop unrolling (stride of 1 col )
* - 4x i-loop unrolling (stride of 2^7 rows)
* - 4x k-loop unrolling
* - rb: 4x data re-use
* - ra: 4x data re-use
* - register blocking
*/
#pragma omp target data \
map(to \
: n, a [0:n * n], b [0:n * n]) map(tofrom \
: c [0:n * n])
{
#pragma omp target teams num_teams(LTEAMSD) thread_limit(NTHRDS7) \
map(to \
: n, a [0:n * n], b [0:n * n]) map(tofrom \
: c [0:n * n]) default(none) shared(a, b, c, n)
#pragma omp distribute parallel for num_threads(NTHRDS7) \
dist_schedule(static, NTHRDS7) collapse(3) default(none) shared(a, b, c, n)
for (int j = 0; j < n; j += 4)
{ /* 4x unrolling */
for (int iblk = 0; iblk < n / BLKROW; ++iblk)
{
for (int i = 0; i < NTHRDS7; ++i)
{ /* 4x unrolling */
/* register for c: 4x j-loop * 4x i-loop */
float rc0, rc1, rc2, rc3,
rc4, rc5, rc6, rc7,
rc8, rc9, rca, rcb,
rcc, rcd, rce, rcf;
rc0 = c[j * n + iblk * BLKROW + i];
rc1 = c[j * n + iblk * BLKROW + i + NTHRDS7];
rc2 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 2];
rc3 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 3];
rc4 = c[(j + 1) * n + iblk * BLKROW + i];
rc5 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7];
rc6 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2];
rc7 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3];
rc8 = c[(j + 2) * n + iblk * BLKROW + i];
rc9 = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7];
rca = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2];
rcb = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3];
rcc = c[(j + 3) * n + iblk * BLKROW + i];
rcd = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7];
rce = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2];
rcf = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3];
for (int k = 0; k < n; k += 4)
{ /* 4x unrolling */
/* register for b: 4x j-loop * 4x k-loop */
float rb0, rb1, rb2, rb3,
rb4, rb5, rb6, rb7,
rb8, rb9, rba, rbb,
rbc, rbd, rbe, rbf;
rb0 = b[j * n + k];
rb1 = b[j * n + k + 1];
rb2 = b[j * n + k + 2];
rb3 = b[j * n + k + 3];
rb4 = b[(j + 1) * n + k];
rb5 = b[(j + 1) * n + k + 1];
rb6 = b[(j + 1) * n + k + 2];
rb7 = b[(j + 1) * n + k + 3];
rb8 = b[(j + 2) * n + k];
rb9 = b[(j + 2) * n + k + 1];
rba = b[(j + 2) * n + k + 2];
rbb = b[(j + 2) * n + k + 3];
rbc = b[(j + 3) * n + k];
rbd = b[(j + 3) * n + k + 1];
rbe = b[(j + 3) * n + k + 2];
rbf = b[(j + 3) * n + k + 3];
/* register for a: 4x i-loop * 4x k-loop */
float ra0, ra1, ra2, ra3,
ra4, ra5, ra6, ra7,
ra8, ra9, raa, rab,
rac, rad, rae, raf;
ra0 = a[k * n + iblk * BLKROW + i];
ra1 = a[k * n + iblk * BLKROW + i + NTHRDS7];
ra2 = a[k * n + iblk * BLKROW + i + NTHRDS7 * 2];
ra3 = a[k * n + iblk * BLKROW + i + NTHRDS7 * 3];
ra4 = a[(k + 1) * n + iblk * BLKROW + i];
ra5 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7];
ra6 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2];
ra7 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3];
ra8 = a[(k + 2) * n + iblk * BLKROW + i];
ra9 = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7];
raa = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2];
rab = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3];
rac = a[(k + 3) * n + iblk * BLKROW + i];
rad = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7];
rae = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2];
raf = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3];
/*
* register blocking
*/
// col 1 of c:
rc0 += ra0 * rb0;
rc0 += ra4 * rb1;
rc0 += ra8 * rb2;
rc0 += rac * rb3;
rc1 += ra1 * rb0;
rc1 += ra5 * rb1;
rc1 += ra9 * rb2;
rc1 += rad * rb3;
rc2 += ra2 * rb0;
rc2 += ra6 * rb1;
rc2 += raa * rb2;
rc2 += rae * rb3;
rc3 += ra3 * rb0;
rc3 += ra7 * rb1;
rc3 += rab * rb2;
rc3 += raf * rb3;
// col 2 of c:
rc4 += ra0 * rb4;
rc4 += ra4 * rb5;
rc4 += ra8 * rb6;
rc4 += rac * rb7;
rc5 += ra1 * rb4;
rc5 += ra5 * rb5;
rc5 += ra9 * rb6;
rc5 += rad * rb7;
rc6 += ra2 * rb4;
rc6 += ra6 * rb5;
rc6 += raa * rb6;
rc6 += rae * rb7;
rc7 += ra3 * rb4;
rc7 += ra7 * rb5;
rc7 += rab * rb6;
rc7 += raf * rb7;
// col 3 of c:
rc8 += ra0 * rb8;
rc8 += ra4 * rb9;
rc8 += ra8 * rba;
rc8 += rac * rbb;
rc9 += ra1 * rb8;
rc9 += ra5 * rb9;
rc9 += ra9 * rba;
rc9 += rad * rbb;
rca += ra2 * rb8;
rca += ra6 * rb9;
rca += raa * rba;
rca += rae * rbb;
rcb += ra3 * rb8;
rcb += ra7 * rb9;
rcb += rab * rba;
rcb += raf * rbb;
// col 4 of c:
rcc += ra0 * rbc;
rcc += ra4 * rbd;
rcc += ra8 * rbe;
rcc += rac * rbf;
rcd += ra1 * rbc;
rcd += ra5 * rbd;
rcd += ra9 * rbe;
rcd += rad * rbf;
rce += ra2 * rbc;
rce += ra6 * rbd;
rce += raa * rbe;
rce += rae * rbf;
rcf += ra3 * rbc;
rcf += ra7 * rbd;
rcf += rab * rbe;
rcf += raf * rbf;
}
c[j * n + iblk * BLKROW + i] = rc0;
c[j * n + iblk * BLKROW + i + NTHRDS7] = rc1;
c[j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2;
c[j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3;
c[(j + 1) * n + iblk * BLKROW + i] = rc4;
c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7] = rc5;
c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc6;
c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc7;
c[(j + 2) * n + iblk * BLKROW + i] = rc8;
c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7] = rc9;
c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rca;
c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcb;
c[(j + 3) * n + iblk * BLKROW + i] = rcc;
c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7] = rcd;
c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rce;
c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcf;
} /* end i-loop */
} /* end iblk-loop */
} /* end j-loop */
}
}
void gemm_cublas(float *restrict a, float *restrict b, float *restrict c, int n)
{
cublasHandle_t handle;
float alfa = 1.0f,
beta = 1.0f,
*a_dev = NULL,
*b_dev = NULL,
*c_dev = NULL;
/*
* cublasSgemm in CUBLAS
*/
if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle))
{
printf("error: initialization (CUBLAS)\n");
cublasDestroy(handle);
exit(EXIT_FAILURE);
}
if (cudaSuccess != cudaMalloc((void **)&a_dev, sizeof(*a) * n * n) ||
cudaSuccess != cudaMalloc((void **)&b_dev, sizeof(*b) * n * n) ||
cudaSuccess != cudaMalloc((void **)&c_dev, sizeof(*c) * n * n))
{
printf("error: memory allocation (CUDA)\n");
cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cublasDestroy(handle);
exit(EXIT_FAILURE);
}
if (CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*a), a, n, a_dev, n) ||
CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*b), b, n, b_dev, n) ||
CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*c), c, n, c_dev, n))
{
printf("error: host --> accl (CUBLAS)\n");
cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cublasDestroy(handle);
exit(EXIT_FAILURE);
}
if (CUBLAS_STATUS_SUCCESS != cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
n, n, n, &alfa, a_dev, n, b_dev, n, &beta, c_dev, n))
{
printf("error: cublasSgemm (CUBLAS)\n");
cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cublasDestroy(handle);
exit(EXIT_FAILURE);
}
if (cudaSuccess != cudaDeviceSynchronize())
{
printf("error: device synchronization (CUDA)\n");
cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cublasDestroy(handle);
exit(EXIT_FAILURE);
}
if (CUBLAS_STATUS_SUCCESS != cublasGetMatrix(n, n, sizeof(*c), c_dev, n, c, n))
{
printf("error: accl --> host (CUBLAS)\n");
cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cublasDestroy(handle);
exit(EXIT_FAILURE);
}
cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cublasDestroy(handle);
}
static void reorder2(float *restrict a, float *restrict b, int n)
{
for (int i = 0; i < SM; i++)
for (int j = 0; j < SM; j++)
b[i * SM + j] = a[i * n + j];
}
static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
{
for (int i = 0; i < SM; i++)
{
for (int k = 0; k < SM; k++)
{
for (int j = 0; j < SM; j++)
{
c[i * n + j] += a[i * n + k] * b[k * SM + j];
}
}
}
}
void gemm_accel_opt(float *restrict a, float *restrict b, float *restrict c, int n)
{
#pragma omp target teams distribute parallel for collapse(3) map(to \
: n, a [0:n * n], b [0:n * n]) map(from \
: c [0:n * n]) schedule(static, 1)
for (int i = 0; i < n / SM; i++)
{
for (int j = 0; j < n / SM; j++)
{
for (int k = 0; k < n / SM; k++)
{
float b2[SM * SM];
reorder2(&b[SM * (k * n + j)], b2, n);
kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
}
}
}
}
#pragma omp end declare target
void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
{
int bk = n / SM;
#pragma omp parallel
{
float b2[SM * SM];
#pragma omp for collapse(3)
for (int i = 0; i < bk; i++)
{
for (int j = 0; j < bk; j++)
{
for (int k = 0; k < bk; k++)
{
reorder2(&b[SM * (k * n + j)], b2, n);
kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
}
}
}
}
}
void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
{
int i, j, k;
#pragma omp parallel for simd collapse(2) schedule(simd \
: static)
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < n; ++j)
{
float sum = 0.0;
for (int k = 0; k < n; ++k)
{
sum += a[i + k * n] * b[k + j * n];
}
c[i * n + j] += sum;
}
}
}
int main(int argc, char *argv[])
{
int i, n = N,
iret = 0;
float *a, *b, *c, *g;
struct timespec rt[2];
double wt; // walltime
if (argc > 1)
n = atoi(argv[1]);
/*
* 0. prepare the matrices a, b, c and the reference result g
*
* c := a * b + c (on host)
* c := a * b + c (on accel)
*/
if (NULL == (a = (float *)malloc(sizeof(*a) * n * n)))
{
printf("error: memory allocation for 'a'\n");
iret = -1;
}
if (NULL == (b = (float *)malloc(sizeof(*b) * n * n)))
{
printf("error: memory allocation for 'b'\n");
iret = -1;
}
if (NULL == (c = (float *)malloc(sizeof(*c) * n * n)))
{
printf("error: memory allocation for 'c'\n");
iret = -1;
}
if (NULL == (g = (float *)malloc(sizeof(*g) * n * n)))
{
printf("error: memory allocation for 'g'\n");
iret = -1;
}
if (0 != iret)
{
free(a);
free(b);
free(c);
free(g);
exit(EXIT_FAILURE);
}
if (n <= 1024)
{
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("gemm on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
}
if (n <= 4096)
{
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm_opt(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("gemm_opt on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
}
#if 0
#pragma omp target teams distribute parallel for map(to \
: a [0:n * n], b [0:n * n]) map(from \
: c [0:n * n]) collapse(2)
for(int i = 0; i < n; ++i){
for(int j = 0; j < n; ++j){
float sum = 0.0;
for(int k = 0; k < n; ++k){
sum += a[i+k*n]*b[k+j*n];
}
c[i*n+j] += sum;
}
}
#endif
if (n <= 4096)
{
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm_accel_opt(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("GEMM-opt1 on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
for (i = 0; i < n; i++)
{
iret = *(int *)(g + i) ^ *(int *)(c + i);
assert(iret == 0);
}
}
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm_accel_opt2(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("GEMM-opt2 on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
if (n <= 4096)
for (i = 0; i < n; i++)
{
iret = *(int *)(g + i) ^ *(int *)(c + i);
assert(iret == 0);
}
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm_cublas(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("CUBLAS on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
if (n <= 4096)
for (i = 0; i < n; i++)
{
iret = *(int *)(g + i) ^ *(int *)(c + i);
assert(iret == 0);
}
free(a);
free(b);
free(c);
free(g);
return 0;
}

View file

@ -0,0 +1,122 @@
/**
* @file saxpy.c
*
* @brief saxpy performs the \c axpy computation in single-precision on both
* host and accelerator. The performance (in MFLOPS) on host and accelerator is
* compared and the numerical results are also verified for consistency.
*
* The \c axpy computation is defined as:
*
* y := a * x + y
*
* where:
*
* - a is a scalar.
* - x and y are vectors each with n elements.
*
* Please note that in this version only <em>one GPU thread</em> is used.
*
* Offload to GPU:
*
* gcc -fopenmp -foffload=nvptx-none saxpy.c
*
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "utils.h"
#define TWO02 (1 << 2)
#define TWO04 (1 << 4)
#define TWO08 (1 << 8)
#ifndef N
#define N (1 << 20)
#endif
int main(int argc, char *argv[])
{
int i, n = N,
iret = 0;
float a = 101.0f / TWO02,
b, c,
*x, *y, *z;
struct timespec rt[2];
double wt; // walltime
if (argc > 1)
n = atoi(argv[1]);
/*
* 0. prepare x, y, and z
*
* y := a * x + y (on host)
* z := a * x + z (on accel)
*/
if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
{
printf("error: memory allocation for 'x'\n");
iret = -1;
}
if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
{
printf("error: memory allocation for 'y'\n");
iret = -1;
}
if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
{
printf("error: memory allocation for 'z'\n");
iret = -1;
}
if (0 != iret)
{
free(x);
free(y);
free(z);
exit(EXIT_FAILURE);
}
b = rand() % TWO04;
c = rand() % TWO08;
for (i = 0; i < n; i++)
{
x[i] = b / (float)TWO02;
y[i] = z[i] = c / (float)TWO04;
}
/*
* 1. saxpy on host
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
for (i = 0; i < n; i++)
{
y[i] = a * x[i] + y[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 2. saxpy on accel
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
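    /* A bare "target" construct only offloads the region: no parallelism is
       created on the device, so the loop below runs on a single GPU thread. */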
#pragma omp target map(to: a, n, x[0:n]) map(tofrom: z[0:n])
for (int i = 0; i < n; i++)
{
z[i] = a * x[i] + z[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 3. verify numerical consistency
*/
for (i = 0; i < n; i++)
{
iret = *(int *)(y + i) ^ *(int *)(z + i);
assert(iret == 0);
}
return 0;
}

View file

@ -0,0 +1,122 @@
/**
* @file saxpy.c
*
* @brief saxpy performs the \c axpy computation in single-precision on both
* host and accelerator. The performance (in MFLOPS) on host and accelerator is
* compared and the numerical results are also verified for consistency.
*
* The \c axpy computation is defined as:
*
* y := a * x + y
*
* where:
*
* - a is a scalar.
* - x and y are vectors each with n elements.
*
 * Please note that in this version the loop is offloaded with <em>target parallel for</em>,
 * so the iterations are shared among the threads of a single GPU team.
*
* Offload to GPU:
*
* gcc -fopenmp -foffload=nvptx-none saxpy.c
*
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "utils.h"
#define TWO02 (1 << 2)
#define TWO04 (1 << 4)
#define TWO08 (1 << 8)
#ifndef N
#define N (1 << 20)
#endif
int main(int argc, char *argv[])
{
int i, n = N,
iret = 0;
float a = 101.0f / TWO02,
b, c,
*x, *y, *z;
struct timespec rt[2];
double wt; // walltime
if (argc > 1)
n = atoi(argv[1]);
/*
* 0. prepare x, y, and z
*
* y := a * x + y (on host)
* z := a * x + z (on accel)
*/
if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
{
printf("error: memory allocation for 'x'\n");
iret = -1;
}
if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
{
printf("error: memory allocation for 'y'\n");
iret = -1;
}
if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
{
printf("error: memory allocation for 'z'\n");
iret = -1;
}
if (0 != iret)
{
free(x);
free(y);
free(z);
exit(EXIT_FAILURE);
}
b = rand() % TWO04;
c = rand() % TWO08;
for (i = 0; i < n; i++)
{
x[i] = b / (float)TWO02;
y[i] = z[i] = c / (float)TWO04;
}
/*
* 1. saxpy on host
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
for (i = 0; i < n; i++)
{
y[i] = a * x[i] + y[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 2. saxpy on accel
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
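    /* "target parallel for" offloads the region and shares the loop iterations
       among the threads of a single device team. */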
#pragma omp target parallel for map(to: a, n, x[0:n]) map(tofrom: z[0:n])
for (int i = 0; i < n; i++)
{
z[i] = a * x[i] + z[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 3. verify numerical consistency
*/
for (i = 0; i < n; i++)
{
iret = *(int *)(y + i) ^ *(int *)(z + i);
assert(iret == 0);
}
return 0;
}

View file

@ -0,0 +1,129 @@
/**
* @file saxpy.c
*
* @brief saxpy performs the \c axpy computation in single-precision on both
* host and accelerator. The performance (in MFLOPS) on host and accelerator is
* compared and the numerical results are also verified for consistency.
*
* The \c axpy computation is defined as:
*
* y := a * x + y
*
* where:
*
* - a is a scalar.
* - x and y are vectors each with n elements.
*
 * Please note that in this version the loop is offloaded with <em>target teams</em> and
 * <em>distribute parallel for</em>, so the iterations are spread across multiple GPU teams,
 * each running up to NTHREADS_GPU threads.
*
* Offload to GPU:
*
* gcc -fopenmp -foffload=nvptx-none saxpy.c
*
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "utils.h"
#define TWO02 (1 << 2)
#define TWO04 (1 << 4)
#define TWO08 (1 << 8)
#ifndef N
#define N (1 << 27)
#endif
int main(int argc, char *argv[])
{
int i, n = N,
iret = 0;
float a = 101.0f / TWO02,
b, c,
*x, *y, *z;
struct timespec rt[2];
double wt; // walltime
if (argc > 1)
n = atoi(argv[1]);
/*
* 0. prepare x, y, and z
*
* y := a * x + y (on host)
* z := a * x + z (on accel)
*/
if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
{
printf("error: memory allocation for 'x'\n");
iret = -1;
}
if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
{
printf("error: memory allocation for 'y'\n");
iret = -1;
}
if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
{
printf("error: memory allocation for 'z'\n");
iret = -1;
}
if (0 != iret)
{
free(x);
free(y);
free(z);
exit(EXIT_FAILURE);
}
b = rand() % TWO04;
c = rand() % TWO08;
for (i = 0; i < n; i++)
{
x[i] = b / (float)TWO02;
y[i] = z[i] = c / (float)TWO04;
}
/*
* 1. saxpy on host
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
for (i = 0; i < n; i++)
{
y[i] = a * x[i] + y[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 2. saxpy on accel
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
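    /* The "target data" region maps x and z for the enclosed target region.
       "target teams" launches n / NTHREADS_GPU teams of up to NTHREADS_GPU
       threads, and "distribute parallel for" first splits the iterations across
       the teams and then among the threads of each team. */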
#pragma omp target data map(to: a, n, x[0:n]) map(tofrom: z[0:n])
#pragma omp target teams num_teams(n / NTHREADS_GPU) thread_limit(NTHREADS_GPU) \
    map(to: a, n, x[0:n]) map(tofrom: z[0:n])
#pragma omp distribute parallel for num_threads(NTHREADS_GPU) \
    dist_schedule(static, NTHREADS_GPU)
for (int i = 0; i < n; i++)
{
z[i] = a * x[i] + z[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 3. verify numerical consistency
*/
for (i = 0; i < n; i++)
{
iret = *(int *)(y + i) ^ *(int *)(z + i);
assert(iret == 0);
}
return 0;
}

View file

@ -0,0 +1,128 @@
/**
* @file saxpy.c
*
* @brief saxpy performs the \c axpy computation in single-precision on both
* host and accelerator. The performance (in MFLOPS) on host and accelerator is
* compared and the numerical results are also verified for consistency.
*
* The \c axpy computation is defined as:
*
* y := a * x + y
*
* where:
*
* - a is a scalar.
* - x and y are vectors each with n elements.
*
 * Please note that in this version the iteration space is split into blocks and each
 * block is offloaded as an asynchronous (<em>nowait</em>) target region.
*
* Offload to GPU:
*
* gcc -fopenmp -foffload=nvptx-none saxpy.c
*
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "utils.h"
#define TWO02 (1 << 2)
#define TWO04 (1 << 4)
#define TWO08 (1 << 8)
#ifndef N
#define N (1 << 27)
#endif
int main(int argc, char *argv[])
{
int i, n = N,
iret = 0;
float a = 101.0f / TWO02,
b, c,
*x, *y, *z;
struct timespec rt[2];
double wt; // walltime
if (argc > 1)
n = atoi(argv[1]);
/*
* 0. prepare x, y, and z
*
* y := a * x + y (on host)
* z := a * x + z (on accel)
*/
if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
{
printf("error: memory allocation for 'x'\n");
iret = -1;
}
if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
{
printf("error: memory allocation for 'y'\n");
iret = -1;
}
if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
{
printf("error: memory allocation for 'z'\n");
iret = -1;
}
if (0 != iret)
{
free(x);
free(y);
free(z);
exit(EXIT_FAILURE);
}
b = rand() % TWO04;
c = rand() % TWO08;
for (i = 0; i < n; i++)
{
x[i] = b / (float)TWO02;
y[i] = z[i] = c / (float)TWO04;
}
/*
* 1. saxpy on host
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
for (i = 0; i < n; i++)
{
y[i] = a * x[i] + y[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 2. saxpy on accel
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
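    /* The iteration space is split into 8 blocks; each block is offloaded as a
       deferred (nowait) target region, so data transfers and kernels of
       different blocks may overlap. The taskwait after the loop waits for all
       of the deferred target tasks to complete. */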
int BLOCK=n/8;
for (int i = 0; i < n; i+=BLOCK)
{
#pragma omp target teams distribute parallel for map(to: a, x [i:BLOCK]) map(tofrom: z [i:BLOCK]) nowait
for (int ii = 0; ii < BLOCK; ii++)
{
z[i+ii] = a * x[i+ii] + z[i+ii];
}
}
#pragma omp taskwait
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 3. verify numerical consistency
*/
for (i = 0; i < n; i++)
{
iret = *(int *)(y + i) ^ *(int *)(z + i);
assert(iret == 0);
}
return 0;
}

32
openmp/lab3/Makefile Normal file
View file

@ -0,0 +1,32 @@
ifndef EXERCISE
EXERCISE=exercise1.c
endif
CC=clang
LD=ld
OBJDUMP=objdump
OPT=-O3 -g
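# -fopenmp=libomp enables OpenMP on the host; -fopenmp-targets=nvptx64-nvidia-cuda enables offloading to NVIDIA GPUs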
OMP=-fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
CFLAGS=$(OPT) $(OMP) -I. $(EXT_CFLAGS)
LDFLAGS=-lm $(EXT_LDFLAGS)
SRCS=utils.c
OBJS=$(SRCS:.c=.o) $(EXERCISE:.c=.o)
EXE=$(EXERCISE:.c=.exe)
$(EXE): $(OBJS)
$(CC) $(CFLAGS) $(OBJS) -o $@ $(LDFLAGS)
all: $(EXE)
.PHONY: run profile clean
run: $(EXE)
./$(EXE)
profile: $(EXE)
sudo LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/ext/lib:${LD_LIBRARY_PATH} LIBRARY_PATH=/usr/ext/lib:${LIBRARY_PATH} nvprof ./$(EXE)
clean:
rm -f $(OBJS) *.o *.exe *.out *~

Binary file not shown.

Binary file not shown.

279
openmp/lab3/jacobi.c Normal file
View file

@ -0,0 +1,279 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file jacobi.c
* @author Alessandro Capotondi
* @date 27 Mar 2020
* @brief This code solves the steady state heat equation on a rectangular region.
* This code solves the steady state heat equation on a rectangular region.
* The sequential version of this program needs approximately
* 18/epsilon iterations to complete.
* The physical region, and the boundary conditions, are suggested
* by this diagram;
* W = 0
* +------------------+
* | |
* W = 100 | | W = 100
* | |
* +------------------+
* W = 100
* The region is covered with a grid of M by N nodes, and an N by N
* array W is used to record the temperature. The correspondence between
* array indices and locations in the region is suggested by giving the
* indices of the four corners:
* I = 0
* [0][0]-------------[0][N-1]
* | |
* J = 0 | | J = N-1
* | |
* [M-1][0]-----------[M-1][N-1]
* I = M-1
* The steady state solution to the discrete heat equation satisfies the
* following condition at an interior grid point:
* W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* where "Central" is the index of the grid point, "North" is the index
* of its immediate neighbor to the "north", and so on.
*
* Given an approximate solution of the steady state heat equation, a
* "better" solution is given by replacing each interior point by the
* average of its 4 neighbors - in other words, by using the condition
* as an ASSIGNMENT statement:
* W[Central] <= (1/4) * ( W[North] + W[South] + W[East] + W[West] )
* If this process is repeated often enough, the difference between successive
* estimates of the solution will go to zero.
* This program carries out such an iteration, using a tolerance specified by
* the user, and writes the final estimate of the solution to a file that can
* be used for graphic processing.
 * Licensing:
 * This code is distributed under the GNU LGPL license.
 * Modified:
 * 18 October 2011
 * Author:
 * Original C version by Michael Quinn.
 * This C version by John Burkardt.
 * Reference:
 * Michael Quinn,
 * Parallel Programming in C with MPI and OpenMP,
 * McGraw-Hill, 2004,
 * ISBN13: 978-0071232654,
 * LC: QA76.73.C15.Q55.
 * Local parameters:
* Local, double DIFF, the norm of the change in the solution from one iteration
* to the next.
* Local, double MEAN, the average of the boundary values, used to initialize
* the values of the solution in the interior.
* Local, double U[M][N], the solution at the previous iteration.
* Local, double W[M][N], the solution computed at the latest iteration.
*
*
* @see https://en.wikipedia.org/wiki/Jacobi_method
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include "utils.h"
static int N;
static int MAX_ITERATIONS;
static int SEED;
static double CONVERGENCE_THRESHOLD;
static FILE *data;
#define SEPARATOR "------------------------------------\n"
// Return the current time in seconds since the Epoch
double get_timestamp();
// Parse command line arguments to set solver parameters
void parse_arguments(int argc, char *argv[]);
// Run the Jacobi solver
// Returns the number of iterations performed
int run(double *A, double *xtmp)
{
int iter = 0, iterations_print = 1;
double err = 0.0;
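    /* Each iteration performs one Jacobi sweep: every interior point of xtmp is
       recomputed as the average of its four neighbours in A, the largest change
       is tracked with a max reduction into err, and the updated grid is copied
       back into A. The loop stops on convergence or after MAX_ITERATIONS. */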
do
{
err = 0.0;
#pragma omp parallel for reduction(max : err) num_threads(NTHREADS)
for (int i = 1; i < N - 1; i++)
{
for (int j = 1; j < N - 1; j++)
{
xtmp[i * N + j] = 0.25 * (A[(i - 1) * N + j] + A[(i + 1) * N + j] + A[i * N + j - 1] + A[i * N + j + 1]);
err = fmax(err, fabs(xtmp[i * N + j] - A[i * N + j]));
}
}
#pragma omp parallel for num_threads(NTHREADS)
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
A[i * N + j] = xtmp[i * N + j];
}
}
iter++;
#ifdef DEBUG
if (iter == iterations_print)
{
printf(" %8d %f\n", iter, err);
iterations_print = 2 * iterations_print;
}
#endif
} while (err > CONVERGENCE_THRESHOLD && iter < MAX_ITERATIONS);
return iter;
}
int main(int argc, char *argv[])
{
parse_arguments(argc, argv);
double *A = malloc(N * N * sizeof(double));
double *xtmp = malloc(N * N * sizeof(double));
printf(SEPARATOR);
printf("Matrix size: %dx%d\n", N, N);
printf("Maximum iterations: %d\n", MAX_ITERATIONS);
printf("Convergence threshold: %lf\n", CONVERGENCE_THRESHOLD);
printf(SEPARATOR);
for (int ii = 0; ii < N; ii++)
{
for (int jj = 0; jj < N; jj++)
{
double f;
fread(&f, sizeof(double), 1, data);
A[ii * N + jj] = f;
}
}
// Run Jacobi solver
start_timer();
int itr = run(A, xtmp);
stop_timer();
printf("Iterations = %d\n", itr);
printf("Solver runtime = %lf ms\n", elapsed_ns() / 1E6);
if (itr == MAX_ITERATIONS)
printf("WARNING: solution did not converge\n");
printf(SEPARATOR);
free(A);
free(xtmp);
fclose(data);
return 0;
}
int parse_int(const char *str)
{
char *next;
int value = strtoul(str, &next, 10);
return strlen(next) ? -1 : value;
}
double parse_double(const char *str)
{
char *next;
double value = strtod(str, &next);
return strlen(next) ? -1 : value;
}
void parse_arguments(int argc, char *argv[])
{
// Set default values
N = 500;
MAX_ITERATIONS = 2000;
CONVERGENCE_THRESHOLD = 0.001;
SEED = 0;
for (int i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "--convergence") || !strcmp(argv[i], "-c"))
{
if (++i >= argc || (CONVERGENCE_THRESHOLD = parse_double(argv[i])) < 0)
{
printf("Invalid convergence threshold\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i"))
{
if (++i >= argc || (MAX_ITERATIONS = parse_int(argv[i])) < 0)
{
printf("Invalid number of iterations\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--norder") || !strcmp(argv[i], "-n"))
{
if (++i >= argc || (N = parse_int(argv[i])) < 0)
{
printf("Invalid matrix order\n");
exit(1);
}
}
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{
printf("\n");
printf("Usage: ./jacobi [OPTIONS]\n\n");
printf("Options:\n");
printf(" -h --help Print this message\n");
printf(" -c --convergence C Set convergence threshold\n");
printf(" -i --iterations I Set maximum number of iterations\n");
printf(" -n --norder N Set maxtrix order (500 or 1000)\n");
printf("\n");
exit(0);
}
else
{
printf("Unrecognized argument '%s' (try '--help')\n", argv[i]);
exit(1);
}
}
if (N == 1000)
data = fopen("data/jacobi-1000.bin", "rb");
else if (N == 500)
data = fopen("data/jacobi-500.bin", "rb");
else
{
printf("Invalid matrix order\n");
exit(1);
}
}

174
openmp/lab3/matmul.c Normal file
View file

@ -0,0 +1,174 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#ifndef N
#define N (1 << 10)
#endif
#pragma omp declare target
#define SM 64
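/* Everything up to "end declare target" is also compiled for the device.
   SM is the tile size: reorder2() repacks an SM x SM tile of b into contiguous
   storage so that kernel() streams over contiguous memory. */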
static void reorder2(float *restrict a, float *restrict b, int n)
{
for (int i = 0; i < SM; i++)
for (int j = 0; j < SM; j++)
b[i * SM + j] = a[i * n + j];
}
static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
{
for (int i = 0; i < SM; i++)
{
for (int k = 0; k < SM; k++)
{
for (int j = 0; j < SM; j++)
{
c[i * n + j] += a[i * n + k] * b[k * SM + j];
}
}
}
}
void gemm_accel(float *restrict a, float *restrict b, float *restrict c, int n)
{
int bk = n / SM;
float b2[SM * SM];
for (int i = 0; i < bk; i++)
{
for (int j = 0; j < bk; j++)
{
for (int k = 0; k < bk; k++)
{
reorder2(&b[SM * (k * n + j)], b2, n);
kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
}
}
}
}
#pragma omp end declare target
void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
{
int bk = n / SM;
{
float b2[SM * SM];
for (int i = 0; i < bk; i++)
{
for (int j = 0; j < bk; j++)
{
for (int k = 0; k < bk; k++)
{
reorder2(&b[SM * (k * n + j)], b2, n);
kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
}
}
}
}
}
void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
{
int i, j, k;
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < n; ++j)
{
float sum = 0.0;
for (int k = 0; k < n; ++k)
{
sum += a[i + k * n] * b[k + j * n];
}
c[i * n + j] += sum;
}
}
}
int main(int argc, char *argv[])
{
int i, n = N,
iret = 0;
float *a, *b, *c, *g;
struct timespec rt[2];
double wt; // walltime
if (argc > 1)
n = atoi(argv[1]);
    /*
     * 0. prepare a, b, c, and g
     *
     * c := c + a * b (on host and accel)
     * g := reference result used for verification
     */
if (NULL == (a = (float *)malloc(sizeof(*a) * n * n)))
{
printf("error: memory allocation for 'x'\n");
iret = -1;
}
if (NULL == (b = (float *)malloc(sizeof(*b) * n * n)))
{
printf("error: memory allocation for 'y'\n");
iret = -1;
}
if (NULL == (c = (float *)malloc(sizeof(*c) * n * n)))
{
printf("error: memory allocation for 'z'\n");
iret = -1;
}
if (NULL == (g = (float *)malloc(sizeof(*g) * n * n)))
{
printf("error: memory allocation for 'z'\n");
iret = -1;
}
if (0 != iret)
{
free(a);
free(b);
free(c);
free(g);
exit(EXIT_FAILURE);
}
if (n <= 1024)
{
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("gemm on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
}
if (n <= 4096)
{
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm_opt(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("gemm_opt on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
}
clock_gettime(CLOCK_REALTIME, rt + 0);
gemm_accel(a, b, c, n);
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("GEMM-opt1 on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
for (i = 0; i < n; i++)
{
iret = *(int *)(g + i) ^ *(int *)(c + i);
assert(iret == 0);
}
free(a);
free(b);
free(c);
free(g);
return 0;
}

120
openmp/lab3/saxpy.c Normal file
View file

@ -0,0 +1,120 @@
/**
* @file saxpy.c
*
* @brief saxpy performs the \c axpy computation in single-precision on both
* host and accelerator. The performance (in MFLOPS) on host and accelerator is
* compared and the numerical results are also verified for consistency.
*
* The \c axpy computation is defined as:
*
* y := a * x + y
*
* where:
*
* - a is a scalar.
* - x and y are vectors each with n elements.
*
* Please note that in this version only <em>one GPU thread</em> is used.
*
* Offload to GPU:
*
* gcc -fopenmp -foffload=nvptx-none saxpy.c
*
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "utils.h"
#define TWO02 (1 << 2)
#define TWO04 (1 << 4)
#define TWO08 (1 << 8)
#ifndef N
#define N (1 << 26)
#endif
int main(int argc, char *argv[])
{
int i, n = N,
iret = 0;
float a = 101.0f / TWO02,
b, c,
*x, *y, *z;
struct timespec rt[2];
double wt; // walltime
if (argc > 1)
n = atoi(argv[1]);
/*
* 0. prepare x, y, and z
*
* y := a * x + y (on host)
* z := a * x + z (on accel)
*/
if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
{
printf("error: memory allocation for 'x'\n");
iret = -1;
}
if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
{
printf("error: memory allocation for 'y'\n");
iret = -1;
}
if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
{
printf("error: memory allocation for 'z'\n");
iret = -1;
}
if (0 != iret)
{
free(x);
free(y);
free(z);
exit(EXIT_FAILURE);
}
b = rand() % TWO04;
c = rand() % TWO08;
for (i = 0; i < n; i++)
{
x[i] = b / (float)TWO02;
y[i] = z[i] = c / (float)TWO04;
}
/*
* 1. saxpy on host
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
for (i = 0; i < n; i++)
{
y[i] = a * x[i] + y[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 2. saxpy on accel
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
for (i = 0; i < n; i++)
{
z[i] = a * x[i] + z[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 3. verify numerical consistency
*/
for (i = 0; i < n; i++)
{
iret = *(int *)(y + i) ^ *(int *)(z + i);
assert(iret == 0);
}
return 0;
}

2
openmp/lab3/setup_clang.sh Executable file
View file

@ -0,0 +1,2 @@
#!/bin/bash
module load clang/11.0.0 cuda/10.0

150
openmp/lab3/utils.c Normal file
View file

@ -0,0 +1,150 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file utils.c
* @author Alessandro Capotondi
* @date 27 Mar 2020
 * @brief File containing utility functions for HPC Unimore Class
*
* Utilities for OpenMP lab.
*
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#define _POSIX_C_SOURCE 199309L
#include <time.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <assert.h>
#include "utils.h"
#define MAX_ITERATIONS 100
static struct timespec timestampA, timestampB;
static unsigned long long statistics[MAX_ITERATIONS];
static int iterations = 0;
static unsigned long long __diff_ns(struct timespec start, struct timespec end)
{
struct timespec temp;
if ((end.tv_nsec - start.tv_nsec) < 0)
{
temp.tv_sec = end.tv_sec - start.tv_sec - 1;
temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
}
else
{
temp.tv_sec = end.tv_sec - start.tv_sec;
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}
return temp.tv_nsec + temp.tv_sec * 1000000000ULL;
}
void start_timer()
{
asm volatile("" ::
: "memory");
clock_gettime(CLOCK_MONOTONIC_RAW, &timestampA);
asm volatile("" ::
: "memory");
}
void stop_timer()
{
unsigned long long elapsed = 0ULL;
asm volatile("" ::
: "memory");
clock_gettime(CLOCK_MONOTONIC_RAW, &timestampB);
asm volatile("" ::
: "memory");
}
unsigned long long elapsed_ns()
{
return __diff_ns(timestampA, timestampB);
}
void start_stats()
{
start_timer();
}
void collect_stats()
{
assert(iterations < MAX_ITERATIONS);
stop_timer();
statistics[iterations++] = elapsed_ns();
}
void print_stats()
{
unsigned long long min = ULLONG_MAX;
unsigned long long max = 0LL;
double average = 0.0;
double std_deviation = 0.0;
double sum = 0.0;
/* Compute the sum of all elements */
for (int i = 0; i < iterations; i++)
{
if (statistics[i] > max)
max = statistics[i];
if (statistics[i] < min)
min = statistics[i];
sum = sum + statistics[i] / 1E6;
}
    average = sum / (double)iterations;
    /* Compute variance and standard deviation (reset sum before accumulating the squared deviations) */
    sum = 0.0;
    for (int i = 0; i < iterations; i++)
    {
        sum = sum + pow((statistics[i] / 1E6 - average), 2);
    }
    std_deviation = sqrt(sum / (double)iterations);
printf("AvgTime\tMinTime\tMaxTime\tStdDev\n");
printf("%.4f ms\t%.4f ms\t%.4f ms\t%.4f\n", (double)average, (double)min / 1E6, (double)max / 1E6, (double)std_deviation);
}
#if defined(__GNUC__)
#pragma GCC push_options
#pragma GCC optimize("O0")
void work(unsigned long num)
#else
void work __attribute__((optnone)) (unsigned long num)
#endif
{
volatile int cnt = 0;
for (int i = 0; i < num; i++)
cnt += i;
}
#if defined(__GNUC__)
#pragma GCC pop_options
#endif

162
openmp/lab3/utils.h Normal file
View file

@ -0,0 +1,162 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file utils.h
* @author Alessandro Capotondi
* @date 27 Mar 2020
 * @brief File containing utility functions for HPC Unimore Class
*
* The header define time functions and dummy workload used on the example tests.
*
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#ifndef __UTILS_H__
#define __UTILS_H__
#include <stdarg.h>
#if defined(VERBOSE)
#define DEBUG_PRINT(x, ...) printf((x), ##__VA_ARGS__)
#else
#define DEBUG_PRINT(x, ...)
#endif
#if !defined(NTHREADS)
#define NTHREADS (4)
#endif
#if !defined(NTHREADS_GPU)
#define NTHREADS_GPU (1024)
#endif
/**
 * @brief The function sets the starting timestamp (timestampA)
*
* The function is used to measure elapsed time between two execution points.
* The function start_timer() sets the starting point timestamp, while the function
* stop_timer() sets the termination timestamp. The elapsed time, expressed in nanoseconds,
* between the two points can be retrieved using the function elapsed_ns().
*
* Example usage:
* @code
* start_timer(); // Point A
* //SOME CODE HERE
* stop_timer(); // Point B
* printf("Elapsed time = %llu ns\n", elapsed_ns())); //Elapsed time between A and B
* //SOME OTHER CODE HERE
* stop_timer(); // Point C
* printf("Elapsed time = %llu ns\n", elapsed_ns())); //Elapsed time between A and C
* @endcode
*
* @return void
* @see start_timer()
* @see stop_timer()
* @see elapsed_ns()
*/
void start_timer();
/**
 * @brief The function sets the termination timestamp
 *
 * The function is used to measure elapsed time between two execution points.
 * The function start_timer() sets the starting point timestamp, while the function
 * stop_timer() sets the termination timestamp. The elapsed time, expressed in
 * nanoseconds, between the last call of start_timer() and the current execution
 * point can then be retrieved using elapsed_ns().
*
* Example usage:
* @code
* start_timer(); // Point A
* //SOME CODE HERE
* stop_timer(); // Point B
* printf("Elapsed time = %llu ns\n", elapsed_ns())); //Elapsed time between A and B
* //SOME OTHER CODE HERE
* stop_timer(); // Point C
* printf("Elapsed time = %llu ns\n", elapsed_ns())); //Elapsed time between A and C
* @endcode
*
* @return void
* @see start_timer()
* @see stop_timer()
* @see elapsed_ns()
*/
void stop_timer();
/**
 * @brief Elapsed nanoseconds between start_timer() and stop_timer().
 *
 * @return Elapsed nanoseconds
* @see start_timer()
* @see stop_timer()
*/
unsigned long long elapsed_ns();
/**
 * @brief The function initializes the starting point of the statistics measurement.
*
* The function is similar to start_timer().
*
* @return void
* @see start_timer
*/
void start_stats();
/**
 * @brief The function collects the elapsed time between the current execution point and the
* last call of start_stats().
*
* @return void
*/
void collect_stats();
/**
 * @brief The function displays the collected statistics.
* @return void
*/
void print_stats();
/**
* @brief The dummy work function
*
 * The function is used to emulate some useful workload.
 *
 * @param num work duration in terms of loop iterations.
* @return void
*/
#if defined(__GNUC__)
#pragma GCC push_options
#pragma GCC optimize("O0")
void work(unsigned long num);
#else
void work __attribute__((optnone)) (unsigned long num);
#endif
#if defined(__GNUC__)
#pragma GCC pop_options
#endif
#endif /*__UTILS_H__*/