1
Fork 0
mirror of https://github.com/Steffo99/unimore-hpc-assignments.git synced 2024-11-25 17:44:23 +00:00
hpc-2022-g3/openmp/lab3/.solutions/saxpy-omp4.c

129 lines
2.7 KiB
C
Raw Normal View History

2021-04-21 08:16:41 +00:00
/**
* @file saxpy.c
*
* @brief saxpy performs the \c axpy computation in single-precision on both
* host and accelerator. The performance (in MFLOPS) on host and accelerator is
* compared and the numerical results are also verified for consistency.
*
* The \c axpy computation is defined as:
*
* y := a * x + y
*
* where:
*
* - a is a scalar.
* - x and y are vectors each with n elements.
*
* Please note that in this version only <em>one GPU thread</em> is used.
*
* Offload to GPU:
*
* gcc -fopenmp -foffload=nvptx-none saxpy.c
*
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "utils.h"
#define TWO02 (1 << 2)
#define TWO04 (1 << 4)
#define TWO08 (1 << 8)
#ifndef N
#define N (1 << 27)
#endif
int main(int argc, char *argv[])
{
int i, n = N,
iret = 0;
float a = 101.0f / TWO02,
b, c,
*x, *y, *z;
struct timespec rt[2];
double wt; // walltime
if (argc > 1)
n = atoi(argv[1]);
/*
* 0. prepare x, y, and z
*
* y := a * x + y (on host)
* z := a * x + z (on accel)
*/
if (NULL == (x = (float *)malloc(sizeof(*x) * n)))
{
printf("error: memory allocation for 'x'\n");
iret = -1;
}
if (NULL == (y = (float *)malloc(sizeof(*y) * n)))
{
printf("error: memory allocation for 'y'\n");
iret = -1;
}
if (NULL == (z = (float *)malloc(sizeof(*z) * n)))
{
printf("error: memory allocation for 'z'\n");
iret = -1;
}
if (0 != iret)
{
free(x);
free(y);
free(z);
exit(EXIT_FAILURE);
}
b = rand() % TWO04;
c = rand() % TWO08;
for (i = 0; i < n; i++)
{
x[i] = b / (float)TWO02;
y[i] = z[i] = c / (float)TWO04;
}
/*
* 1. saxpy on host
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
for (i = 0; i < n; i++)
{
y[i] = a * x[i] + y[i];
}
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 2. saxpy on accel
*/
clock_gettime(CLOCK_REALTIME, rt + 0);
int BLOCK=n/8;
for (int i = 0; i < n; i+=BLOCK)
{
#pragma omp target teams distribute parallel for map(to: a, x [i:BLOCK]) map(tofrom: z [i:BLOCK]) nowait
for (int ii = 0; ii < BLOCK; ii++)
{
z[i+ii] = a * x[i+ii] + z[i+ii];
}
}
#pragma omp taskwait
clock_gettime(CLOCK_REALTIME, rt + 1);
wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
/*
* 3. verify numerical consistency
*/
for (i = 0; i < n; i++)
{
iret = *(int *)(y + i) ^ *(int *)(z + i);
assert(iret == 0);
}
return 0;
}