1
Fork 0
mirror of https://github.com/Steffo99/unimore-hpc-assignments.git synced 2024-11-22 08:04:25 +00:00

Gemm for profiling

This commit is contained in:
Alessandro Capotondi 2022-10-24 00:13:55 +02:00
parent 9a9a8ba614
commit b2b6809b64
6 changed files with 648 additions and 0 deletions

28
profile/gemmv1/Makefile Normal file
View file

@ -0,0 +1,28 @@
# Build rules for the gemmv1 profiling exercise.
# EXERCISE selects the source file to build; override it on the command line
# (e.g. `make EXERCISE=other.c`) to build a different file with the same rules.
ifndef EXERCISE
EXERCISE=gemmv1.c
endif
# Toolchain. LD and OBJDUMP are defined here but no rule below references them.
CC=gcc
LD=ld
OBJDUMP=objdump
# -pg asks gcc to instrument the program for gprof.
OPT=-O2 -pg
# OMP, EXT_CFLAGS and EXT_LDFLAGS are not set in this file: they expand to
# nothing unless supplied by the caller (command line or environment).
CFLAGS=$(OPT) $(OMP) -I. -I../ $(EXT_CFLAGS)
LDFLAGS=-lm $(EXT_LDFLAGS)
# The shared helpers live one directory up and are compiled alongside the exercise.
SRCS=../utils.c
OBJS=$(SRCS:.c=.o) $(EXERCISE:.c=.o)
EXE=$(EXERCISE:.c=.exe)
# First target in the file, so a plain `make` links the executable.
$(EXE): $(OBJS)
$(CC) $(CFLAGS) $(OBJS) -o $@ $(LDFLAGS)
all: $(EXE)
.PHONY: run clean
# Build if needed, then run; EXT_ARGS is passed through as program arguments.
run: $(EXE)
./$(EXE) $(EXT_ARGS)
# Remove objects, executables, *.out files and editor backups.
clean:
rm -f $(OBJS) *.o *.exe *.out *~

140
profile/gemmv1/gemmv1.c Normal file
View file

@ -0,0 +1,140 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "utils.h"
#ifndef N
#define N (1 << 10)
#endif
#define SM 64
/**
 * Copy one SM x SM tile out of a larger matrix into a dense buffer.
 *
 * @param a  Top-left element of the tile inside a matrix whose rows are n floats apart.
 * @param b  Destination buffer of SM * SM floats, rows stored back to back.
 * @param n  Row stride (in elements) of the source matrix.
 */
static void reorder2(float *restrict a, float *restrict b, int n)
{
    for (int row = 0; row < SM; row++)
    {
        const float *src = a + row * n;
        float *dst = b + row * SM;

        for (int col = 0; col < SM; col++)
        {
            dst[col] = src[col];
        }
    }
}
/**
 * Multiply-accumulate one SM x SM tile: C_tile += A_tile * B_tile.
 *
 * @param a  Top-left element of the A tile; rows are n floats apart.
 * @param b  Dense SM x SM B tile; rows are SM floats apart.
 * @param c  Top-left element of the C tile; rows are n floats apart.
 * @param n  Row stride (in elements) of a and c.
 */
static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
{
    for (int row = 0; row < SM; row++)
    {
        const float *a_row = a + row * n;
        float *c_row = c + row * n;

        for (int inner = 0; inner < SM; inner++)
        {
            const float *b_row = b + inner * SM;

            /* Innermost loop walks both b_row and c_row contiguously. */
            for (int col = 0; col < SM; col++)
            {
                c_row[col] += a_row[inner] * b_row[col];
            }
        }
    }
}
/**
 * Tiled matrix multiply-accumulate on row-major n x n matrices: c += a * b.
 *
 * The matrices are walked in SM x SM tiles. Each B tile is first copied into a
 * dense local buffer and then fed to kernel(). Only n / SM whole tiles per
 * dimension are processed; any remainder rows/columns are left untouched.
 *
 * @param a  Left operand, n * n floats.
 * @param b  Right operand, n * n floats.
 * @param c  Accumulator/output, n * n floats.
 * @param n  Matrix dimension.
 */
void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
{
    const int tiles = n / SM;
    float packed[SM * SM];

    for (int ti = 0; ti < tiles; ti++)
    {
        for (int tj = 0; tj < tiles; tj++)
        {
            float *c_tile = &c[SM * (ti * n + tj)];

            for (int tk = 0; tk < tiles; tk++)
            {
                float *b_tile = &b[SM * (tk * n + tj)];
                float *a_tile = &a[SM * (ti * n + tk)];

                reorder2(b_tile, packed, n);
                kernel(a_tile, packed, c_tile, n);
            }
        }
    }
}
/**
 * Naive triple-loop matrix multiply-accumulate on n x n matrices.
 *
 * For every (i, j) it adds sum_k a[i + k*n] * b[k + j*n] to c[i*n + j].
 * Read as row-major storage this is c += transpose(a) * transpose(b).
 * In the inner loop a is read with a stride of n elements while b is read
 * contiguously.
 *
 * @param a  First operand, n * n floats.
 * @param b  Second operand, n * n floats.
 * @param c  Accumulator/output, n * n floats; existing contents are added to.
 * @param n  Matrix dimension.
 */
void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
{
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            float sum = 0.0f;
            for (int k = 0; k < n; ++k)
            {
                sum += a[i + k * n] * b[k + j * n];
            }
            c[i * n + j] += sum;
        }
    }
}
/**
 * Time the naive gemm() kernel on n x n single-precision matrices.
 *
 * Usage: <exe> [n]   -- n defaults to the compile-time constant N.
 *
 * Both inputs are filled with 1.0f and the output starts at zero, so every
 * entry of the result must equal n exactly; that value is stored in g and
 * used to verify the whole output after the timed run.
 *
 * @return EXIT_SUCCESS on success, EXIT_FAILURE on a bad argument, an
 *         allocation failure or a result mismatch.
 */
int main(int argc, char *argv[])
{
    long requested = N;
    int n, iret = 0;
    float *a, *b, *c, *g;
    struct timespec rt[2];
    double wt; // walltime

    if (argc > 1)
    {
        char *end = NULL;
        requested = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0')
        {
            fprintf(stderr, "error: matrix size '%s' is not an integer\n", argv[1]);
            return EXIT_FAILURE;
        }
    }
    /* 46340 is the largest n for which n * n still fits in a 32-bit int,
     * the type the kernels use for their index arithmetic. */
    if (requested <= 0 || requested > 46340)
    {
        fprintf(stderr, "error: matrix size must be in [1, 46340], got %ld\n", requested);
        return EXIT_FAILURE;
    }
    n = (int)requested;

    const size_t count = (size_t)n * (size_t)n;

    /*
     * 0. prepare a, b, c and g
     *
     *   c := c + gemm(a, b)   (computed)
     *   g := expected result  (reference)
     *
     * calloc zero-fills the buffers and checks count * size for overflow.
     */
    if (NULL == (a = calloc(count, sizeof(*a))))
    {
        fprintf(stderr, "error: memory allocation for 'a'\n");
        iret = -1;
    }
    if (NULL == (b = calloc(count, sizeof(*b))))
    {
        fprintf(stderr, "error: memory allocation for 'b'\n");
        iret = -1;
    }
    if (NULL == (c = calloc(count, sizeof(*c))))
    {
        fprintf(stderr, "error: memory allocation for 'c'\n");
        iret = -1;
    }
    if (NULL == (g = calloc(count, sizeof(*g))))
    {
        fprintf(stderr, "error: memory allocation for 'g'\n");
        iret = -1;
    }
    if (0 != iret)
    {
        free(a);
        free(b);
        free(c);
        free(g);
        exit(EXIT_FAILURE);
    }

    for (size_t idx = 0; idx < count; idx++)
    {
        a[idx] = 1.0f;
        b[idx] = 1.0f;
        g[idx] = (float)n; /* sum of n products 1 * 1 */
    }

    /* The naive kernel is only run for sizes up to 1024. */
    if (n <= 1024)
    {
        /* NOTE(review): a monotonic clock would be more robust for interval timing. */
        clock_gettime(CLOCK_REALTIME, rt + 0);
        gemm(a, b, c, n);
        clock_gettime(CLOCK_REALTIME, rt + 1);
        wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
        printf("gemm on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));

        /* Verify every element; the expected values are small integers, so
         * exact floating-point comparison is appropriate here. */
        for (size_t idx = 0; idx < count; idx++)
        {
            if (c[idx] != g[idx])
            {
                fprintf(stderr, "error: result mismatch at element %zu (got %f, expected %f)\n",
                        idx, c[idx], g[idx]);
                iret = -1;
                break;
            }
        }
    }

    free(a);
    free(b);
    free(c);
    free(g);
    return (0 == iret) ? EXIT_SUCCESS : EXIT_FAILURE;
}

28
profile/gemmv2/Makefile Normal file
View file

@ -0,0 +1,28 @@
# Build rules for the gemmv2 profiling exercise.
# EXERCISE selects the source file to build; override it on the command line
# (e.g. `make EXERCISE=other.c`) to build a different file with the same rules.
ifndef EXERCISE
EXERCISE=gemmv2.c
endif
# Toolchain. LD and OBJDUMP are defined here but no rule below references them.
CC=gcc
LD=ld
OBJDUMP=objdump
# -pg asks gcc to instrument the program for gprof.
OPT=-O2 -pg
# OMP, EXT_CFLAGS and EXT_LDFLAGS are not set in this file: they expand to
# nothing unless supplied by the caller (command line or environment).
CFLAGS=$(OPT) $(OMP) -I. -I../ $(EXT_CFLAGS)
LDFLAGS=-lm $(EXT_LDFLAGS)
# The shared helpers live one directory up and are compiled alongside the exercise.
SRCS=../utils.c
OBJS=$(SRCS:.c=.o) $(EXERCISE:.c=.o)
EXE=$(EXERCISE:.c=.exe)
# First target in the file, so a plain `make` links the executable.
$(EXE): $(OBJS)
$(CC) $(CFLAGS) $(OBJS) -o $@ $(LDFLAGS)
all: $(EXE)
.PHONY: run clean
# Build if needed, then run; EXT_ARGS is passed through as program arguments.
run: $(EXE)
./$(EXE) $(EXT_ARGS)
# Remove objects, executables, *.out files and editor backups.
clean:
rm -f $(OBJS) *.o *.exe *.out *~

140
profile/gemmv2/gemmv2.c Normal file
View file

@ -0,0 +1,140 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "utils.h"
#ifndef N
#define N (1 << 10)
#endif
#define SM 64
/**
 * Copy one SM x SM tile out of a larger matrix into a dense buffer.
 *
 * @param a  Top-left element of the tile inside a matrix whose rows are n floats apart.
 * @param b  Destination buffer of SM * SM floats, rows stored back to back.
 * @param n  Row stride (in elements) of the source matrix.
 */
static void reorder2(float *restrict a, float *restrict b, int n)
{
    for (int row = 0; row < SM; row++)
    {
        const float *src = a + row * n;
        float *dst = b + row * SM;

        for (int col = 0; col < SM; col++)
        {
            dst[col] = src[col];
        }
    }
}
/**
 * Multiply-accumulate one SM x SM tile: C_tile += A_tile * B_tile.
 *
 * @param a  Top-left element of the A tile; rows are n floats apart.
 * @param b  Dense SM x SM B tile; rows are SM floats apart.
 * @param c  Top-left element of the C tile; rows are n floats apart.
 * @param n  Row stride (in elements) of a and c.
 */
static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
{
    for (int row = 0; row < SM; row++)
    {
        const float *a_row = a + row * n;
        float *c_row = c + row * n;

        for (int inner = 0; inner < SM; inner++)
        {
            const float *b_row = b + inner * SM;

            /* Innermost loop walks both b_row and c_row contiguously. */
            for (int col = 0; col < SM; col++)
            {
                c_row[col] += a_row[inner] * b_row[col];
            }
        }
    }
}
/**
 * Tiled matrix multiply-accumulate on row-major n x n matrices: c += a * b.
 *
 * The matrices are walked in SM x SM tiles. Each B tile is first copied into a
 * dense local buffer and then fed to kernel(). Only n / SM whole tiles per
 * dimension are processed; any remainder rows/columns are left untouched.
 *
 * @param a  Left operand, n * n floats.
 * @param b  Right operand, n * n floats.
 * @param c  Accumulator/output, n * n floats.
 * @param n  Matrix dimension.
 */
void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
{
    const int tiles = n / SM;
    float packed[SM * SM];

    for (int ti = 0; ti < tiles; ti++)
    {
        for (int tj = 0; tj < tiles; tj++)
        {
            float *c_tile = &c[SM * (ti * n + tj)];

            for (int tk = 0; tk < tiles; tk++)
            {
                float *b_tile = &b[SM * (tk * n + tj)];
                float *a_tile = &a[SM * (ti * n + tk)];

                reorder2(b_tile, packed, n);
                kernel(a_tile, packed, c_tile, n);
            }
        }
    }
}
/**
 * Naive triple-loop matrix multiply-accumulate on n x n matrices.
 *
 * For every (i, j) it adds sum_k a[i + k*n] * b[k + j*n] to c[i*n + j].
 * Read as row-major storage this is c += transpose(a) * transpose(b).
 * In the inner loop a is read with a stride of n elements while b is read
 * contiguously.
 *
 * @param a  First operand, n * n floats.
 * @param b  Second operand, n * n floats.
 * @param c  Accumulator/output, n * n floats; existing contents are added to.
 * @param n  Matrix dimension.
 */
void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
{
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            float sum = 0.0f;
            for (int k = 0; k < n; ++k)
            {
                sum += a[i + k * n] * b[k + j * n];
            }
            c[i * n + j] += sum;
        }
    }
}
/**
 * Time the tiled gemm_opt() kernel on n x n single-precision matrices.
 *
 * Usage: <exe> [n]   -- n defaults to the compile-time constant N.
 *
 * Both inputs are filled with 1.0f and the output starts at zero, so every
 * entry of the result must equal n exactly; that value is stored in g and
 * used to verify the whole output after the timed run. Because gemm_opt()
 * only processes whole tiles, a size that is not a multiple of the tile
 * width shows up here as a result mismatch.
 *
 * @return EXIT_SUCCESS on success, EXIT_FAILURE on a bad argument, an
 *         allocation failure or a result mismatch.
 */
int main(int argc, char *argv[])
{
    long requested = N;
    int n, iret = 0;
    float *a, *b, *c, *g;
    struct timespec rt[2];
    double wt; // walltime

    if (argc > 1)
    {
        char *end = NULL;
        requested = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0')
        {
            fprintf(stderr, "error: matrix size '%s' is not an integer\n", argv[1]);
            return EXIT_FAILURE;
        }
    }
    /* 46340 is the largest n for which n * n still fits in a 32-bit int,
     * the type the kernels use for their index arithmetic. */
    if (requested <= 0 || requested > 46340)
    {
        fprintf(stderr, "error: matrix size must be in [1, 46340], got %ld\n", requested);
        return EXIT_FAILURE;
    }
    n = (int)requested;

    const size_t count = (size_t)n * (size_t)n;

    /*
     * 0. prepare a, b, c and g
     *
     *   c := c + gemm_opt(a, b)   (computed)
     *   g := expected result      (reference)
     *
     * calloc zero-fills the buffers and checks count * size for overflow.
     */
    if (NULL == (a = calloc(count, sizeof(*a))))
    {
        fprintf(stderr, "error: memory allocation for 'a'\n");
        iret = -1;
    }
    if (NULL == (b = calloc(count, sizeof(*b))))
    {
        fprintf(stderr, "error: memory allocation for 'b'\n");
        iret = -1;
    }
    if (NULL == (c = calloc(count, sizeof(*c))))
    {
        fprintf(stderr, "error: memory allocation for 'c'\n");
        iret = -1;
    }
    if (NULL == (g = calloc(count, sizeof(*g))))
    {
        fprintf(stderr, "error: memory allocation for 'g'\n");
        iret = -1;
    }
    if (0 != iret)
    {
        free(a);
        free(b);
        free(c);
        free(g);
        exit(EXIT_FAILURE);
    }

    for (size_t idx = 0; idx < count; idx++)
    {
        a[idx] = 1.0f;
        b[idx] = 1.0f;
        g[idx] = (float)n; /* sum of n products 1 * 1 */
    }

    /* The tiled kernel is only run for sizes up to 4096. */
    if (n <= 4096)
    {
        /* NOTE(review): a monotonic clock would be more robust for interval timing. */
        clock_gettime(CLOCK_REALTIME, rt + 0);
        gemm_opt(a, b, c, n);
        clock_gettime(CLOCK_REALTIME, rt + 1);
        wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
        printf("gemm_opt on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));

        /* Verify every element; the expected values are small integers, so
         * exact floating-point comparison is appropriate here. */
        for (size_t idx = 0; idx < count; idx++)
        {
            if (c[idx] != g[idx])
            {
                fprintf(stderr, "error: result mismatch at element %zu (got %f, expected %f)\n",
                        idx, c[idx], g[idx]);
                iret = -1;
                break;
            }
        }
    }

    free(a);
    free(b);
    free(c);
    free(g);
    return (0 == iret) ? EXIT_SUCCESS : EXIT_FAILURE;
}

150
profile/utils.c Normal file
View file

@ -0,0 +1,150 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file utils.c
* @author Alessandro Capotondi
* @date 27 Mar 2020
* @brief File containing utilities functions for HPC Unimore Class
*
* Utilities for OpenMP lab.
*
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#define _POSIX_C_SOURCE 199309L
#include <time.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <assert.h>
#include "utils.h"
/* Capacity of the statistics[] sample buffer. */
#define MAX_ITERATIONS 100
/* Timestamps written by start_timer() (A) and stop_timer() (B). */
static struct timespec timestampA, timestampB;
/* Elapsed-time samples in nanoseconds, appended by collect_stats(). */
static unsigned long long statistics[MAX_ITERATIONS];
/* Number of valid entries currently stored in statistics[]. */
static int iterations = 0;
/**
 * Difference between two timestamps in nanoseconds.
 *
 * @param start  Earlier timestamp.
 * @param end    Later timestamp.
 * @return (end - start) expressed in nanoseconds.
 */
static unsigned long long __diff_ns(struct timespec start, struct timespec end)
{
    struct timespec delta;

    delta.tv_sec = end.tv_sec - start.tv_sec;
    delta.tv_nsec = end.tv_nsec - start.tv_nsec;

    /* Borrow one second when the nanosecond field went negative. */
    if (delta.tv_nsec < 0)
    {
        delta.tv_sec -= 1;
        delta.tv_nsec += 1000000000L;
    }

    return delta.tv_nsec + delta.tv_sec * 1000000000ULL;
}
/**
 * Record the starting timestamp (timestampA) from CLOCK_MONOTONIC_RAW.
 */
void start_timer()
{
/* Empty asm with a "memory" clobber: a compiler-level barrier that keeps
 * memory accesses from being moved across the clock read. */
asm volatile("" ::
: "memory");
clock_gettime(CLOCK_MONOTONIC_RAW, &timestampA);
asm volatile("" ::
: "memory");
}
/**
 * Record the ending timestamp (timestampB) from CLOCK_MONOTONIC_RAW.
 *
 * Nothing is returned; the interval is read back with elapsed_ns().
 */
void stop_timer()
{
    /* Empty asm with a "memory" clobber: a compiler-level barrier that keeps
     * memory accesses from being moved across the clock read. */
    asm volatile("" ::
                     : "memory");
    clock_gettime(CLOCK_MONOTONIC_RAW, &timestampB);
    asm volatile("" ::
                     : "memory");
}
/**
 * Nanoseconds between the timestamps stored by the most recent
 * start_timer() and stop_timer() calls.
 *
 * @return timestampB - timestampA in nanoseconds.
 */
unsigned long long elapsed_ns()
{
    const unsigned long long interval = __diff_ns(timestampA, timestampB);

    return interval;
}
/**
 * Mark the start of one measured iteration; simply forwards to start_timer().
 */
void start_stats()
{
start_timer();
}
/**
 * Stop the timer and append the elapsed time (ns) to the sample buffer.
 *
 * At most MAX_ITERATIONS samples are kept. Exceeding that trips the assert in
 * debug builds; when asserts are compiled out (NDEBUG) the extra sample is
 * dropped instead of being written past the end of statistics[].
 */
void collect_stats()
{
    assert(iterations < MAX_ITERATIONS);
    stop_timer();
    if (iterations >= MAX_ITERATIONS)
    {
        return;
    }
    statistics[iterations++] = elapsed_ns();
}
/**
 * Print average, minimum, maximum (all in milliseconds) and the population
 * standard deviation of the samples gathered by collect_stats().
 *
 * With no samples collected, all four values are printed as zero.
 */
void print_stats()
{
    unsigned long long min = ULLONG_MAX;
    unsigned long long max = 0ULL;
    double average = 0.0;
    double std_deviation = 0.0;
    double total_ms = 0.0;

    /* First pass: extrema and the sum of all samples (converted to ms). */
    for (int i = 0; i < iterations; i++)
    {
        if (statistics[i] > max)
            max = statistics[i];
        if (statistics[i] < min)
            min = statistics[i];
        total_ms += statistics[i] / 1E6;
    }

    if (iterations > 0)
    {
        /* Second pass: squared deviations go into their own accumulator so
         * the variance is not polluted by the running total above. */
        double squared_dev = 0.0;

        average = total_ms / (double)iterations;
        for (int i = 0; i < iterations; i++)
        {
            const double dev = statistics[i] / 1E6 - average;
            squared_dev += dev * dev;
        }
        std_deviation = sqrt(squared_dev / (double)iterations);
    }
    else
    {
        /* No samples: avoid 0/0 and avoid reporting ULLONG_MAX as a minimum. */
        min = 0ULL;
    }

    printf("AvgTime\tMinTime\tMaxTime\tStdDev\n");
    printf("%.4f ms\t%.4f ms\t%.4f ms\t%.4f\n", (double)average, (double)min / 1E6, (double)max / 1E6, (double)std_deviation);
}
/**
 * Dummy workload: spin for num loop iterations.
 *
 * Optimisation is disabled for this function and the accumulator is volatile,
 * so the loop is not removed by the compiler. The counter and accumulator are
 * unsigned so the loop covers the full range of num and the running sum wraps
 * instead of overflowing a signed int.
 *
 * @param num  Number of loop iterations to execute.
 */
#if defined(__GNUC__)
#pragma GCC push_options
#pragma GCC optimize("O0")
void work(unsigned long num)
#else
void work __attribute__((optnone)) (unsigned long num)
#endif
{
    volatile unsigned long cnt = 0;
    for (unsigned long i = 0; i < num; i++)
        cnt += i;
}
#if defined(__GNUC__)
#pragma GCC pop_options
#endif

162
profile/utils.h Normal file
View file

@ -0,0 +1,162 @@
/*
* BSD 2-Clause License
*
* Copyright (c) 2020, Alessandro Capotondi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file utils.h
* @author Alessandro Capotondi
* @date 27 Mar 2020
* @brief File containing utilities functions for HPC Unimore Class
*
* The header define time functions and dummy workload used on the example tests.
*
* @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
*/
#ifndef __UTILS_H__
#define __UTILS_H__
#include <stdarg.h>
/*
 * DEBUG_PRINT forwards to printf when VERBOSE is defined and expands to
 * nothing otherwise. This header does not include <stdio.h>, so users of the
 * macro need to include it themselves when building with VERBOSE.
 */
#if defined(VERBOSE)
#define DEBUG_PRINT(x, ...) printf((x), ##__VA_ARGS__)
#else
#define DEBUG_PRINT(x, ...)
#endif
/* Default value of NTHREADS; override with -DNTHREADS=<n> at compile time. */
#if !defined(NTHREADS)
#define NTHREADS (4)
#endif
/* Default value of NTHREADS_GPU; override with -DNTHREADS_GPU=<n> at compile time. */
#if !defined(NTHREADS_GPU)
#define NTHREADS_GPU (1024)
#endif
/**
 * @brief Record the starting timestamp.
 *
 * Used to measure the elapsed time between two execution points.
 * start_timer() sets the starting timestamp and stop_timer() sets the
 * termination timestamp. The elapsed time between the two, expressed in
 * nanoseconds, is retrieved with elapsed_ns().
 *
 * Example usage:
 * @code
 * start_timer(); // Point A
 * //SOME CODE HERE
 * stop_timer(); // Point B
 * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and B
 * //SOME OTHER CODE HERE
 * stop_timer(); // Point C
 * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and C
 * @endcode
 *
 * @return void
 * @see start_timer()
 * @see stop_timer()
 * @see elapsed_ns()
 */
void start_timer();
/**
 * @brief Record the termination timestamp.
 *
 * Used to measure the elapsed time between two execution points.
 * start_timer() sets the starting timestamp and stop_timer() sets the
 * termination timestamp. stop_timer() itself returns nothing; the elapsed
 * time in nanoseconds between the last start_timer() call and the last
 * stop_timer() call is retrieved with elapsed_ns().
 *
 * Example usage:
 * @code
 * start_timer(); // Point A
 * //SOME CODE HERE
 * stop_timer(); // Point B
 * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and B
 * //SOME OTHER CODE HERE
 * stop_timer(); // Point C
 * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and C
 * @endcode
 *
 * @return void
 * @see start_timer()
 * @see stop_timer()
 * @see elapsed_ns()
 */
void stop_timer();
/**
 * @brief Elapsed nanoseconds between the last start_timer() and the last stop_timer().
 *
 * @return Elapsed nanoseconds
 * @see start_timer()
 * @see stop_timer()
 */
unsigned long long elapsed_ns();
/**
 * @brief Mark the starting point of one statistics measurement.
 *
 * Equivalent to calling start_timer().
 *
 * @return void
 * @see start_timer()
 */
void start_stats();
/**
 * @brief Record the elapsed time between the last call of start_stats() and
 * the current execution point as one sample.
 *
 * The implementation in utils.c keeps at most 100 samples.
 *
 * @return void
 */
void collect_stats();
/**
 * @brief Display the collected statistics: average, minimum and maximum time
 * in milliseconds, plus the standard deviation.
 * @return void
 */
void print_stats();
/**
 * @brief The dummy work function
 *
 * Emulates some useful workload by spinning in a loop; it is compiled with
 * optimisation disabled so the loop is not removed.
 *
 * @param num work duration in terms of loop iterations.
 * @return void
 */
#if defined(__GNUC__)
#pragma GCC push_options
#pragma GCC optimize("O0")
void work(unsigned long num);
#else
void work __attribute__((optnone)) (unsigned long num);
#endif
#if defined(__GNUC__)
#pragma GCC pop_options
#endif
#endif /*__UTILS_H__*/