From b2b6809b6472470e5fe3212b10314688d3b5f14c Mon Sep 17 00:00:00 2001
From: Alessandro Capotondi
Date: Mon, 24 Oct 2022 00:13:55 +0200
Subject: [PATCH] Gemm for profiling

---
 profile/gemmv1/Makefile |  28 +++++++
 profile/gemmv1/gemmv1.c | 140 ++++++++++++++++++++++++++++++++++
 profile/gemmv2/Makefile |  28 +++++++
 profile/gemmv2/gemmv2.c | 140 ++++++++++++++++++++++++++++++++++
 profile/utils.c         | 150 +++++++++++++++++++++++++++++++++++++
 profile/utils.h         | 162 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 648 insertions(+)
 create mode 100644 profile/gemmv1/Makefile
 create mode 100644 profile/gemmv1/gemmv1.c
 create mode 100644 profile/gemmv2/Makefile
 create mode 100644 profile/gemmv2/gemmv2.c
 create mode 100644 profile/utils.c
 create mode 100644 profile/utils.h

diff --git a/profile/gemmv1/Makefile b/profile/gemmv1/Makefile
new file mode 100644
index 0000000..abc137d
--- /dev/null
+++ b/profile/gemmv1/Makefile
@@ -0,0 +1,28 @@
+ifndef EXERCISE
+EXERCISE=gemmv1.c
+endif
+
+CC=gcc
+LD=ld
+OBJDUMP=objdump
+OPT=-O2 -pg
+
+CFLAGS=$(OPT) $(OMP) -I. -I../ $(EXT_CFLAGS)
+LDFLAGS=-lm $(EXT_LDFLAGS)
+
+SRCS=../utils.c
+OBJS=$(SRCS:.c=.o) $(EXERCISE:.c=.o)
+EXE=$(EXERCISE:.c=.exe)
+
+$(EXE): $(OBJS)
+	$(CC) $(CFLAGS) $(OBJS) -o $@ $(LDFLAGS)
+
+all: $(EXE)
+
+.PHONY: all run clean
+run: $(EXE)
+	./$(EXE) $(EXT_ARGS)
+
+clean:
+	rm -f $(OBJS) *.o *.exe *.out *~
+
diff --git a/profile/gemmv1/gemmv1.c b/profile/gemmv1/gemmv1.c
new file mode 100644
index 0000000..b894e51
--- /dev/null
+++ b/profile/gemmv1/gemmv1.c
@@ -0,0 +1,140 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "utils.h"
+
+#ifndef N
+#define N (1 << 10)
+#endif
+
+#define SM 64
+/* Pack the SM x SM tile starting at a (row stride n) into the contiguous buffer b. */
+static void reorder2(float *restrict a, float *restrict b, int n)
+{
+    for (int i = 0; i < SM; i++)
+        for (int j = 0; j < SM; j++)
+            b[i * SM + j] = a[i * n + j];
+}
+/* One tile update c += a * b: a and c use row stride n, b is a packed SM x SM tile. */
+static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    for (int i = 0; i < SM; i++)
+    {
+        for (int k = 0; k < SM; k++)
+        {
+            for (int j = 0; j < SM; j++)
+            {
+                c[i * n + j] += a[i * n + k] * b[k * SM + j];
+            }
+        }
+    }
+}
+/* Blocked c += a * b over n / SM tiles per dimension (a remainder of n % SM is not processed). */
+void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    int bk = n / SM;
+    {
+        float b2[SM * SM];
+        for (int i = 0; i < bk; i++)
+        {
+            for (int j = 0; j < bk; j++)
+            {
+                for (int k = 0; k < bk; k++)
+                {
+                    reorder2(&b[SM * (k * n + j)], b2, n);
+                    kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
+                }
+            }
+        }
+    }
+}
+/* Naive triple loop: c[i * n + j] += sum over k of a[i + k * n] * b[k + j * n]. */
+void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    /* the innermost loop reads a with stride n and b contiguously */
+
+    for (int i = 0; i < n; ++i)
+    {
+        for (int j = 0; j < n; ++j)
+        {
+            float sum = 0.0;
+            for (int k = 0; k < n; ++k)
+            {
+                sum += a[i + k * n] * b[k + j * n];
+            }
+            c[i * n + j] += sum;
+        }
+    }
+}
+/* Optional argv[1] overrides n; allocates the matrices, times gemm() when n <= 1024, then compares c with g. */
+int main(int argc, char *argv[])
+{
+    int n = N,
+        iret = 0;
+    float *a, *b, *c, *g;
+    struct timespec rt[2];
+    double wt; // walltime
+
+    if (argc > 1)
+        n = atoi(argv[1]);
+
+    /*
+     * 0. allocate the n x n matrices a, b, c and the comparison buffer g
+     *
+     * calloc() zero-fills every buffer, so the product reads defined values
+     * and the final c-versus-g comparison is deterministic
+     */
+    if (NULL == (a = calloc((size_t)n * n, sizeof(*a))))
+    {
+        printf("error: memory allocation for 'a'\n");
+        iret = -1;
+    }
+    if (NULL == (b = calloc((size_t)n * n, sizeof(*b))))
+    {
+        printf("error: memory allocation for 'b'\n");
+        iret = -1;
+    }
+    if (NULL == (c = calloc((size_t)n * n, sizeof(*c))))
+    {
+        printf("error: memory allocation for 'c'\n");
+        iret = -1;
+    }
+    if (NULL == (g = calloc((size_t)n * n, sizeof(*g))))
+    {
+        printf("error: memory allocation for 'g'\n");
+        iret = -1;
+    }
+
+    if (0 != iret)
+    {
+        free(a);
+        free(b);
+        free(c);
+        free(g);
+        exit(EXIT_FAILURE);
+    }
+
+    if (n <= 1024)
+    {
+        clock_gettime(CLOCK_MONOTONIC, rt + 0);
+        gemm(a, b, c, n);
+        clock_gettime(CLOCK_MONOTONIC, rt + 1);
+        wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+        printf("gemm on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+    }
+
+    for (size_t e = 0; e < (size_t)n * n; e++)
+    {
+        iret = memcmp(g + e, c + e, sizeof(*g));
+        assert(iret == 0);
+    }
+    free(a);
+    free(b);
+    free(c);
+    free(g);
+
+    return 0;
+}
diff --git a/profile/gemmv2/Makefile b/profile/gemmv2/Makefile
new file mode 100644
index 0000000..6a0224c
--- /dev/null
+++ b/profile/gemmv2/Makefile
@@ -0,0 +1,28 @@
+ifndef EXERCISE
+EXERCISE=gemmv2.c
+endif
+
+CC=gcc
+LD=ld
+OBJDUMP=objdump
+OPT=-O2 -pg
+
+CFLAGS=$(OPT) $(OMP) -I. -I../ $(EXT_CFLAGS)
+LDFLAGS=-lm $(EXT_LDFLAGS)
+
+SRCS=../utils.c
+OBJS=$(SRCS:.c=.o) $(EXERCISE:.c=.o)
+EXE=$(EXERCISE:.c=.exe)
+
+$(EXE): $(OBJS)
+	$(CC) $(CFLAGS) $(OBJS) -o $@ $(LDFLAGS)
+
+all: $(EXE)
+
+.PHONY: all run clean
+run: $(EXE)
+	./$(EXE) $(EXT_ARGS)
+
+clean:
+	rm -f $(OBJS) *.o *.exe *.out *~
+
diff --git a/profile/gemmv2/gemmv2.c b/profile/gemmv2/gemmv2.c
new file mode 100644
index 0000000..ddc126c
--- /dev/null
+++ b/profile/gemmv2/gemmv2.c
@@ -0,0 +1,140 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "utils.h"
+
+#ifndef N
+#define N (1 << 10)
+#endif
+
+#define SM 64
+/* Pack the SM x SM tile starting at a (row stride n) into the contiguous buffer b. */
+static void reorder2(float *restrict a, float *restrict b, int n)
+{
+    for (int i = 0; i < SM; i++)
+        for (int j = 0; j < SM; j++)
+            b[i * SM + j] = a[i * n + j];
+}
+/* One tile update c += a * b: a and c use row stride n, b is a packed SM x SM tile. */
+static void kernel(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    for (int i = 0; i < SM; i++)
+    {
+        for (int k = 0; k < SM; k++)
+        {
+            for (int j = 0; j < SM; j++)
+            {
+                c[i * n + j] += a[i * n + k] * b[k * SM + j];
+            }
+        }
+    }
+}
+/* Blocked c += a * b over n / SM tiles per dimension (a remainder of n % SM is not processed). */
+void gemm_opt(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    int bk = n / SM;
+    {
+        float b2[SM * SM];
+        for (int i = 0; i < bk; i++)
+        {
+            for (int j = 0; j < bk; j++)
+            {
+                for (int k = 0; k < bk; k++)
+                {
+                    reorder2(&b[SM * (k * n + j)], b2, n);
+                    kernel(&a[SM * (i * n + k)], b2, &c[SM * (i * n + j)], n);
+                }
+            }
+        }
+    }
+}
+/* Naive triple loop: c[i * n + j] += sum over k of a[i + k * n] * b[k + j * n]. */
+void gemm(float *restrict a, float *restrict b, float *restrict c, int n)
+{
+    /* the innermost loop reads a with stride n and b contiguously */
+
+    for (int i = 0; i < n; ++i)
+    {
+        for (int j = 0; j < n; ++j)
+        {
+            float sum = 0.0;
+            for (int k = 0; k < n; ++k)
+            {
+                sum += a[i + k * n] * b[k + j * n];
+            }
+            c[i * n + j] += sum;
+        }
+    }
+}
+/* Optional argv[1] overrides n; allocates the matrices, times gemm_opt() when n <= 4096, then compares c with g. */
+int main(int argc, char *argv[])
+{
+    int n = N,
+        iret = 0;
+    float *a, *b, *c, *g;
+    struct timespec rt[2];
+    double wt; // walltime
+
+    if (argc > 1)
+        n = atoi(argv[1]);
+
+    /*
+     * 0. allocate the n x n matrices a, b, c and the comparison buffer g
+     *
+     * calloc() zero-fills every buffer, so the product reads defined values
+     * and the final c-versus-g comparison is deterministic
+     */
+    if (NULL == (a = calloc((size_t)n * n, sizeof(*a))))
+    {
+        printf("error: memory allocation for 'a'\n");
+        iret = -1;
+    }
+    if (NULL == (b = calloc((size_t)n * n, sizeof(*b))))
+    {
+        printf("error: memory allocation for 'b'\n");
+        iret = -1;
+    }
+    if (NULL == (c = calloc((size_t)n * n, sizeof(*c))))
+    {
+        printf("error: memory allocation for 'c'\n");
+        iret = -1;
+    }
+    if (NULL == (g = calloc((size_t)n * n, sizeof(*g))))
+    {
+        printf("error: memory allocation for 'g'\n");
+        iret = -1;
+    }
+
+    if (0 != iret)
+    {
+        free(a);
+        free(b);
+        free(c);
+        free(g);
+        exit(EXIT_FAILURE);
+    }
+
+    if (n <= 4096)
+    {
+        clock_gettime(CLOCK_MONOTONIC, rt + 0);
+        gemm_opt(a, b, c, n);
+        clock_gettime(CLOCK_MONOTONIC, rt + 1);
+        wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
+        printf("gemm_opt on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n * n * n / (1.0e6 * wt));
+    }
+
+    for (size_t e = 0; e < (size_t)n * n; e++)
+    {
+        iret = memcmp(g + e, c + e, sizeof(*g));
+        assert(iret == 0);
+    }
+    free(a);
+    free(b);
+    free(c);
+    free(g);
+
+    return 0;
+}
diff --git a/profile/utils.c b/profile/utils.c
new file mode 100644
index 0000000..1f8ceb8
--- /dev/null
+++ b/profile/utils.c
@@ -0,0 +1,150 @@
+/*
+ * BSD 2-Clause License
+ *
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/**
+ * @file utils.c
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief File containing utilities functions for HPC Unimore Class
+ *
+ * Utilities for OpenMP lab.
+ *
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+
+#define _POSIX_C_SOURCE 199309L
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "utils.h"
+
+#define MAX_ITERATIONS 100
+static struct timespec timestampA, timestampB;
+static unsigned long long statistics[MAX_ITERATIONS];
+static int iterations = 0;
+/* Nanoseconds from start to end, borrowing one second when end.tv_nsec < start.tv_nsec. */
+static unsigned long long diff_ns(struct timespec start, struct timespec end)
+{
+    struct timespec temp;
+    if ((end.tv_nsec - start.tv_nsec) < 0)
+    {
+        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
+        temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
+    }
+    else
+    {
+        temp.tv_sec = end.tv_sec - start.tv_sec;
+        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
+    }
+
+    return temp.tv_nsec + temp.tv_sec * 1000000000ULL;
+}
+/* Store the current CLOCK_MONOTONIC_RAW time in timestampA, between two compiler barriers. */
+void start_timer()
+{
+    asm volatile("" ::
+                     : "memory");
+    clock_gettime(CLOCK_MONOTONIC_RAW, &timestampA);
+    asm volatile("" ::
+                     : "memory");
+}
+/* Store the current CLOCK_MONOTONIC_RAW time in timestampB; read the result with elapsed_ns(). */
+void stop_timer()
+{
+    /* the empty asm statements with a "memory" clobber are compiler barriers */
+    asm volatile("" ::
+                     : "memory");
+    clock_gettime(CLOCK_MONOTONIC_RAW, &timestampB);
+    asm volatile("" ::
+                     : "memory");
+}
+/* Nanoseconds between the timestamps stored by start_timer() and stop_timer(). */
+unsigned long long elapsed_ns()
+{
+    return diff_ns(timestampA, timestampB);
+}
+/* Begin one measured sample; simply calls start_timer(). */
+void start_stats()
+{
+    start_timer();
+}
+/* Stop the timer and append the elapsed time to statistics[] (at most MAX_ITERATIONS samples). */
+void collect_stats()
+{
+    assert(iterations < MAX_ITERATIONS);
+    stop_timer();
+    statistics[iterations++] = elapsed_ns();
+}
+/* Print average, minimum and maximum (in ms) and the standard deviation of the collected samples. */
+void print_stats()
+{
+    unsigned long long min = ULLONG_MAX;
+    unsigned long long max = 0LL;
+    double average = 0.0;
+    double std_deviation = 0.0;
+    double sum = 0.0;
+
+    /* Compute the sum of all elements */
+    for (int i = 0; i < iterations; i++)
+    {
+        if (statistics[i] > max)
+            max = statistics[i];
+        if (statistics[i] < min)
+            min = statistics[i];
+        sum = sum + statistics[i] / 1E6;
+    }
+    average = sum / (double)iterations;
+
+    /* Accumulate the squared deviations in std_deviation (starts at 0.0), not on top of sum */
+    for (int i = 0; i < iterations; i++)
+    {
+        std_deviation += pow((statistics[i] / 1E6 - average), 2);
+    }
+    std_deviation = sqrt(std_deviation / (double)iterations);
+
+    printf("AvgTime\tMinTime\tMaxTime\tStdDev\n");
+    printf("%.4f ms\t%.4f ms\t%.4f ms\t%.4f\n", (double)average, (double)min / 1E6, (double)max / 1E6, (double)std_deviation);
+}
+/* Dummy workload: num additions into a volatile counter, compiled without optimization. */
+#if defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC optimize("O0")
+void work(unsigned long num)
+#else
+void work __attribute__((optnone)) (unsigned long num)
+#endif
+{
+    volatile int cnt = 0;
+    for (unsigned long i = 0; i < num; i++)
+        cnt += i;
+}
+#if defined(__GNUC__)
+#pragma GCC pop_options
+#endif
diff --git a/profile/utils.h b/profile/utils.h
new file mode 100644
index 0000000..807ce3b
--- /dev/null
+++ b/profile/utils.h
@@ -0,0 +1,162 @@
+/*
+ * BSD 2-Clause License
+ *
+ * Copyright (c) 2020, Alessandro Capotondi
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file utils.h
+ * @author Alessandro Capotondi
+ * @date 27 Mar 2020
+ * @brief File containing utilities functions for HPC Unimore Class
+ *
+ * The header defines time functions and dummy workload used on the example tests.
+ *
+ * @see http://algo.ing.unimo.it/people/andrea/Didattica/HPC/index.html
+ */
+#ifndef __UTILS_H__
+#define __UTILS_H__
+
+#include <stdio.h> /* printf(), used by DEBUG_PRINT */
+/* DEBUG_PRINT(fmt, ...) forwards to printf() when VERBOSE is defined and expands to nothing otherwise. */
+#if defined(VERBOSE)
+#define DEBUG_PRINT(x, ...) printf((x), ##__VA_ARGS__)
+#else
+#define DEBUG_PRINT(x, ...)
+#endif
+/* Default thread counts; each can be overridden by defining the macro at compile time. */
+#if !defined(NTHREADS)
+#define NTHREADS (4)
+#endif
+
+#if !defined(NTHREADS_GPU)
+#define NTHREADS_GPU (1024)
+#endif
+
+/**
+ * @brief The function sets the start timestamp (timestampA).
+ *
+ * The function is used to measure elapsed time between two execution points.
+ * The function start_timer() sets the starting point timestamp, while the function
+ * stop_timer() sets the termination timestamp. The elapsed time, expressed in nanoseconds,
+ * between the two points can be retrieved using the function elapsed_ns().
+ *
+ * Example usage:
+ * @code
+ * start_timer(); // Point A
+ * //SOME CODE HERE
+ * stop_timer(); // Point B
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and B
+ * //SOME OTHER CODE HERE
+ * stop_timer(); // Point C
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and C
+ * @endcode
+ *
+ * @return void
+ * @see start_timer()
+ * @see stop_timer()
+ * @see elapsed_ns()
+ */
+void start_timer();
+
+/**
+ * @brief The function sets the stop timestamp.
+ *
+ * The function is used to measure elapsed time between two execution points.
+ * The function start_timer() sets the starting point timestamp, while the function
+ * stop_timer() records the current execution point. The elapsed time, expressed in nanoseconds,
+ * between the last call of start_timer() and that point is retrieved with elapsed_ns().
+ *
+ * Example usage:
+ * @code
+ * start_timer(); // Point A
+ * //SOME CODE HERE
+ * stop_timer(); // Point B
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and B
+ * //SOME OTHER CODE HERE
+ * stop_timer(); // Point C
+ * printf("Elapsed time = %llu ns\n", elapsed_ns()); //Elapsed time between A and C
+ * @endcode
+ *
+ * @return void
+ * @see start_timer()
+ * @see stop_timer()
+ * @see elapsed_ns()
+ */
+void stop_timer();
+
+/**
+ * @brief Elapsed nanoseconds between start_timer() and stop_timer().
+ *
+ * @return Elapsed nanoseconds
+ * @see start_timer()
+ * @see stop_timer()
+ */
+unsigned long long elapsed_ns();
+
+/**
+ * @brief The function initializes the starting point of a stat measurement.
+ *
+ * The function is similar to start_timer().
+ *
+ * @return void
+ * @see start_timer
+ */
+void start_stats();
+
+/**
+ * @brief The function collects the elapsed time between the current execution point and the
+ * last call of start_stats().
+ *
+ * @return void
+ */
+void collect_stats();
+
+/**
+ * @brief The function displays the collected statistics.
+ * @return void
+ */
+void print_stats();
+
+/**
+ * @brief The dummy work function
+ *
+ * The function is used to emulate some useful workload.
+ *
+ * @param num work duration in terms of loop iterations.
+ * @return void
+ */
+#if defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC optimize("O0")
+void work(unsigned long num);
+#else
+void work __attribute__((optnone)) (unsigned long num);
+#endif
+#if defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+
+#endif /*__UTILS_H__*/