From d89c501b590a0dfe4e6d73dc0eb6af46a816e55e Mon Sep 17 00:00:00 2001
From: Stefano Pigozzi <me@steffo.eu>
Date: Wed, 16 Nov 2022 18:05:12 +0100
Subject: [PATCH] `kernel_atax`: Parallelizing the second loop gives a nice
 speedup

---
 OpenMP/linear-algebra/kernels/atax/atax.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/OpenMP/linear-algebra/kernels/atax/atax.c b/OpenMP/linear-algebra/kernels/atax/atax.c
index 787db79..7469897 100644
--- a/OpenMP/linear-algebra/kernels/atax/atax.c
+++ b/OpenMP/linear-algebra/kernels/atax/atax.c
@@ -69,7 +69,9 @@ static void kernel_atax(int nx, int ny,
     y[i] = 0;
   
   /// This computes... something? I guess whatever ATAX is?
-  // Trying to parallelize this only seems to increase the time required
+  // Parallelizing this loop now gives a nice speedup, especially when using more threads than THREAD_COUNT.
+  // THREAD_COUNT * 4 seems to work best on my local computer. What is the best value for the Jetson Nano?
+  #pragma omp parallel for num_threads(THREAD_COUNT * 4) schedule(static)
   for (i = 0; i < _PB_NX; i++)
   {
     /// Every iteration has its own tmp variable