diff --git a/atax/Makefile b/atax/Makefile
index 5dacf14..2978ab6 100644
--- a/atax/Makefile
+++ b/atax/Makefile
@@ -23,9 +23,10 @@ NVCFLAGS:=$(CXXFLAGS) $(NVOPT)
 	$(NVCC) $(NVCFLAGS) -c $< -o $@
 
 
-.PHONY: bench clean
+.PHONY: bench clean dev
 
-all: atax.elf
+dev: atax.elf
+	./atax.elf
 
 bench:
 	./.bench.sh
diff --git a/atax/atax.cu b/atax/atax.cu
index 0c7c48c..3374573 100644
--- a/atax/atax.cu
+++ b/atax/atax.cu
@@ -31,6 +31,9 @@
 // TODO: Remove this, as it will be set by .bench.sh
 #define HPC_USE_STRIDE
 
+// Create macro for debug logging
+#define debug(txt) std::cerr << txt << std::endl
+
 
 /**
  * Initialize the arrays to be used in the computation:
@@ -75,26 +78,6 @@ __host__ static void init_array(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
 }
 #endif
 
-/**
- * Initialize the arrays to be used in the computation:
- * 
- * - `X` is filled with multiples of `M_PI`;
- * - `Y` is zeroed;
- * - `A` is filled with sample data.
- * 
- * It is called by the host, runs on the device, and calls the other init_arrays on the device.
- */
-#ifdef HPC_USE_CUDA
-__global__ static void init_array_cuda(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
-{
-	unsigned int threads = gridDim.x * blockDim.x;
-
-	init_array_cuda_x(X, threads);
-	init_array_cuda_y(Y, threads);
-	init_array_cuda_a(A, threads);
-}
-#endif
-
 /**
  * Initialize the `X` array.
  * 
@@ -161,9 +144,40 @@ __device__ static void init_array_cuda_y(DATA_TYPE* Y, unsigned int threads)
  * Runs on the device.
 */
 #ifdef HPC_USE_CUDA
-__device__ static void init_array_cuda_a(DATA_TYPE** A, unsigned int threads)
+__device__ static void init_array_cuda_a(DATA_TYPE* A, unsigned int threads)
 {
+	// Find how many elements should be written in total
+	unsigned int elements = NX * NY;
 
+	// Find how many iterations should be performed by each thread
+	unsigned int perThread = elements / threads;
+
+	// Find the index of the current thread, even if threads span multiple blocks
+	int blockThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;
+
+	/* TODO */
+}
+#endif
+
+/**
+ * Initialize the arrays to be used in the computation:
+ * 
+ * - `X` is filled with multiples of `M_PI`;
+ * - `Y` is zeroed;
+ * - `A` is filled with sample data.
+ * 
+ * Beware that `A` here is a simple array, it is not a matrix, so elements are accessed via [y * NX + x] (I think?).
+ * 
+ * It is called by the host, runs on the device, and calls the other init_arrays on the device.
+ */
+#ifdef HPC_USE_CUDA
+__global__ static void init_array_cuda(DATA_TYPE* A, DATA_TYPE* X, DATA_TYPE* Y)
+{
+	unsigned int threads = gridDim.x * blockDim.x;
+
+	init_array_cuda_x(X, threads);
+	init_array_cuda_y(Y, threads);
+	init_array_cuda_a(A, threads);
 }
 #endif
 
@@ -226,8 +240,14 @@ __host__ static void kernel_atax(DATA_TYPE** A, DATA_TYPE* X, DATA_TYPE* Y)
  */
 __host__ int main(int argc, char** argv)
 {
+	debug("Starting main...");
+
 	#ifndef HPC_USE_CUDA
 
+		debug("[Mode] Host-only");
+
+		debug("[Pointers] Allocating...");
+
 		// A[NX][NY]
 		DATA_TYPE** A = new DATA_TYPE*[NX] {};
 		for(unsigned int x = 0; x < NX; x++)
@@ -241,56 +261,86 @@ __host__ int main(int argc, char** argv)
 		// Y[NX]
 		DATA_TYPE* Y = new DATA_TYPE[NX] {};
 
+		debug("[Pointers] Allocated!");
+
 		#ifdef HPC_INCLUDE_INIT
+			debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif
 
+		debug("[Init] Initializing...");
 		init_array(A, X, Y);
+		debug("[Init] Initialized!");
 
 		#ifndef HPC_INCLUDE_INIT
+			debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif
 
+		debug("[Kernel] Running...");
 		kernel_atax(A, X, Y);
+		debug("[Kernel] Completed!");
 
+		debug("[Benchmark] Stopping...");
 		polybench_stop_instruments;
 		polybench_print_instruments;
+		debug("[Benchmark] Complete!");
 
+		debug("[Verify] Printing...")
 		polybench_prevent_dce(
 			print_array(Y)
 		);
+		debug("[Verify] Done!")
 
 	#else
+	
+		debug("[Mode] Host-and-device, CUDA");
 
-		DATA_TYPE** A;
+		debug("[Pointers] Allocating...");
+		DATA_TYPE* A;
 		DATA_TYPE* X;
 		DATA_TYPE* Y;
 		
-		if(cudaMalloc(&A, sizeof(DATA_TYPE) * NX * NY)) 
+		debug("[CUDA] Allocating A...");
+		if(cudaMalloc((void**)&A, sizeof(DATA_TYPE) * NX * NY)) 
 		{
-			std::cerr << "Could not allocate A on the device\n";
+			debug("[CUDA] Could not allocate A!");
 			return 1;
 		}
+		debug("[CUDA] Allocated A!");
 		
-		if(cudaMalloc(&X, sizeof(DATA_TYPE) * NY))
+		debug("[CUDA] Allocating X...");
+		if(cudaMalloc((void**)&X, sizeof(DATA_TYPE) * NY))
 		{
-			std::cerr << "Could not allocate X on the device\n";
+			debug("[CUDA] Could not allocate X!");
 			return 1;
 		}
+		debug("[CUDA] Allocated X!");
 
-		if(cudaMalloc(&Y, sizeof(DATA_TYPE) * NX))
+		debug("[CUDA] Allocating Y...");
+		if(cudaMalloc((void**)&Y, sizeof(DATA_TYPE) * NX))
 		{
-			std::cerr << "Could not allocate Y on the device\n";
+			debug("[CUDA] Could not allocate Y!");
 			return 1;
 		}
+		debug("[CUDA] Allocated Y!");
 
 		#ifdef POLYBENCH_INCLUDE_INIT
+			debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif
 
-		init_array_cuda<<<1, 1>>>(A, X, Y);
+		debug("[Init] Initializing...");
+		init_array_cuda<<<32, 32>>>((double*) A, (double*) X, (double*) Y);
+		if(cudaGetLastError())
+		{
+			debug("[Init] Failed to execute kernel!");
+			return 1;
+		}
+		debug("[Init] Initialized!");
 
 		#ifndef POLYBENCH_INCLUDE_INIT
+			debug("[Benchmark] Starting...");
 			polybench_start_instruments;
 		#endif