#include #include "ee155_utils.hxx" #include "matrix.hxx" const int BS = 32; // The blocks are BS x BS. template __device__ void print_array (float arr [size][size]); /////////////////////////////// // This is the CUDA kernel function for you to write. // __global__ void mat_mult (float *d_A, float *d_B, float *d_C, int N) { int rB=blockIdx..., __shared__ float SA[BS][BS], SB[BS][BS]; //printf("In thread with r=(%d,%d) c=(%d,%d)\n", rB,rI,cB,cI); ... } /////////////////////////////// // This is the host function for you to write. // It allocates memory and moves data between CPU<->GPU void Matrix::mpy1 (const Matrix &A, const Matrix &B, int BS) { // Copy A from host memory to device memory. int numElem=N()*N(), sizeBytes = numElem*4; float *d_A = NULL; cudaError_t err = cudaMalloc((void **)&d_A, sizeBytes); ERR_CHK (err, "Failed to allocate device matrix A"); err = cudaMemcpy (d_A, A.data.data(), sizeBytes, cudaMemcpyHostToDevice); ERR_CHK (err, "Failed to copy matrix A from host to device"); // Allocate device memory for B. ... // Copy B from host memory to device memory. ... // Allocate device memory for C. ... // Launch the CUDA Kernel ... err = cudaGetLastError(); ERR_CHK (err, "Failed to launch/complete the mat_mult() kernel"); // Copy the result from device memory to host memory. ... // Free device memory. err = cudaFree(d_A); ERR_CHK (err, "Failed to free CUDA matrix A"); ... } // Simple function for printing out an array for debug. Make sure to // only call it from one thread, or else you'll get *lots* of output! template __device__ void print_array (float arr [size][size]) { for (int r=0; r