CUDA:使用共享内存的瓦片矩阵乘法,以及矩阵大小不是块大小的倍数。

23

我正在尝试熟悉CUDA编程,并且感到非常有趣。我目前正在查看this处理矩阵乘法的PDF,其中包括使用和不使用共享内存的方法。两个版本的完整代码可以在here找到。这段代码几乎与CUDA矩阵乘法示例中的代码完全相同。虽然非共享内存版本具有在任何矩阵大小下运行的能力,而不受块大小的限制,但共享内存版本必须处理块大小的倍数的矩阵(我将其设置为4,默认值最初为16)。

PDF末尾建议的问题之一是更改共享内存版本,使其也能处理非块大小的倍数的矩阵。我认为这将是一个简单的索引检查,就像非共享版本中一样:

int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if(row > A.height || col > B.width) return;

但这并不起作用。这是完整的代码,减去了主方法(有点混乱,抱歉),我对其进行了一些修改:

void MatMul(const Matrix A, const Matrix B, Matrix C) { 
  // Load A and B to device memory 
  Matrix d_A; 
  d_A.width = d_A.stride = A.width; 
  d_A.height = A.height; 
  size_t size = A.width * A.height * sizeof(float); 
  cudaError_t err = cudaMalloc(&d_A.elements, size); 
  printf("CUDA malloc A: %s\n",cudaGetErrorString(err)); 
  err = cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice); 
  printf("Copy A to device: %s\n",cudaGetErrorString(err)); 

  Matrix d_B; 
  d_B.width = d_B.stride = B.width; 
  d_B.height = B.height; 
  size = B.width * B.height * sizeof(float); 
  err = cudaMalloc(&d_B.elements, size); 
  printf("CUDA malloc B: %s\n",cudaGetErrorString(err));
  err = cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
  printf("Copy B to device: %s\n",cudaGetErrorString(err)); 

  Matrix d_C; 
  d_C.width = d_C.stride = C.width; 
  d_C.height = C.height; 
  size = C.width * C.height * sizeof(float); 
  err = cudaMalloc(&d_C.elements, size); 
  printf("CUDA malloc C: %s\n",cudaGetErrorString(err));

  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 
    dim3 dimGrid((B.width + dimBlock.x - 1) / dimBlock.x, (A.height + dimBlock.y-1) / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C); 
    err = cudaThreadSynchronize();
    printf("Run kernel: %s\n", cudaGetErrorString(err));

  // Read C from device memory 
  err = cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost); 
  printf("Copy C off of device: %s\n",cudaGetErrorString(err));

  // Free device memory
  cudaFree(d_A.elements); 
  cudaFree(d_B.elements); 
  cudaFree(d_C.elements); 
} 

// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col) { 
  return A.elements[row * A.stride + col]; 
} 

// Set a matrix element 
__device__ void SetElement(Matrix A, int row, int col, float value) { 
  A.elements[row * A.stride + col] = value; 
} 

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is 
// located col sub-matrices to the right and row sub-matrices down 
// from the upper-left corner of A 
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) { 
  Matrix Asub; 
  Asub.width = BLOCK_SIZE; 
  Asub.height = BLOCK_SIZE; 
  Asub.stride = A.stride; 
  Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]; 
  return Asub; 
}


// Matrix multiplication kernel called by MatMul() 
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) { 
  // Block row and column 
  int blockRow = blockIdx.y; 
  int blockCol = blockIdx.x; 

  int rowTest = blockIdx.y * blockDim.y + threadIdx.y;
  int colTest = blockIdx.x * blockDim.x + threadIdx.x;
  if (rowTest>A.height || colTest>B.width)
    return;
  // Each thread block computes one sub-matrix Csub of C
  Matrix Csub = GetSubMatrix(C, blockRow, blockCol); 

  // Each thread computes one element of Csub 
  // by accumulating results into Cvalue 
  float Cvalue = 0.0; 
  // Thread row and column within Csub 
  int row = threadIdx.y; 
  int col = threadIdx.x; 
  // Loop over all the sub-matrices of A and B that are 
  // required to compute Csub 
  // Multiply each pair of sub-matrices together 
  // and accumulate the results 
  for (int m = 0; m < (BLOCK_SIZE + A.width - 1)/BLOCK_SIZE; ++m) {
    // Get sub-matrix Asub of A 
    Matrix Asub = GetSubMatrix(A, blockRow, m); 

    // Get sub-matrix Bsub of B 
    Matrix Bsub = GetSubMatrix(B, m, blockCol); 

    // Shared memory used to store Asub and Bsub respectively 
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 

    // Load Asub and Bsub from device memory to shared memory 
    // Each thread loads one element of each sub-matrix 
    As[row][col] = GetElement(Asub, row, col); 
    Bs[row][col] = GetElement(Bsub, row, col); 

    // Synchronize to make sure the sub-matrices are loaded 
    // before starting the computation 
    __syncthreads(); 

    // Multiply Asub and Bsub together 
    for (int e = 0; e < BLOCK_SIZE; ++e) 
    {
      Cvalue += As[row][e] * Bs[e][col];
    }
    // Synchronize to make sure that the preceding 
    // computation is done before loading two new 
    // sub-matrices of A and B in the next iteration 
    __syncthreads();  
  }
  // Write Csub to device memory 
  // Each thread writes one element 
  SetElement(Csub, row, col, Cvalue); 
}

我所做的显著更改:我在MatMulKernel中添加了一个检查,检查当前线程是否试图处理不存在的C位置。这似乎不起作用。虽然它确实改变了结果,但是这些更改似乎没有任何模式,除了后面(更高的x或y值)的条目似乎更受影响(我得到了更多的非整数结果)。我还更改了给定的dimGrid计算方法和MatMulKernelm的循环条件(之前只是将widthheight除以块大小,这似乎是错误的)。
即使是我找到的解决方案指南也似乎表明它应该只是一个简单的索引检查,所以我认为我缺少了一些非常基本的东西。
1个回答

31

当矩阵的维度不是瓦片维度的倍数时,就可能会出现一些瓦片只部分覆盖矩阵的情况。超出完全重叠瓦片范围的瓦片元素应该被正确地置零。因此,将您的代码扩展到任意大小的矩阵很容易,但并不等同于简单的索引检查。下面,我复制并粘贴了我的版本的带有任意大小矩阵的瓦片矩阵乘法核。

__global__ void MatMul(float* A, float* B, float* C, int ARows, int ACols, int BRows,
    int BCols, int CRows, int CCols)
{
    float CValue = 0;

    int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    int Col = blockIdx.x*TILE_DIM + threadIdx.x;

    __shared__ float As[TILE_DIM][TILE_DIM];
    __shared__ float Bs[TILE_DIM][TILE_DIM];

    for (int k = 0; k < (TILE_DIM + ACols - 1)/TILE_DIM; k++) {

         if (k*TILE_DIM + threadIdx.x < ACols && Row < ARows)
             As[threadIdx.y][threadIdx.x] = A[Row*ACols + k*TILE_DIM + threadIdx.x];
         else
             As[threadIdx.y][threadIdx.x] = 0.0;

         if (k*TILE_DIM + threadIdx.y < BRows && Col < BCols)
             Bs[threadIdx.y][threadIdx.x] = B[(k*TILE_DIM + threadIdx.y)*BCols + Col];
         else
             Bs[threadIdx.y][threadIdx.x] = 0.0;

         __syncthreads();

         for (int n = 0; n < TILE_DIM; ++n)
             CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];

         __syncthreads();
    }

    if (Row < CRows && Col < CCols)
        C[((blockIdx.y * blockDim.y + threadIdx.y)*CCols) +
           (blockIdx.x * blockDim.x)+ threadIdx.x] = CValue;
}

我对一般的cuda实现(不使用共享内存)运行了这段代码,惊讶地发现两种方法所需的时间几乎相同。我本来期望使用共享内存会有很好的加速效果,因为通常情况下它可以提高执行时间。 - Rajith Gun Hewage
1
@rajeerc 我们正在讨论的代码只是一个练习,用于说明共享内存在具体测试案例(瓦片矩阵乘法)中的使用。它并不代表实现矩阵乘法的最佳方式:cuBLAS做得更加复杂。话虽如此,你没有观察到任何改进的原因是对于你运行的GPU架构,L1缓存已经完成了共享内存的全部工作。请记住,除非是非常旧的架构,否则共享内存可以被视为受控缓存。 - Vitality
1
有没有什么方法可以使用共享内存方法来超越正常的CUDA实现运行时间?(而不需要使用cuBLAS) - Rajith Gun Hewage
可能有点晚了,但如果您仍然感兴趣,请看一下Winograd算法用于Gemm,CLblast也是一个很好的Open CL Gemm库。 - Abhishek Nikam
嗨,我正在尝试修改这个内核,以便用矩阵A替换矩阵B,并将其访问为A的转置,以执行A*A_T。有人知道我该如何做吗? - omn_1

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接