Differences in nvprof output between basic CUDA examples in C++ and Fortran

I am teaching myself CUDA programming. My final goal is to apply it in Fortran, but because many courses/videos are based on C/C++, I often end up repeating the same exercise in both languages (which is a good thing). Right now I am trying to run a simple exercise on the GPU that computes the element-wise sum of two arrays, c(i) = a(i) + b(i). For completeness, I am posting the code in both languages for comparison:

  1. Here is the C code:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "cuda_common.cuh"
#include "common.h"

//assume grid is 1D and block is 1D then nx = size
__global__ void sum_arrays_1Dgrid_1Dblock(float* a, float* b, float *c, int nx)
{
   int gid = blockIdx.x * blockDim.x + threadIdx.x;

   if (gid < nx)
      c[gid] = a[gid] + b[gid];
}


void run_sum_array_1d(int argc, char** argv)
{
   printf("Runing 1D grid \n");
   int size = 1 << 22;
   int block_size = 128;

   int nx, ny = 0;

   if (argc > 2)
      size = 1 << atoi(argv[2]);

   if (argc > 4)
      block_size = 1 << atoi(argv[4]);


   unsigned int byte_size = size * sizeof(float);

   printf("Input size : %d \n",size);

   float * h_a, *h_b, *h_out, *h_ref;
   h_a = (float*)malloc(byte_size);
   h_b = (float*)malloc(byte_size);
   h_out = (float*)malloc(byte_size);
   h_ref = (float*)malloc(byte_size);


   if (!h_a)
      printf("host memory allocation error \n");

  for (size_t i = 0; i < size; i++)
   {
      h_a[i] = i % 10;
      h_b[i] = i % 7;
   }

   clock_t cpu_start, cpu_end;
   cpu_start = clock();
   sum_array_cpu(h_a, h_b, h_out,size);
   cpu_end   = clock();

   dim3 block( block_size);
   dim3 grid((size+block.x -1)/block.x);

   printf("Kernel is lauch with grid(%d,%d,%d) and block(%d,%d,%d) \n",
      grid.x,grid.y,grid.z,block.x,block.y, block.z);

   float *d_a, *d_b, *d_c;

   gpuErrchk(cudaMalloc((void**)&d_a, byte_size));
   gpuErrchk(cudaMalloc((void**)&d_b, byte_size));
   gpuErrchk(cudaMalloc((void**)&d_c, byte_size));
   gpuErrchk(cudaMemset(d_c,0,byte_size));

   clock_t htod_start, htod_end;
   htod_start = clock();
   gpuErrchk(cudaMemcpy(d_a,h_a,byte_size,cudaMemcpyHostToDevice));
   gpuErrchk(cudaMemcpy(d_b, h_b, byte_size, cudaMemcpyHostToDevice));
   htod_end = clock();

   clock_t gpu_start, gpu_end;
   gpu_start = clock();
   sum_arrays_1Dgrid_1Dblock<<<grid, block>>>(d_a, d_b, d_c, size);
   gpuErrchk(cudaDeviceSynchronize());
   gpu_end   = clock();

   clock_t dtoh_start, dtoh_end;
   dtoh_start = clock();
   gpuErrchk(cudaMemcpy(h_ref,d_c,byte_size,cudaMemcpyDeviceToHost));
   dtoh_end   = clock();

   compare_arrays(h_out, h_ref,size);

     // elapsed time comparison
   printf("Sum array CPU execution time [ms] : %4.6f \n",
         (double)((double)1000.0*(cpu_end - cpu_start)/CLOCKS_PER_SEC));
   printf("Sum array GPU execution time [ms] : %4.6f \n",
         (double)((double)1000.0*(gpu_end - gpu_start)/CLOCKS_PER_SEC));
   printf("htod mem transfer time [ms] : %4.6f \n",
         (double)((double)1000.0*(htod_end - htod_start)/CLOCKS_PER_SEC));
   printf("dtoh mem transfer time [ms] : %4.6f \n",
        (double)((double)1000.0*(dtoh_end - dtoh_start)/CLOCKS_PER_SEC));
   printf("Total GPU execution time [ms] : %4.6f \n",
         (double)((double)1000.0*(dtoh_end - htod_start)/CLOCKS_PER_SEC));

   cudaFree(d_c);
   cudaFree(d_b);
   cudaFree(d_a);
   free(h_ref);
   free(h_out);
   free(h_b);
   free(h_a);
}

////arguments :
////1 - kernel (0:1D or 1:2D), 
////2 - input size (2 pow (x))
////3 - for 2D kernel nx, 
////4 - block.x 
////5 - block.y  
int main(int argc, char** argv)
{
   printf("\n----------------------- SUM ARRAY EXAMPLE FOR NVPROF ------------------------ \n\n");
   printf("argc : %d \n",argc);
   for (int i = 0; i < argc; i++)
   {
      printf("argv : %s \n",argv[i]);
   };

   run_sum_array_1d(argc, argv);

   //query_device();
   return 0;
}
                                                                                       

  2. Here is the Fortran code:
#include "Error.fpp"
MODULE CUDAOps
   USE cudafor
   USE CUDAUtils
   USE CPUOps
   IMPLICIT NONE

   CONTAINS

   ATTRIBUTES(GLOBAL) SUBROUTINE sumArraysGPU_1D(a,b,c,Nsize)
      IMPLICIT NONE

      !> intent variables
      INTEGER, INTENT(IN),    DIMENSION(:) :: a,b
      INTEGER, INTENT(INOUT), DIMENSION(:) :: c
      INTEGER, INTENT(IN),    VALUE        :: Nsize

      !> local variables
      INTEGER :: blockId, threadId

      ! get the blockId
      blockId = (blockIdx%x-1)

      ! get the threadId
      threadId = blockId * blockDim%x + threadIdx%x-1

      ! adjust to let the threadId to start from 1
      threadId = threadId + 1


      !WRITE(*,*) 'threadId = ',threadId

      ! set the maximum
      IF (threadId <= Nsize) THEN

         ! perform the sum
         c(threadId) = a(threadId) + b(threadId)
      END IF

   END SUBROUTINE sumArraysGPU_1D

   SUBROUTINE runSumArrays1D(xpow,blockSizeX)
     IMPLICIT NONE

     ! intent variables
     INTEGER, INTENT(IN) :: xpow,blockSizeX

     !> variables declaration
     ! size of the arrays
     INTEGER:: Nsize
     ! size of the GPU block
     INTEGER:: block_size

     ! other auxiliary variables
     INTEGER          :: i,j,istat
     REAL(KIND=wp)    :: t1,t2,time,timeGPU
     TYPE(cudaEvent)  :: startEvent, stopEvent

     ! host data allocation
     INTEGER, DIMENSION(:), ALLOCATABLE :: h_a, h_b, h_c, gpu_results
     ! device data allocation
     INTEGER, DIMENSION(:), ALLOCATABLE, DEVICE :: d_a, d_b, d_c

     ! define the GPU grid and block
     TYPE(DIM3)            :: grid, tBlock

     ! define data size and block size along X dimension
     Nsize = 2**xpow
     block_size = 2**blockSizeX

     ! allocate memory in host
     ALLOCATE(h_a(Nsize))
     ALLOCATE(h_b(Nsize))
     ALLOCATE(h_c(Nsize))

     ! allocate memory in device
     ALLOCATE(gpu_results(Nsize))
     ALLOCATE(d_a(Nsize))
     ALLOCATE(d_b(Nsize))
     ALLOCATE(d_c(Nsize))

     ! define block and grid
     tBlock = DIM3(block_size,1,1)
     grid   = DIM3((Nsize/tBlock%x),1,1)

     ! host data initialization
     CALL generateNumberByIntegerDivision(h_a,10,Nsize)
     CALL generateNumberByIntegerDivision(h_b,7,Nsize)

     WRITE(*,*) 'Kernel is going to be launched with'
     WRITE(*,*) 'Nsize = ',Nsize
     WRITE(*,*) 'xpow = ',xpow
     WRITE(*,*) 'blockSizeX = ',blockSizeX
     WRITE(*,*) 'block_size = ',block_size
     WRITE(*,*) 'grid.x = ',grid%x
     WRITE(*,*) 'grid.y = ',grid%y
     WRITE(*,*) 'grid.z = ',grid%z
     WRITE(*,*) 'block.x = ',tblock%x
     WRITE(*,*) 'block.y = ',tblock%y
     WRITE(*,*) 'block.z = ',tblock%z
     timeGPU = 0.0_wp


     CALL CPU_TIME(t1)
     ! perform the sum in serial using the CPU
     CALL sumArraysCPU(h_a,h_b,h_c)
     CALL CPU_TIME(t2)
     WRITE(*,*) 'time for the CPU implementation (ms) = ',(t2-t1)*1e3

     ! initialize CUDA events
     !istat = cudaEventCreate(startEvent)
     GPU_ERROR(cudaEventCreate(startEvent))
     istat = cudaEventCreate(stopEvent)

     ! copy the source data h_a from CPU to GPU
     istat = cudaEventRecord(startEvent,0)
     istat = cudaMemCpy(d_a,h_a,Nsize,cudaMemcpyHostToDevice)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, stopEvent)
     WRITE(*,*) 'time to transfer h_a to GPU (ms) = ',time
     timeGPU = timeGPU + time

     ! copy the source data h_b from CPU to GPU
     istat = cudaEventRecord(startEvent,0)
     istat = cudaMemCpy(d_b,h_b,Nsize,cudaMemcpyHostToDevice)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, stopEvent)
     WRITE(*,*) 'time to transfer h_b to GPU (ms) = ',time
     timeGPU = timeGPU + time

     ! perform the sum on the GPU
     istat = cudaEventRecord(startEvent,0)
     CALL sumArraysGPU_1D<<<grid, tBlock>>>(d_a,d_b,d_c,Nsize)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, stopEvent)
     WRITE(*,*) 'time to perform the sum on GPU (ms) = ',time
     timeGPU = timeGPU + time

     ! copy the data back from GPU to CPU
     istat = cudaEventRecord(startEvent,0)
     istat = cudaMemCpy(gpu_results,d_c,Nsize,cudaMemcpyDeviceToHost)
     istat = cudaEventRecord(stopEvent,0)
     istat = cudaEventSynchronize(stopEvent)
     istat = cudaEventElapsedTime(time, startEvent, stopEvent)
     WRITE(*,*) 'time to copy back data from GPU to CPU (ms) = ',time
     timeGPU = timeGPU + time
     WRITE(*,*) 'Total time to execute GPU (ms) :',timeGPU

     !WRITE(*,*) 'h_c = ',h_c
     !WRITE(*,*) 'gpu_results = ',gpu_results
     ! make a formal check of the result component by component
     CALL checkArraysCPU(h_c,gpu_results,Nsize)
     WRITE(*,*) 'SUM(h_c) = ',SUM(h_c)
     WRITE(*,*) 'SUM(gpu_results) = ',SUM(gpu_results)

     ! deallocate memory in host
     DEALLOCATE(h_a)
     DEALLOCATE(h_b)
     DEALLOCATE(h_c)

     ! deallocate memory in device
     DEALLOCATE(gpu_results)
     DEALLOCATE(d_a)
     DEALLOCATE(d_b)
     DEALLOCATE(d_c)

   END SUBROUTINE runSumArrays1D

END MODULE CUDAOps

PROGRAM main
   USE CPUOps
   USE CUDAOps

   IMPLICIT NONE

   ! declare local variables
   INTEGER :: i,xpow,sizeBlockX

   ! set the default values
   xpow       = 22
   sizeBlockX = 7

   ! launch the dedicated routine
   CALL runSumArrays1D(xpow,sizeBlockX)
STOP
END PROGRAM main

When I run both codes with nvprof using the default options (data size and block size), using the following command for both:

nvprof ./code.x

I get the following output.

  1. For the C code:
----------------------- SUM ARRAY EXAMPLE FOR NVPROF ------------------------ 

Running 1D grid 
Input size : 4194304 
Kernel is launched with grid(32768,1,1) and block(128,1,1) 
==33351== NVPROF is profiling process 33351, command: ./code_c.x
Arrays are same 
Sum array CPU execution time [ms] : 4.850000 
Sum array GPU execution time [ms] : 1.610000 
htod mem transfer time [ms] : 10.640000 
dtoh mem transfer time [ms] : 5.759000 
Total GPU execution time [ms] : 18.011000 
==33351== Profiling application: ./code_c.x
==33351== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   61.35%  10.715ms         2  5.3577ms  5.3566ms  5.3589ms  [CUDA memcpy HtoD]
                   30.94%  5.4040ms         1  5.4040ms  5.4040ms  5.4040ms  [CUDA memcpy DtoH]
                    5.81%  1.0147ms         1  1.0147ms  1.0147ms  1.0147ms  sum_arrays_1Dgrid_1Dblock(float*, float*, float*, int)
                    1.90%  331.81us         1  331.81us  331.81us  331.81us  [CUDA memset]
      API calls:   75.67%  60.242ms         3  20.081ms  55.398us  60.116ms  cudaMalloc
                   20.59%  16.393ms         3  5.4645ms  5.2016ms  5.7578ms  cudaMemcpy
                    2.00%  1.5906ms         1  1.5906ms  1.5906ms  1.5906ms  cudaDeviceSynchronize
                    1.47%  1.1673ms         3  389.10us  186.65us  497.81us  cudaFree
                    0.14%  107.71us       101  1.0660us      88ns  57.578us  cuDeviceGetAttribute
                    0.08%  65.483us         1  65.483us  65.483us  65.483us  cuDeviceGetName
                    0.02%  17.946us         1  17.946us  17.946us  17.946us  cudaMemset
                    0.02%  16.011us         1  16.011us  16.011us  16.011us  cudaLaunchKernel
                    0.01%  8.6300us         1  8.6300us  8.6300us  8.6300us  cuDeviceGetPCIBusId
                    0.00%  1.1600us         3     386ns     146ns     846ns  cuDeviceGetCount
                    0.00%     369ns         2     184ns      94ns     275ns  cuDeviceGet
                    0.00%     246ns         1     246ns     246ns     246ns  cuDeviceTotalMem
                    0.00%     194ns         1     194ns     194ns     194ns  cuModuleGetLoadingMode
                    0.00%     167ns         1     167ns     167ns     167ns  cuDeviceGetUuid

  2. For the Fortran code:
==38266== NVPROF is profiling process 38266, command: ./code_f.x 
 Kernel is going to be launched with
 Nsize =       4194304
 xpow =            22
 blockSizeX =             7
 block_size =           128
 grid.x =         32768
 grid.y =             1
 grid.z =             1
 block.x =           128
 block.y =             1
 block.z =             1
 time for the CPU implementation (ms) =     4.997969    
 time to transfer h_a to GPU (ms) =     5.680192    
 time to transfer h_b to GPU (ms) =     5.561248    
 time to perform the sum on GPU (ms) =     1.510400    
 time to copy back data from GPU to CPU (ms) =     7.039712    
 Total time to execute GPU (ms) :    19.79155    
 Arrays are the same!
 SUM(h_c) =    1592097881
 SUM(gpu_results) =    1592097881
==38266== Profiling application: ./code_f.x 
==38266== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   58.75%  10.911ms         5  2.1822ms  1.2160us  5.4682ms  [CUDA memcpy HtoD]
                   35.16%  6.5297ms         1  6.5297ms  6.5297ms  6.5297ms  [CUDA memcpy DtoH]
                    6.10%  1.1321ms         1  1.1321ms  1.1321ms  1.1321ms  cudaops_sumarraysgpu_1d_
      API calls:   87.80%  150.69ms         6  25.115ms  2.5020us  150.30ms  cudaMalloc
                    9.95%  17.072ms         6  2.8454ms  4.1870us  7.0309ms  cudaMemcpy
                    1.39%  2.3788ms         6  396.47us  2.2640us  1.1368ms  cudaFree
                    0.72%  1.2281ms         4  307.02us  6.6590us  629.72us  cudaEventSynchronize
                    0.05%  93.254us       101     923ns      92ns  41.961us  cuDeviceGetAttribute
                    0.04%  64.982us         1  64.982us  64.982us  64.982us  cuDeviceGetName
                    0.02%  36.395us         8  4.5490us  1.1180us  13.299us  cudaEventRecord
                    0.02%  31.801us         2  15.900us     873ns  30.928us  cudaEventCreate
                    0.01%  18.638us         1  18.638us  18.638us  18.638us  cudaLaunchKernel
                    0.00%  6.3520us         4  1.5880us     970ns  2.5790us  cudaEventElapsedTime
                    0.00%  4.9980us         1  4.9980us  4.9980us  4.9980us  cuDeviceGetPCIBusId
                    0.00%  1.5290us         3     509ns     165ns  1.1890us  cuDeviceGetCount
                    0.00%     444ns         2     222ns      92ns     352ns  cuDeviceGet
                    0.00%     279ns         1     279ns     279ns     279ns  cuModuleGetLoadingMode
                    0.00%     248ns         1     248ns     248ns     248ns  cuDeviceTotalMem
                    0.00%     164ns         1     164ns     164ns     164ns  cuDeviceGetUuid

What I am trying to understand here is why the number of calls to cudaMalloc, cudaMemcpy and cudaFree is consistent with what I wrote in the C code, but not in the Fortran code.

Specifically, although I only allocate 3 arrays on the device, cudaMalloc shows that it was called 6 times. Why?

I am trying to understand whether there is a bug/leak in my Fortran code, or whether this is normal behavior, and if so, why. Thanks.

I tried playing with the allocation statements of the d_a, d_b and d_c arrays in Fortran. It looks as if the kernel call performs additional cudaMalloc calls and memory copies on top of the ones that are already done explicitly.
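
One experiment to narrow this down (just a sketch, not part of the original code: the MemProbe module and the checkFree helper are made-up names, and it assumes the cudaMemGetInfo interface provided by the cudafor module) is to print the free device memory around the ALLOCATE statements and around the kernel launch, to see whether the hidden allocations happen at allocation time or at launch time. Note that the first kernel launch can also consume some device memory for context resources, so the numbers are only indicative.

MODULE MemProbe
   USE cudafor
   IMPLICIT NONE

   CONTAINS

   !> print how much device memory is currently free
   SUBROUTINE checkFree(label)
      IMPLICIT NONE
      CHARACTER(LEN=*), INTENT(IN)  :: label
      INTEGER(KIND=cuda_count_kind) :: freeMem, totalMem
      INTEGER                       :: istat
      istat = cudaMemGetInfo(freeMem, totalMem)
      WRITE(*,*) label,' free device memory (bytes) = ',freeMem
   END SUBROUTINE checkFree
END MODULE MemProbe

! usage inside runSumArrays1D, around the existing statements:
!    CALL checkFree('before ALLOCATE(d_a)')
!    ALLOCATE(d_a(Nsize))
!    CALL checkFree('after ALLOCATE(d_a)')
!    ...
!    CALL checkFree('before kernel launch')
!    CALL sumArraysGPU_1D<<<grid, tBlock>>>(d_a,d_b,d_c,Nsize)
!    istat = cudaDeviceSynchronize()
!    CALL checkFree('after kernel launch')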

----------------- EDIT: Another question. If I print some nvprof-specific metrics, for example:

nvprof --metrics gld_efficiency,sm_efficiency,achieved_occupancy ./code.x
  1. This is the output for C:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "NVIDIA GeForce MX330 (0)"
    Kernel: sum_arrays_1Dgrid_1Dblock(float*, float*, float*, int)
          1                            gld_efficiency             Global Memory Load Efficiency     100.00%     100.00%     100.00%
          1                             sm_efficiency                   Multiprocessor Activity      99.50%      99.50%      99.50%
          1                        achieved_occupancy                        Achieved Occupancy    0.922875    0.922875    0.922875
  2. This is the output for Fortran:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "NVIDIA GeForce MX330 (0)"
    Kernel: cudaops_sumarraysgpu_1d_
          1                            gld_efficiency             Global Memory Load Efficiency      67.86%      67.86%      67.86%
          1                             sm_efficiency                   Multiprocessor Activity      99.62%      99.62%      99.62%
          1                        achieved_occupancy                        Achieved Occupancy    0.877743    0.877743    0.877743

You can clearly see the difference in global memory load efficiency. Are the two issues related?

1 Answer

In CUDA, at least for Fortran arrays, metadata is needed in order to do (CUDA) Fortran device code generation.
This metadata results in two allocations per Fortran array: one for the actual data, and one for the metadata.
An example of such metadata would be the "width" of the array. (I'm using the term Fortran array loosely here. You won't see this kind of metadata for every kind of device allocation.)
Naturally, since the metadata is needed (as determined in this case by the CUDA Fortran compiler) and is allocated separately, you will also see 2 copy operations per array, and correspondingly 2 free operations.
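
As a rough illustration of the point above (a minimal sketch, not taken from the question or the answer: the ReproKernel/doubleIt names are made up, and it assumes the NVHPC/PGI CUDA Fortran toolchain, e.g. a .cuf source built with nvfortran), a program with a single allocatable device array used as an assumed-shape kernel argument would be expected, following this reasoning, to show two cudaMalloc calls and two cudaFree calls for that one array under nvprof, plus one more small host-to-device cudaMemcpy than the copies written explicitly:

MODULE ReproKernel
   USE cudafor
   IMPLICIT NONE

   CONTAINS

   !> trivial kernel: double each element of an assumed-shape device array
   ATTRIBUTES(GLOBAL) SUBROUTINE doubleIt(a,Nsize)
      IMPLICIT NONE
      ! assumed-shape dummy argument: the array bounds (metadata) must be available on the device
      INTEGER, INTENT(INOUT), DIMENSION(:) :: a
      INTEGER, INTENT(IN),    VALUE        :: Nsize
      INTEGER :: tid
      tid = (blockIdx%x-1) * blockDim%x + threadIdx%x
      IF (tid <= Nsize) a(tid) = 2 * a(tid)
   END SUBROUTINE doubleIt
END MODULE ReproKernel

PROGRAM repro
   USE cudafor
   USE ReproKernel
   IMPLICIT NONE

   INTEGER, PARAMETER :: Nsize = 1024
   INTEGER, DIMENSION(:), ALLOCATABLE         :: h_a
   INTEGER, DIMENSION(:), ALLOCATABLE, DEVICE :: d_a
   INTEGER :: istat

   ALLOCATE(h_a(Nsize))
   h_a = 1

   ! one allocatable device array: expect 2 x cudaMalloc (data + metadata) and 2 x cudaFree
   ALLOCATE(d_a(Nsize))

   ! one explicit HtoD copy; the metadata copy should add one extra small HtoD transfer
   istat = cudaMemcpy(d_a,h_a,Nsize,cudaMemcpyHostToDevice)

   CALL doubleIt<<<(Nsize+127)/128,128>>>(d_a,Nsize)
   istat = cudaDeviceSynchronize()

   ! one explicit DtoH copy
   istat = cudaMemcpy(h_a,d_a,Nsize,cudaMemcpyDeviceToHost)
   WRITE(*,*) 'h_a(1) after kernel = ',h_a(1)   ! should print 2

   DEALLOCATE(d_a)
   DEALLOCATE(h_a)
END PROGRAM repro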

Thanks Robert for your reply. I have edited my original question and added another question: if I print some nvprof metrics, I see a clear difference in global memory load efficiency. Are the two issues related? If not, what do you think could cause the difference, given that the two codes are very similar? Thanks. - Principio Tudisco
@RobertCrovella please let me know in case you know the answer to the second question as well. Thanks. - Principio Tudisco
I wouldn't be able to say without further study. The idea behind SO is not that you ask a question, get an answer, and then edit the question to add additional questions. I suggest you ask a new question. - Robert Crovella
