我的CUDA内核中的printf()没有产生任何输出

Question

我的CUDA内核中的printf()没有产生任何输出

7

我在我的CUDA程序中添加了一些printf()语句。

// Forward declaration of the kernel that is defined in Kernel.cu.
// NOTE(review): the declaration combines __device__ and __global__; the CUDA
// Programming Guide lists these execution space specifiers as mutually
// exclusive -- confirm whether plain __global__ was intended.
__device__ __global__ void Kernel(float *, float * ,int );
// Host-side wrapper (excerpt) that launches Kernel and prints a marker line
// before and after the launch. The dotted lines stand for code omitted from
// the excerpt; presumably a_d, b_d, dimGrid and dimBlock are set up there.
void DeviceFunc(float *temp_h , int numvar , float *temp1_h)
{ .....
    // Launch the kernel, bracketed by host-side trace output.
    printf("calling kernel\n");
    Kernel<<<dimGrid , dimBlock>>>(a_d , b_d , numvar);
    // NOTE(review): no error check or synchronization follows the launch in the
    // visible excerpt, so a failed launch would pass unnoticed at this point.
    printf("kernel called\n");
  ....
}

// Program entry point (excerpt): prints a marker line before and after the
// call to DeviceFunc. The dotted lines stand for code omitted from the excerpt.
int main(int argc , char **argv)
{   ....
    printf("beforeDeviceFunc\n\n");
    DeviceFunc(a_h , numvar , b_h); // Showing the data
    printf("after DeviceFunc\n\n");
    ....
}

在Kernel.cu文件中，我写了以下内容：

#include<cuda.h>
#include <stdio.h>
// Kernel (excerpt): each thread copies one element of a_d into a 16x16 shared
// tile and then prints its thread indices and the size argument.
// The remainder of the body is omitted from the excerpt.
// NOTE(review): __device__ and __global__ are combined here; the CUDA
// Programming Guide lists them as mutually exclusive -- confirm that plain
// __global__ was intended.
__device__ __global__ void Kernel(float *a_d , float *b_d ,int size)
{
    // Only threadIdx is used for indexing; blockIdx does not appear in the
    // visible code.
    int idx = threadIdx.x ;
    int idy = threadIdx.y ;
    // Statically sized tile in shared memory.
    // NOTE(review): the tile is 16x16, so the store below stays in bounds only
    // when threadIdx.x and threadIdx.y are both below 16 -- confirm dimBlock.
    __shared__ float temp[16][16];

    // Copy one element per thread from a_d into the tile; rows of a_d are
    // addressed with a stride of (size + 1) elements.
    temp[idy][idx] = a_d[(idy * (size+1)) + idx] ;
    // Device-side printf; the format string has no trailing newline.
    printf("idx=%d, idy=%d, size=%d", idx, idy, size);
    ....
}

然后我使用-arch=sm_20进行编译，就像这样:

nvcc -c -arch sm_20 main.cu
nvcc -c -arch sm_20 Kernel.cu
nvcc -arch sm_20 main.o Kernel.o -o main

现在当我运行程序时，我看到：

beforeDeviceFunc

calling kernel
kernel called
after DeviceFunc

因此，内核中的printf()未被打印。我该如何解决这个问题？

- mahmood

在我的情况下，cudaDeviceSynchronize() 没有返回任何错误。但是我注意到块维度太大了（32 x 32），创建较小的线程块可以解决问题。 - atoMerz

2个回答

0

我刚刚遇到了同样的错误，将块大小减小到512有所帮助。根据文档，最大块大小可以是512或1024。

我编写了一个简单的测试，结果显示我的GTX 1070的最大块大小为1024。更新：您可以使用cudaError_t cudaPeekAtLastError()检查内核是否真正启动：如果内核已成功启动，它会返回cudaSuccess；只有在此之后才值得调用cudaError_t cudaDeviceSynchronize()。

测试块大小为1023

测试块大小为1024

测试块大小为1025

CUDA错误：无效的配置参数

块最大尺寸为1024

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <iostream>

// Kernel: every thread stores 1 into the element of t selected by its
// threadIdx.x. Only threadIdx.x is used, so t must hold at least as many
// ints as there are threads along x in the block.
__global__ void set1(int* t)
{
    const unsigned int tid = threadIdx.x;
    t[tid] = 1;
}

// Reports whether a CUDA call failed.
// Returns false for cudaSuccess; for any other status it writes the
// corresponding error string to stderr and returns true.
inline bool failed(cudaError_t error)
{
    const bool isFailure = (error != cudaSuccess);
    if (isFailure)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
    }
    return isFailure;
}

// Probes the largest 1-D block size the device accepts.
//
// For blockSize = 1 .. 4095 the program allocates a managed buffer of
// blockSize ints, clears it, launches set1 with a single block of blockSize
// threads, and verifies that every element became 1. The first failing size
// ends the probe; the size before it is reported as the maximum.
//
// Returns 0 after printing the maximum, or 1 if not even a block of one
// thread could be run.
int main()
{
    int blockSize;
    for (blockSize = 1; blockSize < 1 << 12; blockSize++)
    {
        printf("Testing block size of %d\n", blockSize);

        // Initialized so the pointer never holds an indeterminate value.
        int* t = nullptr;
        if (failed(cudaMallocManaged(&t, blockSize * sizeof(int))))
        {
            // Nothing was allocated, so there is nothing to free.
            break;
        }

        // Clear every element (not just t[0]); otherwise stale 1s left in
        // reused memory could make the verification below pass even though
        // the kernel never ran.
        for (int i = 0; i < blockSize; i++)
            t[i] = 0;

        set1 <<<1, blockSize>>> (t);

        // A bad launch configuration is reported right after the launch;
        // faults during execution surface at the synchronization. The
        // short-circuit skips the synchronization when the launch itself failed.
        bool ok = !failed(cudaPeekAtLastError())
               && !failed(cudaDeviceSynchronize());

        // Verify on the host that each thread wrote its element.
        for (int i = 0; ok && i < blockSize; i++)
        {
            if (1 != t[i])
            {
                printf("CUDA error: t[%d] = %d but not 1\n", i, t[i]);
                ok = false;
            }
        }

        // Single cleanup point for both the success and the failure path.
        failed(cudaFree(t));
        if (!ok)
            break;
    }

    // The loop stops on the first size that failed (or one past the last size
    // tested), so the last size that worked is one less.
    blockSize--;
    if(blockSize <= 0)
    {
        printf("CUDA error: block size cannot be 0\n");
        return 1;
    }
    printf("Block maximum size is %d", blockSize);
    return  0;
}

附注：请注意，块大小唯一需要关注的是warp粒度（目前为32），因此只要0 == yourBlockSize % 32，warp就能得到非常有效的利用。让块大于32的唯一原因是代码需要同步：同步只能在同一个块内的线程之间进行，这迫使开发人员使用单个大块而不是许多小块。因此，使用较多的小块运行可能比使用较少的大块运行更高效。

- Darth Jurassic

你的错误检查是不正确的，你的这句话也不正确：“请注意，如果提供的块大小超过了最大块大小，内核就会在没有任何异常、错误等情况下停止运行”。而且块大小限制等内容在文档中都有详细描述。 - talonmies

感谢您的反馈。正如我所提到的“至少我不知道从哪里获取[错误]”，我已经发现调用cudaPeekAtLastError有所帮助，并更新了答案。 - Darth Jurassic

根据GPU的CUDA计算能力，每个多处理器（SM）上可驻留的块数会受到不同程度的限制。例如，计算能力2.x支持每个SM 1536个线程，但仅支持8个块。如果每个块只使用一个完整的warp，那么每个SM最多只有256个线程，这会使隐藏延迟更加困难。除了同步和延迟之外，缓存利用也是让线程在同一个SM上运行的另一个原因。另一种提高缓存利用率的方法是在内核中使用for循环，同时保持足够多的线程（以隐藏延迟）和足够多的块（以利用所有SM）。 - Sebastian

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- tera · Accepted Answer

printf() 输出仅在内核成功完成后显示，因此请检查所有 CUDA 函数调用的返回代码，并确保没有报告任何错误。

此外，printf() 输出仅在程序的某些点上显示。编程指南附录 B.32.2 列出了这些点。

这些点包括：

- 通过<<<>>>或cuLaunchKernel()启动内核时（在启动开始时；如果CUDA_LAUNCH_BLOCKING环境变量设置为1，则在启动结束时也会显示）；
- 通过cudaDeviceSynchronize()、cuCtxSynchronize()、cudaStreamSynchronize()、cuStreamSynchronize()、cudaEventSynchronize()或cuEventSynchronize()进行同步时；
- 通过任何阻塞版本的cudaMemcpy*()或cuMemcpy*()进行内存复制时；
- 通过cuModuleLoad()或cuModuleUnload()加载/卸载模块时；
- 通过cudaDeviceReset()或cuCtxDestroy()销毁上下文时；
- 在执行由cudaStreamAddCallback()或cuStreamAddCallback()添加的流回调之前。

要检查这是否是您的问题，请将以下代码放在内核调用之后：

{
    // Block until the device has finished; this call returns any error the
    // preceding kernel produced.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    if (cudaSuccess != syncStatus)
    {
        const char* reason = cudaGetErrorString(syncStatus);
        printf("kernel launch failed with error \"%s\".\n", reason);
    }
}

你应该看到你的内核输出或错误消息之一。

更方便的是，如果你在cuda-memcheck下运行可执行文件，它会自动为你检查所有返回代码。虽然你应该始终检查错误，但在解决具体问题时，这非常方便。