逆FFT中的缩放问题在cuFFT中的解决方法

5
每当我使用cuFFT绘制程序获得的值并将结果与Matlab进行比较时,我得到相同形状的图形,并且最大值和最小值的数值在相同点上。然而,cuFFT产生的值比Matlab产生的值要大得多。Matlab代码如下:
fs = 1000;                              % sample freq
D = [0:1:4]';                           % pulse delay times
t = 0 : 1/fs : 4000/fs;                 % signal evaluation time
w = 0.5;                                % width of each pulse
yp = pulstran(t,D,'rectpuls',w);
filt = conj(fliplr(yp));
xx = fft(yp,1024).*fft(filt,1024);
xx = (abs(ifft(xx)));    

同样的输入,CUDA代码如下:

cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD);
cufftExecC2C(plan, (cufftComplex *)d_filter_signal, (cufftComplex *)d_filter_signal,     CUFFT_FORWARD);
ComplexPointwiseMul<<<blocksPerGrid, threadsPerBlock>>>(d_signal, d_filter_signal, NX);
cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE);

cuFFT可以进行batch size为2的1024点FFT计算。
使用缩放因子NX=1024时,得到的值不正确。请告诉我如何解决。

我认为在cufft内部直接处理缩放没有简单的方法。你可以编写自己的核函数或稍后使用thrust来缩小信号。 - Pavan Yalamanchili
1
请注意,在cufft示例代码中建议通过数据元素数量进行除法运算,以在CUFFT_INVERSE操作之后返回原始数据。 - Robert Crovella
3个回答

5
这是晚些时候的答案,以将此问题从未回答的列表中删除。
您没有提供足够的信息来诊断您的问题,因为您没有指定设置cuFFT计划的方式。您甚至没有指定Matlab和cuFFT信号的形状是否完全相同(因此只需缩放),还是近似相同。但是,让我提出以下两点观察:
  1. yp向量有4000个元素;相反,通过将信号截断为1024个元素,您正在执行FFT:fft(yp,1024);
  2. 逆cuFFT不通过向量元素的数量来进行缩放。
出于方便起见(对其他用户可能有用),我在下面报告了一个简单的FFT-IFFT方案,该方案还使用CUDA Thrust库执行缩放。
#include <cufft.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

/*********************/
/* SCALE BY CONSTANT */
/*********************/
class Scale_by_constant
{
    private:
        float c_;

    public:
        Scale_by_constant(float c) { c_ = c; };

        __host__ __device__ float2 operator()(float2 &a) const
        {
            float2 output;

            output.x = a.x / c_;
            output.y = a.y / c_;

            return output;
        }

};

int main(void){

    const int N=4;

    // --- Setting up input device vector    
    thrust::device_vector<float2> d_vec(N,make_cuComplex(1.f,2.f));

    cufftHandle plan;
    cufftPlan1d(&plan, N, CUFFT_C2C, 1);

    // --- Perform in-place direct Fourier transform
    cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_FORWARD);

    // --- Perform in-place inverse Fourier transform
    cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_INVERSE);

    thrust::transform(d_vec.begin(), d_vec.end(), d_vec.begin(), Scale_by_constant((float)(N)));

    // --- Setting up output host vector    
    thrust::host_vector<float2> h_vec(d_vec);

    for (int i=0; i<N; i++) printf("Element #%i; Real part = %f; Imaginary part: %f\n",i,h_vec[i].x,h_vec[i].y);

    getchar();
}

我喜欢这种简单的方法,但是你认为与使用FFT回调相比性能如何?使用Thrust版本是否需要经过全局内存的来回传输? - Andreas Yankopolus
@AndreasYankopolus 我已经添加了进一步的答案,分析了性能。 - Vitality

4
通过cuFFT回调功能的引入,可以通过将归一化操作定义为__device__函数,直接嵌入到cufftExecC2C调用中所需的逆FFT的标准化中。
除了cuFFT用户指南外,有关cuFFT回调功能,请参见: CUDA专业提示:使用cuFFT回调进行自定义数据处理 以下是通过cuFFT回调实现IFFT标准化的示例。
    #include <stdio.h>
    #include <assert.h>

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    #include <cufft.h>
    #include <cufftXt.h>

    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }

    /*********************/
    /* CUFFT ERROR CHECK */
    /*********************/
    // See https://dev59.com/THHYa4cB1Zd3GeqPMIG-
    #ifdef _CUFFT_H_
    static const char *_cudaGetErrorEnum(cufftResult error)
    {
        switch (error)
        {
            case CUFFT_SUCCESS:
                return "CUFFT_SUCCESS";

            case CUFFT_INVALID_PLAN:
                return "CUFFT_INVALID_PLAN";

            case CUFFT_ALLOC_FAILED:
                return "CUFFT_ALLOC_FAILED";

            case CUFFT_INVALID_TYPE:
                 return "CUFFT_INVALID_TYPE";

            case CUFFT_INVALID_VALUE:
                return "CUFFT_INVALID_VALUE";

            case CUFFT_INTERNAL_ERROR:
                return "CUFFT_INTERNAL_ERROR";

            case CUFFT_EXEC_FAILED:
                return "CUFFT_EXEC_FAILED";

            case CUFFT_SETUP_FAILED:
                return "CUFFT_SETUP_FAILED";

            case CUFFT_INVALID_SIZE:
                return "CUFFT_INVALID_SIZE";

            case CUFFT_UNALIGNED_DATA:
                return "CUFFT_UNALIGNED_DATA";
        }

        return "<unknown>";
    }
    #endif

    #define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
    inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
    {
        if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                                _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
        }
    }

    __device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {

        float *scaling_factor = (float*)callerInfo;

        float2 output;
        output.x = cuCrealf(element);
        output.y = cuCimagf(element);

        output.x = output.x / scaling_factor[0];
        output.y = output.y / scaling_factor[0];

        ((float2*)dataOut)[offset] = output;
}

    __device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;

    /********/
    /* MAIN */
    /********/
    int main() {

        const int N = 16;

        cufftHandle plan;

        float2 *h_input             = (float2*)malloc(N*sizeof(float2));
        float2 *h_output1           = (float2*)malloc(N*sizeof(float2));
        float2 *h_output2           = (float2*)malloc(N*sizeof(float2));

        float2 *d_input;            gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
        float2 *d_output1;          gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
        float2 *d_output2;          gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));

        float *h_scaling_factor     = (float*)malloc(sizeof(float));
        h_scaling_factor[0] = 16.0f;
        float *d_scaling_factor;    gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
        gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));

        for (int i=0; i<N; i++) {
            h_input[i].x = 1.0f;
            h_input[i].y = 0.f;
        }

        gpuErrchk(cudaMemcpy(d_input, h_input, N*sizeof(float2), cudaMemcpyHostToDevice));

        cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));

        cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
        gpuErrchk(cudaMemcpy(h_output1, d_output1, N*sizeof(float2), cudaMemcpyDeviceToHost));
        for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);

        cufftCallbackStoreC h_storeCallbackPtr;
        gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));

        cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));

        cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
        gpuErrchk(cudaMemcpy(h_output2, d_output2, N*sizeof(float2), cudaMemcpyDeviceToHost));
        for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);

        cufftSafeCall(cufftDestroy(plan));

        gpuErrchk(cudaFree(d_input));
        gpuErrchk(cudaFree(d_output1));
        gpuErrchk(cudaFree(d_output2));

        return 0;
    }

编辑

在调用时,CUFFT_CB_ST_COMPLEX指定了回调操作执行的“时刻”。请注意,您可以在同一个cuFFT计划中使用相同的加载和存储回调。


如果在创建计划后立即设置回调函数,那么似乎缩放将同时发生在正向和反向FFT上,是吗?因此最好为正向FFT(无缩放)和反向FFT(有缩放)创建单独的计划。 - Andreas Yankopolus

2

性能

我将进一步回答在这种特定情况下与同一代码的非回调版本相比较,用于IFFT缩放。 我正在使用的代码是:

#include <stdio.h>
#include <assert.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cufft.h>
#include <cufftXt.h>

#include <thrust/device_vector.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

//#define DISPLAY

/*******************************/
/* THRUST FUNCTOR IFFT SCALING */
/*******************************/
class Scale_by_constant
{
    private:
        float c_;

    public:
        Scale_by_constant(float c) { c_ = c; };

        __host__ __device__ float2 operator()(float2 &a) const
        {
            float2 output;

            output.x = a.x / c_;
            output.y = a.y / c_;

            return output;
        }

};

/**********************************/
/* IFFT SCALING CALLBACK FUNCTION */
/**********************************/
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {

    float *scaling_factor = (float*)callerInfo;

    float2 output;
    output.x = cuCrealf(element);
    output.y = cuCimagf(element);

    output.x = output.x / scaling_factor[0];
    output.y = output.y / scaling_factor[0];

    ((float2*)dataOut)[offset] = output;
}

__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;

/********/
/* MAIN */
/********/
int main() {

    const int N = 100000000;

    cufftHandle plan;           cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));

    TimingGPU timerGPU;

    float2 *h_input             = (float2*)malloc(N*sizeof(float2));
    float2 *h_output1           = (float2*)malloc(N*sizeof(float2));
    float2 *h_output2           = (float2*)malloc(N*sizeof(float2));

    float2 *d_input;            gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
    float2 *d_output1;          gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
    float2 *d_output2;          gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));

    // --- Callback function parameters
    float *h_scaling_factor     = (float*)malloc(sizeof(float));
    h_scaling_factor[0] = 16.0f;
    float *d_scaling_factor;    gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
    gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));

    // --- Initializing the input on the host and moving it to the device
    for (int i = 0; i < N; i++) {
        h_input[i].x = 1.0f;
        h_input[i].y = 0.f;
    }
    gpuErrchk(cudaMemcpy(d_input, h_input, N * sizeof(float2), cudaMemcpyHostToDevice));

    // --- Execute direct FFT on the device and move the results to the host
    cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
#ifdef DISPLAY
    gpuErrchk(cudaMemcpy(h_output1, d_output1, N * sizeof(float2), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
#endif

    // --- Execute inverse FFT with subsequent scaling on the device and move the results to the host
    timerGPU.StartCounter();
    cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
    thrust::transform(thrust::device_pointer_cast(d_output2), thrust::device_pointer_cast(d_output2) + N, thrust::device_pointer_cast(d_output2), Scale_by_constant((float)(N)));
#ifdef DISPLAY
    gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
    printf("Timing NO callback %f\n", timerGPU.GetCounter());

    // --- Setup store callback
//    timerGPU.StartCounter();
    cufftCallbackStoreC h_storeCallbackPtr;
    gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
    cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));

    // --- Execute inverse callback FFT on the device and move the results to the host
    timerGPU.StartCounter();
    cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
#ifdef DISPLAY
    gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
    printf("Timing callback %f\n", timerGPU.GetCounter());

    cufftSafeCall(cufftDestroy(plan));

    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output1));
    gpuErrchk(cudaFree(d_output2));

    return 0;
}

对于如此大的一维数组和简单的处理(缩放),在Kepler K20c上的时间如下:

Non-callback 69.029762 ms
Callback     65.868607 ms

所以,改进并不是很大。我认为看到的改进是因为在非回调情况下避免了单独的内核调用。对于较小的1D数组,要么没有改进,要么非回调情况下运行得更快。


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接