Implementing an FIR filter (as a 1D convolution) in CUDA

I am trying to implement an FIR (finite impulse response) filter in CUDA. My approach is quite simple and looks something like this:

#include <cuda.h>

__global__ void filterData(const float *d_data,
                           const float *d_numerator, 
                           float *d_filteredData, 
                           const int numeratorLength,
                           const int filteredDataLength)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    float sum = 0.0f;

    if (i < filteredDataLength)
    {
        for (int j = 0; j < numeratorLength; j++)
        {
            // The first (numeratorLength-1) elements contain the filter state
            sum += d_numerator[j] * d_data[i + numeratorLength - j - 1];
        }

        // Store inside the bounds check so threads past the end of the output
        // array (when its length is not a multiple of the block size) do not
        // write out of range.
        d_filteredData[i] = sum;
    }
}

int main(void)
{
    // (Skipping error checks to make code more readable)

    int dataLength = 18042;
    int filteredDataLength = 16384;
    int numeratorLength = 1659;

    // Pointers to data, filtered data and filter coefficients
    // (Skipping how these are read into the arrays)
    float *h_data = new float[dataLength];
    float *h_filteredData = new float[filteredDataLength];
    float *h_filter = new float[numeratorLength];


    // Create device pointers
    float *d_data = nullptr;
    cudaMalloc((void **)&d_data, dataLength * sizeof(float));

    float *d_numerator = nullptr;
    cudaMalloc((void **)&d_numerator, numeratorLength * sizeof(float));

    float *d_filteredData = nullptr;
    cudaMalloc((void **)&d_filteredData, filteredDataLength * sizeof(float));


    // Copy data to device
    cudaMemcpy(d_data, h_data, dataLength * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_numerator, h_filter, numeratorLength * sizeof(float), cudaMemcpyHostToDevice);

    // Launch the kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (filteredDataLength + threadsPerBlock - 1) / threadsPerBlock;
    filterData<<<blocksPerGrid,threadsPerBlock>>>(d_data, d_numerator, d_filteredData, numeratorLength, filteredDataLength);

    // Copy results to host
    cudaMemcpy(h_filteredData, d_filteredData, filteredDataLength * sizeof(float), cudaMemcpyDeviceToHost);

    // Clean up
    cudaFree(d_data);
    cudaFree(d_numerator);
    cudaFree(d_filteredData);

    // Do stuff with h_filteredData...

    // Clean up some more
    delete [] h_data;
    delete [] h_filteredData;
    delete [] h_filter;
}

The filter works, but as I am new to CUDA programming I am not sure how to go about optimizing it.
A slight problem I can see is that in the application where I intend to use the filter, the values of dataLength, filteredDataLength and numeratorLength are not known in advance. Also, even though dataLength is a multiple of 32 in the code above, this is not guaranteed to be the case in the final application.
When I compare my code above against ArrayFire, my code takes about three times longer to execute.
Does anyone have any ideas on how to speed things up?
EDIT: Have changed all filterLength to numeratorLength.

Are numeratorLength and filterLength the same thing? I don't see a definition of numeratorLength anywhere in what you have posted. This problem is essentially a 1D stencil problem. A standard optimization for stencil problems is to bring a portion of the input data into shared memory, enough for the threads of a block to compute their outputs, and then have those threads work out of the shared-memory copy. - Robert Crovella
If you end up beating ArrayFire, do let us know! If not, you can always just keep using ArrayFire, since it is faster :) - arrayfire
@RobertCrovella Yes, numeratorLength and filterLength are the same thing. I decided to change the name but apparently missed a few spots. My bad, sorry. I have edited the original post so it only uses numeratorLength. Thanks for the suggestion to use shared memory. I have read that it is much faster than global memory, but I am not quite sure how best to use it, since shared memory is limited in size and the filter length can get quite long. I will experiment and see how it works out. - Elfendahl
@accelereyes Hello, AccelerEyes! I did wonder whether you would notice my little post. :) First of all, let me say that I really like ArrayFire and I am not trying to beat you or anything. For me this is mainly an exercise in learning CUDA and how GPUs work, and it has been great fun! I see ArrayFire as the benchmark for how fast the code can get, but I do not expect to reach the same speed. - Elfendahl
3 Answers

Besides my other answer, which I believe is more convenient for convolution kernels of long duration, below I report a different implementation that is more in line with the OP's initial attempt and, I believe, more convenient for convolution kernels of short duration. This implementation is based on a hand-written kernel exploiting caching in shared memory. More details can be found in the book by D.B. Kirk and W.-m. W. Hwu,
Programming Massively Parallel Processors, Second Edition: A Hands-on Approach.
#include <stdio.h>
#include <stdlib.h>

#include "TimingGPU.cuh"
#include "Utilities.cuh"

#define RG          10
#define BLOCKSIZE   8

/****************/
/* CPU FUNCTION */
/****************/
void h_convolution_1D(const float * __restrict__ h_Signal, const float * __restrict__ h_ConvKernel, float * __restrict__ h_Result_CPU, 
                      const int N, const int K) {

    for (int i = 0; i < N; i++) {

        float temp = 0.f;

        int N_start_point = i - (K / 2);

        for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
            temp += h_Signal[N_start_point+ j] * h_ConvKernel[j];
        }

        h_Result_CPU[i] = temp;
    }
}

/********************/
/* BASIC GPU KERNEL */
/********************/
__global__ void d_convolution_1D_basic(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU, 
                                       const int N, const int K) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    float temp = 0.f;

    int N_start_point = i - (K / 2);

    for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
        temp += d_Signal[N_start_point+ j] * d_ConvKernel[j];
    }

    // Guard the store: the rounded-up grid may launch threads with i >= N
    if (i < N) d_Result_GPU[i] = temp;
}

/***************************/
/* GPU KERNEL WITH CACHING */
/***************************/
__global__ void d_convolution_1D_caching(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU, 
                                         const int N, const int K) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ float d_Tile[BLOCKSIZE];

    // Guard the tile load: the rounded-up grid may launch threads with i >= N
    d_Tile[threadIdx.x] = (i < N) ? d_Signal[i] : 0.f;
    __syncthreads();

    float temp = 0.f;

    int N_start_point = i - (K / 2);

    for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {

            if ((N_start_point + j >= blockIdx.x * blockDim.x) && (N_start_point + j < (blockIdx.x + 1) * blockDim.x))

                // --- The signal element is in the tile loaded in the shared memory
                temp += d_Tile[threadIdx.x + j - (K / 2)] * d_ConvKernel[j]; 

            else

                // --- The signal element is not in the tile loaded in the shared memory
                temp += d_Signal[N_start_point + j] * d_ConvKernel[j];

    }

    if (i < N) d_Result_GPU[i] = temp;
}

/********/
/* MAIN */
/********/
int main(){

    const int N = 15;           // --- Signal length
    const int K = 5;            // --- Convolution kernel length

    float *h_Signal         = (float *)malloc(N * sizeof(float));
    float *h_Result_CPU     = (float *)malloc(N * sizeof(float));
    float *h_Result_GPU     = (float *)malloc(N * sizeof(float));
    float *h_ConvKernel     = (float *)malloc(K * sizeof(float));

    float *d_Signal;        gpuErrchk(cudaMalloc(&d_Signal,     N * sizeof(float)));
    float *d_Result_GPU;    gpuErrchk(cudaMalloc(&d_Result_GPU, N * sizeof(float)));
    float *d_ConvKernel;    gpuErrchk(cudaMalloc(&d_ConvKernel, K * sizeof(float)));

    for (int i=0; i < N; i++) { h_Signal[i] = (float)(rand() % RG); }

    for (int i=0; i < K; i++) { h_ConvKernel[i] = (float)(rand() % RG); }

    gpuErrchk(cudaMemcpy(d_Signal,      h_Signal,       N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_ConvKernel,  h_ConvKernel,   K * sizeof(float), cudaMemcpyHostToDevice));

    h_convolution_1D(h_Signal, h_ConvKernel, h_Result_CPU, N, K);

    d_convolution_1D_basic<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {printf("mismatch (basic) at %d, cpu: %f, gpu %f\n", i, h_Result_CPU[i], h_Result_GPU[i]); return 1;}

    printf("Test basic passed\n");

    d_convolution_1D_caching<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {printf("mismatch (caching) at %d, cpu: %f, gpu %f\n", i, h_Result_CPU[i], h_Result_GPU[i]); return 1;}

    printf("Test caching passed\n");

    return 0;
}

You are trying to compute the filter output by directly evaluating the 1D convolution in a CUDA kernel.
In the case when the filter impulse response duration is long, you can evaluate the filtered input by performing the computation directly in the conjugate domain using FFTs. Below I report a sample code using CUDA Thrust and the cuFFT library. It is a direct translation of a Matlab example.
Let me disclaim that some optimizations are possible with this code, but I prefer to leave it as it is so that it can be more easily compared to its Matlab counterpart.
#include <stdio.h>
#include <math.h>

#include <cufft.h>

#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#define pi_f  3.14159265358979f                 // Greek pi in single precision

/****************/
/* SIN OPERATOR */
/****************/
class sin_op {

    float fk_, Fs_;

    public:

        sin_op(float fk, float Fs) { fk_ = fk; Fs_ = Fs; }

        __host__ __device__ float operator()(float x) const { return sin(2.f*pi_f*x*fk_/Fs_); }
};

/*****************/
/* SINC OPERATOR */
/*****************/
class sinc_op {

    float fc_, Fs_;

    public:

        sinc_op(float fc, float Fs) { fc_ = fc; Fs_ = Fs; }

        __host__ __device__ float operator()(float x) const 
        {
            if (x==0)   return (2.f*fc_/Fs_);
            else            return (2.f*fc_/Fs_)*sin(2.f*pi_f*fc_*x/Fs_)/(2.f*pi_f*fc_*x/Fs_);
        }
};

/********************/
/* HAMMING OPERATOR */
/********************/
class hamming_op {

    int L_;

    public:

        hamming_op(int L) { L_ = L; }

        __host__ __device__ float operator()(int x) const 
        {
            return 0.54-0.46*cos(2.f*pi_f*x/(L_-1));
        }
};


/*********************************/
/* MULTIPLY CUFFTCOMPLEX NUMBERS */
/*********************************/
struct multiply_cufftComplex {
    __device__ cufftComplex operator()(const cufftComplex& a, const cufftComplex& b) const {
        cufftComplex r;
        r.x = a.x * b.x - a.y * b.y;
        r.y = a.x * b.y + a.y * b.x;
        return r;
    }
};

/********/
/* MAIN */
/********/
int main(){

    // Signal parameters:
    int M = 256;                            // signal length
    const int N = 4;
    float f[N] = { 440, 880, 1000, 2000 };              // frequencies
    float Fs = 5000.;                       // sampling rate

    // Generate a signal by adding up sinusoids:
    thrust::device_vector<float> d_x(M,0.f);            // pre-allocate 'accumulator'
    thrust::device_vector<float> d_n(M);                // discrete-time grid
    thrust::sequence(d_n.begin(), d_n.end(), 0, 1);

    thrust::device_vector<float> d_temp(M);
    for (int i=0; i<N; i++) { 
        float fk = f[i];
        thrust::transform(d_n.begin(), d_n.end(), d_temp.begin(), sin_op(fk,Fs));
        thrust::transform(d_temp.begin(), d_temp.end(), d_x.begin(), d_x.begin(), thrust::plus<float>()); 
    }

    // Filter parameters:
    int L = 257;                        // filter length
    float fc = 600.f;                   // cutoff frequency

    // Design the filter using the window method:
    thrust::device_vector<float> d_hsupp(L);            
    thrust::sequence(d_hsupp.begin(), d_hsupp.end(), -(L-1)/2, 1);
    thrust::device_vector<float> d_hideal(L);           
    thrust::transform(d_hsupp.begin(), d_hsupp.end(), d_hideal.begin(), sinc_op(fc,Fs));
    thrust::device_vector<float> d_l(L);                
    thrust::sequence(d_l.begin(), d_l.end(), 0, 1);
    thrust::device_vector<float> d_h(L);                
    thrust::transform(d_l.begin(), d_l.end(), d_h.begin(), hamming_op(L));
    // h is our filter
    thrust::transform(d_hideal.begin(), d_hideal.end(), d_h.begin(), d_h.begin(), thrust::multiplies<float>());  

    // --- Choose the next power of 2 greater than L+M-1
    int Nfft = pow(2,(ceil(log2((float)(L+M-1))))); // or 2^nextpow2(L+M-1)

    // Zero pad the signal and impulse response:
    thrust::device_vector<float> d_xzp(Nfft,0.f);
    thrust::device_vector<float> d_hzp(Nfft,0.f);
    thrust::copy(d_x.begin(), d_x.end(), d_xzp.begin());
    thrust::copy(d_h.begin(), d_h.end(), d_hzp.begin());

    // Transform the signal and the filter:
    cufftHandle plan;
    cufftPlan1d(&plan, Nfft, CUFFT_R2C, 1);
    thrust::device_vector<cufftComplex> d_X(Nfft/2+1);
    thrust::device_vector<cufftComplex> d_H(Nfft/2+1);
    cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_xzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_X.data()));
    cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_hzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_H.data()));

    thrust::device_vector<cufftComplex> d_Y(Nfft/2+1);
    thrust::transform(d_X.begin(), d_X.end(), d_H.begin(), d_Y.begin(), multiply_cufftComplex());  

    cufftPlan1d(&plan, Nfft, CUFFT_C2R, 1);
    thrust::device_vector<float> d_y(Nfft);
    cufftExecC2R(plan, (cufftComplex*)thrust::raw_pointer_cast(d_Y.data()), (cufftReal*)thrust::raw_pointer_cast(d_y.data()));

    getchar();

    return 0;
}

I can suggest the following to speed up your code:
  1. Use shared memory: it is like a small cache, but much faster than the global card memory. You can find more about it by looking for the __shared__ keyword in the CUDA documentation. For example, you could pre-fetch the filter numerator and big chunks of data into shared memory, and this will significantly improve performance. You need to pay extra attention to data alignment in this case, as it really matters and can slow down your code. (A sketch of this idea is given after this list.)
  2. Consider unrolling the for loop of the numerator sum. You can check the reduce-vector example in the CUDA documentation.
  3. You can also think about parallelizing the numerator loop itself. This can be done by adding an extra dimension (say 'y') to your thread block. You would also need to make sum a shared vector with the same dimension as the numerator length. You can also check the reduce-vector example on how to quickly take the sum of this vector at the end. (A second sketch after this list illustrates this approach.)
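As a rough illustration of the first point, here is a minimal sketch (not part of the original answer) that tiles the input into shared memory and keeps the coefficients in __constant__ memory, assuming the OP's data layout where the first numeratorLength - 1 elements of d_data hold the filter state, so the input buffer contains filteredDataLength + numeratorLength - 1 valid samples. The kernel name filterDataShared and the MAX_TAPS bound are illustrative assumptions; if the filter grows so long that the tile no longer fits in shared memory, the tile would have to be split or an FFT-based approach used instead.

#include <cuda_runtime.h>

// Assumed upper bound on the filter length so the taps fit in constant
// memory (64 KB on current GPUs); purely illustrative.
#define MAX_TAPS 4096
__constant__ float c_numerator[MAX_TAPS];

__global__ void filterDataShared(const float * __restrict__ d_data,
                                 float * __restrict__ d_filteredData,
                                 const int numeratorLength,
                                 const int filteredDataLength)
{
    // Dynamic shared memory: blockDim.x + numeratorLength - 1 input samples,
    // i.e. everything this block's outputs need, loaded from global memory once.
    extern __shared__ float s_data[];

    const int i          = blockIdx.x * blockDim.x + threadIdx.x;
    const int blockStart = blockIdx.x * blockDim.x;     // first input sample the block needs
    const int tileSize   = blockDim.x + numeratorLength - 1;
    const int dataLength = filteredDataLength + numeratorLength - 1;

    // Cooperative, coalesced load of the input tile (strided over the block's threads)
    for (int k = threadIdx.x; k < tileSize; k += blockDim.x)
        s_data[k] = (blockStart + k < dataLength) ? d_data[blockStart + k] : 0.0f;
    __syncthreads();

    if (i < filteredDataLength)
    {
        float sum = 0.0f;
        for (int j = 0; j < numeratorLength; j++)
            sum += c_numerator[j] * s_data[threadIdx.x + numeratorLength - j - 1];
        d_filteredData[i] = sum;
    }
}

The launch would mirror the OP's, with (threadsPerBlock + numeratorLength - 1) * sizeof(float) bytes of dynamic shared memory passed as the third launch parameter, and the coefficients copied once beforehand with cudaMemcpyToSymbol(c_numerator, h_filter, numeratorLength * sizeof(float)).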
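And a minimal sketch of the third point, splitting the coefficient loop over a second block dimension ('y') and reducing the partial sums in shared memory, in the spirit of the reduce example. The kernel name filterDataTaps and the block shape are again illustrative assumptions, and blockDim.y is assumed to be a power of two:

__global__ void filterDataTaps(const float * __restrict__ d_data,
                               const float * __restrict__ d_numerator,
                               float * __restrict__ d_filteredData,
                               const int numeratorLength,
                               const int filteredDataLength)
{
    // Dynamic shared memory: blockDim.x * blockDim.y partial sums
    extern __shared__ float s_partial[];

    const int i = blockIdx.x * blockDim.x + threadIdx.x;   // output sample index

    // Each "row" of threads (fixed threadIdx.y) handles every blockDim.y-th tap
    float sum = 0.0f;
    if (i < filteredDataLength)
        for (int j = threadIdx.y; j < numeratorLength; j += blockDim.y)
            sum += d_numerator[j] * d_data[i + numeratorLength - j - 1];

    s_partial[threadIdx.y * blockDim.x + threadIdx.x] = sum;
    __syncthreads();

    // Tree reduction over the y dimension (blockDim.y assumed to be a power of two)
    for (int s = blockDim.y / 2; s > 0; s >>= 1)
    {
        if (threadIdx.y < s)
            s_partial[threadIdx.y * blockDim.x + threadIdx.x] +=
                s_partial[(threadIdx.y + s) * blockDim.x + threadIdx.x];
        __syncthreads();
    }

    if (threadIdx.y == 0 && i < filteredDataLength)
        d_filteredData[i] = s_partial[threadIdx.x];
}

A launch such as dim3 block(64, 4) with gridDim.x = (filteredDataLength + block.x - 1) / block.x and block.x * block.y * sizeof(float) bytes of dynamic shared memory would be one way to try it; whether it beats the simple per-thread loop depends on the filter length and the GPU.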

Thanks for your suggestions! I will look into shared memory, and the vector-reduce example looks very interesting. I particularly like your third suggestion though, since the for loop in my current code feels like a potential bottleneck as the filter length grows. - Elfendahl
