如何优化这个CUDA核心函数

Question

如何优化这个CUDA核心函数

3

我已经对我的模型进行了分析，似乎这个内核占据了我的全部运行时间的约2/3。我正在寻找优化建议。以下是代码。

__global__ void calcFlux(double* concs, double* fluxes, double* dt)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    fluxes[idx]=knowles_flux(idx, concs);
    //fluxes[idx]=flux(idx, concs);
}

__device__ double knowles_flux(int r, double *conc)
{
    double frag_term = 0;
    double flux = 0;
    if (r == ((maxlength)-1))
    {
        //Calculation type : "Max"
        flux = -km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0];
    }
    else if (r > ((nc)-1))
    {
        //Calculation type : "F"
        //arrSum3(conc, &frag_term, r+1, maxlength-1);
        for (int s = r+1; s < (maxlength); s++)
        {
            frag_term += conc[s];
        }
        flux = -(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0];
    }
    else if (r == ((nc)-1))
    {
        //Calculation type : "N"
        //arrSum3(conc, &frag_term, r+1, maxlength-1);
        for (int s = r+1; s < (maxlength); s++)
        {
            frag_term += conc[s];
        }
        flux = (kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0];
    }
    else if (r < ((nc)-1))
    {
    //Calculation type : "O"
        flux = 0;
    }
    return flux;
}

仅仅为了让您理解为什么for循环是一个问题，这个内核被启动在大约maxlength=9000元素的数组上。就我们现在的目的而言，nc在2-6的范围内。以下是这个内核如何处理输入数组（conc）的插图。针对该数组，不同组别的元素需要应用五种不同类型的计算。

Array element : 0 1 2 3 4 5 6 7 8 9 ... 8955 8956 8957 8958 8959 8960
Type of calc  : M O O O O O N F F F ...   F   F    F    F    F   Max

我现在试图解决的潜在问题是四重 if-else 和 for 循环中的分支发散。

我处理分支发散的想法是将此内核分成四个单独的设备函数或内核，分别对待每个区域并同时启动。我不确定这比让分支发散发生要好多少，如果我没记错的话，这会导致四种计算类型以串行方式运行。

为了解决 for 循环，您会注意到已注释掉 arrSum3 函数，我基于以前编写的（可能很差）并行约简内核编写它。将其用于 for 循环可以显着增加运行时间。我感觉有一种聪明的方法可以实现我尝试使用 for 循环做的事情，但我并不那么聪明，我的指导老师已经厌倦我"浪费时间"思考它。

感谢任何帮助。

编辑

完整代码位于此处：https://stackoverflow.com/q/21170233/1218689

- Hair of Slytherin

什么是conc？一个双精度数组吗？这个数组经常改变吗？如果改变的话，频率有多高？在knowles_flux计算过程中，任何conc元素会发生变化吗？也就是说，更新是否是并行进行的？ - Xephon

1

你似乎在对 conc 数组进行大量冗余求和。如果你对数组进行一次前缀和操作，你可以通过 prefix_sum[high] - prefix_sum[low] 找到数组中任何连续子区域的总和。对于非常大的数组或者数组中值差异非常大的情况下，可能会遇到精度问题，但对于你的情况来说可能是可行的。 - mattnewport

@Xephon Conc确实是一个双精度数组。Conc通过内核运行，计算通量。然后使用Fluxes来推进Conc通过时间步长。不，Conc在通量计算过程中不会改变。Conc是该时间步长的浓度数组，并且在knowles_flux计算的五次迭代（对于Runge Kutta积分器为5-6次）中保持不变。 - Hair of Slytherin

@mattnewport，感谢您提醒我。几个月前，我曾认为可能有一些巧妙的方法来计算总和或数组的总和，以便快速计算这些部分和。现在，如果我能利用GPU的并行性来快速执行前缀和数组，那么也许我就可以做成这件事了。 - Hair of Slytherin

@mattnewport 我不认为Thrust适合我。它是纯粹在主机端运行的，对吗？这个算法必须能够从在GPU上运行积分器的内核中调用。 - Hair of Slytherin

显示剩余7条评论

1个回答

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- huseyin tugrul buyukisik · Accepted Answer

假设sgn()和abs()不是从“if”和“else”派生出来的。

__device__ double knowles_flux(int r, double *conc)
{
    double frag_term = 0;
    double flux = 0;

        //Calculation type : "Max"
        //no divergence
        //should prefer 20-30 extra cycles instead of a branching.
        //may not be good for CPU
        fluxA = (1-abs(sgn(r-(maxlength-1)))) * (-km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0]);
        //is zero if r and maxlength-1 are not equal

        //always compute this in shared memory so work will be equal for all cores, no divergence

        // you should divide kernel into several pieces to do a reduction
        // but if you dont want that, then you can try :
        for (int s = 0;s<someLimit ; s++) // all count for same number of cycles so no divergence
        {
            frag_term += conc[s] * (   abs(sgn( s-maxlength ))*sgn(1- sgn( s-maxlength ))  )* (      sgn(1+sgn(s-(r+1)))  );
        }
         //but you can make easier of this using "add and assign" operation
         // in local memory (was it __shared in CUDA?)
         //  global conc[] to local concL[] memory(using all cores)(100 cycles)
         // for(others from zero to upper_limit)
         // if(localID==0)
         // {
         //    frag_termL[0]+=concL[s]             // local to local (10 cycles/assign.)
         //    frag_termL[0+others]=frag_termL[0]; // local to local (10 cycles/assign.)
         // }  -----> uses nearly same number of cycles but uses much less energy
         //using single core (2000 instr. with single core vs 1000 instr. with 2k cores)
         // in local memory, then copy it to private registers accordingly using all cores



        //Calculation type : "F"

        fluxB = (  abs(sgn(r-(nc-1)))*sgn(1+sgn(r-(nc-1)))   )*(-(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0]);
        // is zero if r is not greater than (nc-1)



        //Calculation type : "N"


        fluxC = (   1-abs(sgn(r-(nc-1)))   )*((kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0]);
        //zero if r and nc-1 are not equal



    flux=fluxA+fluxB+fluxC; //only one of these can be different than zero

    flux=flux*(   -sgn(r-(nc-1))*sgn(1-sgn(r-(nc-1)))  )
    //zero if r > (nc-1)

    return flux;
}

好的，让我稍微解释一下：

if(a>b) x+=y;

可以理解为

if a-b is negative sgn(a-b) is -1
then adding 1 to that -1 gives zero ==> satisfies lower part of comparison(a<b)
x+= (sgn(a-b) +1) = 0 if a<b (not a>b), x unchanged

if(a-b) is zero, sgn(a-b) is zero
then we should multiply the upper solution with sgn(a-b) too!
x+= y*(sgn(a-b) +1)*sgn(a-b)
means
x+= y*( 0  +  1) * 0 = 0           a==b is satisfied too!

lets check what happens if a>b
x+= y*(sgn(a-b) +1)*sgn(a-b)
x+= y*(1 +1)*1  ==> y*2 is not acceptable, needs another sgn on outherside

x+= y* sgn((sgn(a-b)+1)*sgn(a-b))

x+= y* sgn((1+1)*1)

x+= y* sgn(2)   

x+= y only when a is greater than b

当有太多的

abs(sgn(r-(nc-1))

那么你可以重新使用它作为：

tmp=abs(sgn(r-(nc-1))

.....  *tmp*(tmp-1) ....
...... +tmp*zxc[s] .....
......  ......

为了进一步减少总周期！寄存器访问可以达到每秒几个TB的级别，所以不应该是问题。就像全局访问一样：

tmpGlobal= conc[r];

...... tmpGlobal * tmp .....
.... tmpGlobal +x -y ....

所有私有寄存器每秒处理的数据量都达到了千兆字节。

警告：如果 conc[0] 的实际地址不是真正的零，从 conc[-1] 读取不应该导致任何故障，只要将其乘以零。但是写入则是有风险的。

如果您需要从 conc[-1] 中逃脱，您也可以将索引乘以一些绝对化的值！请参见：

 tmp=conc[i-1] becomes   tmp=conc[abs((i-1))] will always read from positive index, the value will be multiplied by zero later anyway. This was lower bound protection.
  You can apply a higher bound protection too. Just this adds even more cycles.

如果使用纯量值访问conc[r-1]和conc[r+1]时速度不够快，请考虑使用向量重排操作。向量元素之间的重排比通过本地内存复制到另一个核/线程更快。