cublas abnormally slow compared with cusparse


I am trying to compare the performance of cusparse and cublas at different sparsity levels on a Titan X. Here is the main code, in a file named "testcusparsevector.cpp":

#include <stdio.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <time.h>
#include <cuda_runtime.h>
#include <cublas.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
#include <assert.h>
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))


// /home/gpu1/Install/OpenBLAS-0.2.14


#define CHECK_EQ(a,b) do { \
    if ((a) != (b)) { \
        cout <<__FILE__<<" : "<< __LINE__<<" : check failed because "<<a<<"!="<<b<<endl;\
        exit(1);\
    }\
} while(0)

#define CUBLAS_CHECK(condition) \
do {\
    cublasStatus_t status = condition; \
    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS); \
} while(0)

#define CUSPARSE_CHECK(condition)\
do {\
    cusparseStatus_t status = condition; \
    switch(status)\
    {\
        case CUSPARSE_STATUS_NOT_INITIALIZED:\
            cout << "CUSPARSE_STATUS_NOT_INITIALIZED" << endl;\
            break;\
        case CUSPARSE_STATUS_ALLOC_FAILED:\
            cout << "CUSPARSE_STATUS_ALLOC_FAILED" << endl;\
            break;\
        case CUSPARSE_STATUS_INVALID_VALUE:\
            cout << "CUSPARSE_STATUS_INVALID_VALUE" << endl;\
            break;\
        case CUSPARSE_STATUS_ARCH_MISMATCH:\
            cout << "CUSPARSE_STATUS_ARCH_MISMATCH" << endl;\
            break;\
        case CUSPARSE_STATUS_MAPPING_ERROR:\
            cout << "CUSPARSE_STATUS_MAPPING_ERROR" << endl;\
            break;\
        case CUSPARSE_STATUS_EXECUTION_FAILED:\
            cout << "CUSPARSE_STATUS_EXECUTION_FAILED" << endl;\
            break;\
        case CUSPARSE_STATUS_INTERNAL_ERROR:\
            cout << "CUSPARSE_STATUS_INTERNAL_ERROR" << endl;\
            break;\
        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:\
            cout << "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED" << endl;\
            break;\
        case CUSPARSE_STATUS_ZERO_PIVOT:\
            cout << "CUSPARSE_STATUS_ZERO_PIVOT" << endl;\
    }\
    CHECK_EQ(status, CUSPARSE_STATUS_SUCCESS); \
} while(0)

#define CUDA_CHECK(condition)\
do {\
    cudaError_t error = condition;\
    CHECK_EQ(error, cudaSuccess);\
} while(0)

//check after kernel function
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())



#define __TIMING__ 1

#if __TIMING__


#define INIT_TIMER  cudaEvent_t start, stop; \
    float milliseconds = 0; \
    float sum = 0;\
    cudaEventCreate( &start );\
    cudaEventCreate( &stop );

#define TIC {  cudaEventRecord( start ); }

#if __CUDNN__
    #define PREDEFNAME "CUDNN"
#else
    #define PREDEFNAME "CUDA"
#endif

#define TOC(a) { cudaEventRecord( stop ); \
        cudaEventSynchronize( stop ); \
        cudaEventElapsedTime( &milliseconds, start, stop );  \
        printf( "GPU Execution time of %s_%s: %f ms\n",PREDEFNAME, a, milliseconds ); \
        sum += milliseconds;\
        fflush(stdout); }

#define CLOSE_TIMER {cudaEventDestroy(start); cudaEventDestroy(stop); }
#endif

using namespace std;

void dispArray(double* array, size_t width, size_t height) {
    for (int i=0; i < height;i++ ) {
        for (int j=0;j < width;j++) {
            cout << array[j*height+i] << ' ';
        }
        cout << endl;
    }
    cout << endl;
}

int main()
{
    srand(time(NULL));
    const int num_loop = 1;
    const int inside_loop = 1000;
    // const int WIDTH = 512*3*3;
    // const int HEIGHT = 512;
    // const int WIDTHOUT = 36;
    const int WIDTH = 4608;
    const int HEIGHT = 512;
    const int WIDTHOUT = 144;
    // const int WIDTH = 18500;
    // const int HEIGHT = 512;
    // const int WIDTHOUT = 1;
    // const int WIDTH = 3;
    // const int HEIGHT = 5;
    // const int WIDTHOUT = 2;
    INIT_TIMER
    ofstream myfile;
    myfile.open("test_sparsity.log");

    cudaError_t cudaStat;    
    cusparseStatus_t stat;
    cusparseHandle_t handle;
    cublasHandle_t handleblas;

    double *devPtrOutput;
    double *devPtrOutput2;
    double *devPtrRand;
    double *devPtrSec;
    CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput), sizeof(double)*HEIGHT*WIDTHOUT));
    CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput2), sizeof(double)*HEIGHT*WIDTHOUT));

    CUDA_CHECK(cudaMalloc((void **)&(devPtrRand), sizeof(double)*WIDTH*WIDTHOUT));
    CUDA_CHECK(cudaMalloc((void **)&(devPtrSec), sizeof(double)*WIDTH*HEIGHT));
    const double alpha=1.0;
    const double beta=0.0;
    double *csrVal;
    int *csrRowPtr;
    int *csrColInd;

    const bool SPARSE = true;
    long a = clock();
    long temp = clock();
    cusparseMatDescr_t descr;
    CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    int nnz;
    CUSPARSE_CHECK(cusparseCreate(&handle));
    CUBLAS_CHECK(cublasCreate(&handleblas));
    int *nnzPerRow_gpu;
    CUDA_CHECK(cudaMalloc((void **)&(nnzPerRow_gpu), sizeof(int)*HEIGHT));
    CUDA_CHECK(cudaMalloc((void **)&(csrRowPtr), sizeof(int)*(HEIGHT+1)));
    double density_array[1] = {0.9999};//, 0.8, 0.7, 0.6, 0.5,      0.4, 0.3, 0.2, 0.1 ,0.09,     0.08, 0.07, 0.06, 0.05 ,0.04,     0.03, 0.02, 0.01};
    for (int inddense=0;inddense < 1;inddense++) {
        double DENSITY = density_array[inddense];
        int num_non_zeros = DENSITY * (WIDTH * HEIGHT);

        CUDA_CHECK(cudaMalloc((void **)&(csrColInd), sizeof(int)*num_non_zeros));
        CUDA_CHECK(cudaMalloc((void **)&(csrVal), sizeof(double)*num_non_zeros));
        INIT_TIMER
        for (int iter=0; iter < num_loop;iter++) {
            vector<double> randVec(WIDTH*WIDTHOUT, 0);
            vector<double> secArray(WIDTH*HEIGHT, 0);
            vector<int> temp(WIDTH*HEIGHT, 1);

            for (int j = 0; j < WIDTH*WIDTHOUT; j++) {
                randVec[j]=(double)(rand()%100000)/100;
            }

            for (int x, i = 0; i < num_non_zeros;i++) {
                do
                {
                    x = rand() % (WIDTH*HEIGHT);
                } while(temp[x] == 0);
                temp[x]=0;
                secArray[x]=(double)(rand()%100000)/100;
            }
            int count = 0;
            for(int i=0;i < WIDTH*HEIGHT;i++) {
                if (secArray[i] != 0) {
                    count++;
                }
            }

            // randVec = {2,2,2,3,3,3};
            // secArray = {0,5,0,2,5,8,7,0,0,0,0,2,0,4,4};
            CUDA_CHECK(cudaMemcpy(devPtrRand, &randVec[0], sizeof(double)*WIDTH*WIDTHOUT, cudaMemcpyHostToDevice));
            CUDA_CHECK(cudaMemcpy(devPtrSec, &secArray[0], sizeof(double)*WIDTH*HEIGHT, cudaMemcpyHostToDevice));


            if (SPARSE) {
                CUSPARSE_CHECK(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, HEIGHT, WIDTH, descr, devPtrSec, HEIGHT, nnzPerRow_gpu, &nnz));
                CUSPARSE_CHECK(cusparseDdense2csr(handle, HEIGHT, WIDTH, descr,devPtrSec,HEIGHT,nnzPerRow_gpu,csrVal,csrRowPtr,csrColInd));
            }       
            // vector<double> tempcsrVal(nnz,0);
            // vector<int> tempcsrRowPtr(HEIGHT+1);
            // vector<int> tempcsrColInd(nnz,0);
            // CUDA_CHECK(cudaMemcpy(&tempcsrVal[0], csrVal, sizeof(double)*nnz, cudaMemcpyDeviceToHost));
            // CUDA_CHECK(cudaMemcpy(&tempcsrRowPtr[0], csrRowPtr, sizeof(int)*(HEIGHT+1), cudaMemcpyDeviceToHost));
            // CUDA_CHECK(cudaMemcpy(&tempcsrColInd[0], csrColInd, sizeof(int)*nnz, cudaMemcpyDeviceToHost));
            // for (int i =0; i < nnz;i++) {
                // cout << tempcsrVal[i] << " ";
            // }
            // cout << endl;
            // for (int i =0; i < HEIGHT+1;i++) {
                // cout << tempcsrRowPtr[i] << " ";
            // }
            // cout << endl;
            // for (int i =0; i < nnz;i++) {
                // cout << tempcsrColInd[i] << " ";
            // }
            // cout << endl;
            cudaDeviceSynchronize();
            TIC
            for (int i=0 ; i < inside_loop;i++) {
                if (WIDTHOUT == 1) {
                    // TIC
                    CUSPARSE_CHECK(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                    HEIGHT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr, csrColInd, 
                    devPtrRand, &beta, devPtrOutput));
                    // TOC("csrmv")
                } else {
                    // TIC
                    CUSPARSE_CHECK(cusparseDcsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
                        HEIGHT, WIDTHOUT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr, 
                        csrColInd, devPtrRand, WIDTH, &beta, devPtrOutput, HEIGHT));
                    // TOC("csrmm")
                }
            }
            TOC("csr")
            TIC
            for (int i=0 ; i < inside_loop;i++) {
                if (WIDTHOUT == 1) {
                    // TIC
                    CUBLAS_CHECK(cublasDgemv(handleblas, CUBLAS_OP_N, HEIGHT, WIDTH, &alpha, devPtrSec, HEIGHT , devPtrRand, 1, &beta, devPtrOutput2, 1));
                    // TOC("dgemv")
                } else {
                    // TIC
                    CUBLAS_CHECK(cublasDgemm(handleblas, CUBLAS_OP_N, CUBLAS_OP_N, HEIGHT, WIDTHOUT, WIDTH, &alpha, devPtrSec, HEIGHT, devPtrRand, WIDTH, &beta, devPtrOutput2, HEIGHT));
                    // TOC("dgemm")
                }
            }
            TOC("blas")


            #if 0
            vector<double> output(HEIGHT*WIDTHOUT, 0);
            vector<double> output2(HEIGHT*WIDTHOUT, 0);
            CUDA_CHECK(cudaMemcpy(&output[0], devPtrOutput, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost));
            CUDA_CHECK(cudaMemcpy(&output2[0], devPtrOutput2, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost));
            dispArray(&output[0], WIDTHOUT, HEIGHT);
            cout << endl;
            for (int i=0;i < WIDTHOUT * HEIGHT;i++) {
                if (output[i] != output2[i]) {
                    cout << "error: " << i << " " << (output[i] - output2[i]) << " " << output[i] << endl;
                }
            }
            #endif

        }

        cout << DENSITY << " " << sum/num_loop << endl;
        myfile << DENSITY << " " << sum/num_loop << endl;
        cudaFree(csrColInd);
        cudaFree(csrVal);
    }
    myfile.close();
    cudaFree(csrRowPtr);
    cudaFree(devPtrOutput);
    cudaFree(devPtrRand);
    cudaFree(devPtrSec);

}

However, after compiling the code with

g++ -std=c++1y -O3 -I/usr/local/cuda/include -o testcusparsevector testcusparsevector.cpp -L/usr/local/cuda/lib64 -lcudart -lcublas -lcusparse

this is the output:

GPU Execution time of CUDA_csr: 4818.447266 ms
GPU Execution time of CUDA_blas: 5024.459961 ms

This would mean that even at a density of 0.999, cusparseDcsrmm is still faster than cublasDgemm. I have checked the results; they are correct and match other examples, so the problem seems to be that cublas is simply too slow.
Do you have any idea where this comes from?
EDIT: I tried changing the values to float, and the results are much more in line with what I expected; apparently, cublas is not made for double-precision computation...
Thanks in advance.

The code you posted doesn't compile. It is very difficult to find a bug in a big pile of code that won't compile. - talonmies
Oh yes, my mistake; I edited it quickly to make it more readable but forgot to test it. You can compile it now. - Caenorst
1 Answer

The Titan X (and all current members of the Maxwell GPU family) has a 1:32 ratio of double-precision to single-precision floating-point throughput.
Generally speaking, sparse matrix operations are memory-bandwidth-bound, whereas dense matrix-matrix multiplication is the canonical example of a compute-bound problem.
So in your example you are taking a problem that is ordinarily compute-bound and running it as a sparse matrix multiplication on a processor that has relatively large memory bandwidth and relatively small double-precision compute throughput.
This can blur the line between the two APIs, where the CUBLAS API would ordinarily be expected to be faster for such a comparison.
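To make the compute-bound argument concrete, here is a back-of-the-envelope estimate (my own addition, not part of the answer). It assumes approximate GTX Titan X figures of roughly 0.19 TFLOP/s FP64 peak (the 1:32 ratio mentioned above applied to ~6.1 TFLOP/s FP32) and about 336 GB/s memory bandwidth:

#include <cstdio>

int main() {
    // Problem sizes from the question: (HEIGHT x WIDTH) * (WIDTH x WIDTHOUT)
    const double M = 512, N = 144, K = 4608;
    const double iters = 1000;            // inside_loop in the question
    const double fp64_peak = 0.19e12;     // assumed FP64 peak, FLOP/s
    const double bw = 336e9;              // assumed memory bandwidth, B/s

    const double flops = 2.0 * M * N * K * iters;          // dense GEMM work
    const double bytes = 8.0 * (M*K + K*N + M*N) * iters;  // ideal FP64 traffic (each matrix touched once)

    printf("compute-bound estimate:   %.2f s\n", flops / fp64_peak);  // ~3.6 s
    printf("bandwidth-bound estimate: %.2f s\n", bytes / bw);         // ~0.07 s
    return 0;
}

Under these assumptions the FP64 compute limit (~3.6 s for the 1000 inner iterations) is far larger than the bandwidth limit and is on the same order as the ~5 s measured for the cuBLAS path, which is consistent with the explanation above.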
If you switch the code to use float instead of double, as I believe you have already tried, you will see CUBLAS win again. Likewise, if you run the code on a GPU with a different ratio of single- to double-precision throughput, you will also see CUBLAS win again.
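For reference, a minimal sketch of what the two hot calls look like in single precision, keeping the legacy cuSPARSE csrmm API that the question uses (names ending in F are hypothetical float copies of the question's double buffers; alpha/beta, the dense2csr conversion, and all allocations would also have to be switched to float):

const float alphaf = 1.0f, betaf = 0.0f;

// cuSPARSE: cusparseDcsrmm -> cusparseScsrmm
CUSPARSE_CHECK(cusparseScsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
    HEIGHT, WIDTHOUT, WIDTH, nnz, &alphaf, descr,
    csrValF, csrRowPtr, csrColInd,
    devPtrRandF, WIDTH, &betaf, devPtrOutputF, HEIGHT));

// cuBLAS: cublasDgemm -> cublasSgemm
CUBLAS_CHECK(cublasSgemm(handleblas, CUBLAS_OP_N, CUBLAS_OP_N,
    HEIGHT, WIDTHOUT, WIDTH, &alphaf,
    devPtrSecF, HEIGHT, devPtrRandF, WIDTH, &betaf, devPtrOutput2F, HEIGHT));

At single precision the arithmetic peak on this GPU is about 32x higher, so the dense GEMM is no longer throttled by FP64 throughput.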
Quote: "apparently, cublas is not made for double-precision computation..."
Rather than putting it that way, I would say that the GTX Titan X is (mostly) not made for double-precision computation. Try a Tesla K80, K40, or another GPU with a better double- to single-precision throughput ratio.
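If it is unclear which GPU (and therefore which FP64 ratio) a benchmark actually ran on, a small standalone check such as the following can help (my addition, not from the answer; Maxwell parts report compute capability 5.x, while the GK110/GK210-based K40/K80 report 3.5/3.7 and have a roughly 1:3 FP64 ratio):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // memoryClockRate is in kHz, memoryBusWidth in bits; factor 2 for DDR
    double gbps = 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6;
    printf("%s, compute capability %d.%d, %d SMs, ~%.0f GB/s peak bandwidth\n",
           prop.name, prop.major, prop.minor, prop.multiProcessorCount, gbps);
    return 0;
}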
Here is the output of your program running on a (non-boosted) Tesla K40:
$ ./testcusparsevector
GPU Execution time of CUDA_csr: 8870.386719 ms
GPU Execution time of CUDA_blas: 1045.211792 ms

Disclaimer: I have not tried to study your code in detail. I skimmed it and nothing obvious jumped out, but there may be problems I did not spot.
