CUDA数学库的头文件是哪个?

6

我尝试使用cuda math api,例如sqrtf()、__fdividef()等,但出现了以下错误:

error C3861: '__fdividef':Identifier not found
error C3861: 'rsqrtf':Identifier not found

看起来"NVIDIA CUDA Math API"没有指定我们在使用这些API时应该包含哪个头文件。在helper_math.h中,函数似乎是:

inline __host__ __device__ float length(float4 v)
{
    return sqrtf(dot(v, v));
}

使用C库math.h中的sqrtf实现,它不是带有设备标识符的主机函数吗?我们能直接在我们的设备函数中使用它吗?
请纠正我。谢谢。

你确定编译上述函数需要CUDA数学头文件吗? - Vitality
@JackOLantern 当然,我不确定,这就是为什么我在这里发布的原因。我想知道length(float4 v)函数调用了sqrtf()函数,后者也应该是设备函数,对吧?否则,在cuda编程模型中,我们将从设备函数调用主机函数,这是不可能的,如果我没记错的话。我猜有一个函数的样式如下:inline host __device float sqrtf(float v)。对吗? - G_fans
1
@JackOLantern 嗯,在进行代码重构后,我修复了错误。是的,你说得对,我们不需要显式地包含cuda数学头文件。顺便说一下,这些数学API函数位于device_functions.h中,不应该被包含在用户代码中。但是知道他们实际上使用哪些基本API来实现这些数学API是很好的。 - G_fans
有人可以回答这个问题吗?G_fans,你可以回答自己的问题并解释一下你是如何修复它的。 - Robert Crovella
1个回答

4
以前,我是这样实现代码的:
// This implementation follows the code from
// https://github.com/erwincoumans/experiments/blob/master/opencl/primitives/AdlPrimitives/Math/MathCL.h

#ifndef UNIFIED_MATH_CUDA_H
#define UNIFIED_MATH_CUDA_H

#include "vector_functions.h"

/*****************************************
Vector
/*****************************************/

__device__
    float fastDiv(float numerator, float denominator)
{
    return __fdividef(numerator, denominator);        
    //return numerator/denominator;        
}

__device__
    float getSqrtf(float f2)
{
    return sqrtf(f2);
}

__device__
    float getReverseSqrt(float f2)
{
    return rsqrtf(f2);
}

__device__
    float3 getCrossProduct(float3 a, float3 b)
{
    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 
}

__device__
    float4 getCrossProduct(float4 a, float4 b)
{
    float3 v1 = make_float3(a.x, a.y, a.z);
    float3 v2 = make_float3(b.x, b.y, b.z);
    float3 v3 = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 

    return make_float4(v3.x, v3.y, v3.z, 0.0f);
}

__device__
    float getDotProduct(float3 a, float3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

__device__
    float getDotProduct(float4 a, float4 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

__device__ float3 getNormalizedVec(const float3 v)
{
    float invLen = 1.0f / sqrtf(getDotProduct(v, v));
    return make_float3(v.x * invLen, v.y * invLen, v.z * invLen);
}

__device__ float4 getNormalizedVec(const float4 v)
{
    float invLen = 1.0f / sqrtf(getDotProduct(v, v));
    return make_float4(v.x * invLen, v.y * invLen, v.z * invLen, v.w * invLen);
}

__device__
    float dot3F4(float4 a, float4 b)
{
    float4 a1 = make_float4(a.x, a.y, a.z,0.f);
    float4 b1 = make_float4(b.x, b.y, b.z,0.f);
    return getDotProduct(a1, b1);
}

__device__
    float getLength(float3 a)
{
    return sqrtf(getDotProduct(a, a));
}

__device__
    float getLength(float4 a)
{
    return sqrtf(getDotProduct(a, a));
}

/*****************************************
Matrix3x3
/*****************************************/

typedef struct
{
    float4 m_row[3];
}Matrix3x3_d;

__device__
    void setZero(Matrix3x3_d& m)
{
    m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}

__device__
    Matrix3x3_d getZeroMatrix3x3()
{
    Matrix3x3_d m;
    m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    return m;
}

__device__ 
    void setIdentity(Matrix3x3_d& m)
{
    m.m_row[0] = make_float4(1,0,0,0);
    m.m_row[1] = make_float4(0,1,0,0);
    m.m_row[2] = make_float4(0,0,1,0);
}

__device__
    Matrix3x3_d getIdentityMatrix3x3()
{
    Matrix3x3_d m;
    m.m_row[0] = make_float4(1,0,0,0);
    m.m_row[1] = make_float4(0,1,0,0);
    m.m_row[2] = make_float4(0,0,1,0);
    return m;
}

__device__
    Matrix3x3_d getTranspose(const Matrix3x3_d m)
{
    Matrix3x3_d out;
    out.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
    out.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
    out.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
    return out;
}

__device__
    Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b)
{
    Matrix3x3_d transB = getTranspose( b );
    Matrix3x3_d ans;
    //        why this doesn't run when 0ing in the for{}
    a.m_row[0].w = 0.f;
    a.m_row[1].w = 0.f;
    a.m_row[2].w = 0.f;
    for(int i=0; i<3; i++)
    {
        //        a.m_row[i].w = 0.f;
        ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
        ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
        ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
        ans.m_row[i].w = 0.f;
    }
    return ans;
}

__device__
    float4 MatrixVectorMul(Matrix3x3_d a, float4 b)
{
    float4 ans;
    ans.x = dot3F4( a.m_row[0], b );
    ans.y = dot3F4( a.m_row[1], b );
    ans.z = dot3F4( a.m_row[2], b );
    ans.w = 0.f;
    return ans;
}


#endif  // UNIFIED_MATH_CUDA_H

由于CUDA会自动内联所有设备代码,因此我忽略了内联标识符。但是,正如我在问题中所述,它出现了错误:

error C3861: '__fdividef':Identifier not found
error C3861: 'rsqrtf':Identifier not found

接下来,我按照传统的C/C++风格重新实现了以下内容。

UnifiedMathCUDA.cuh:

// This implementation follows the code from
// https://github.com/erwincoumans/experiments/blob/master/opencl/primitives/AdlPrimitives/Math/MathCL.h

#ifndef UNIFIED_MATH_CUDA_H
#define UNIFIED_MATH_CUDA_H

#include "vector_functions.h"

/*****************************************
                Vector
/*****************************************/

__device__
    float fastDiv(float numerator, float denominator);

__device__
    float getSqrtf(float f2);

__device__
    float getReverseSqrt(float f2);

__device__
    float3 getCrossProduct(float3 a, float3 b);

__device__
    float4 getCrossProduct(float4 a, float4 b);

__device__
    float getDotProduct(float3 a, float3 b);

__device__
    float getDotProduct(float4 a, float4 b);

__device__ float3 getNormalizedVec(const float3 v);

__device__ float4 getNormalizedVec(const float4 v);

__device__
    float dot3F4(float4 a, float4 b);

__device__
    float getLength(float3 a);

__device__
    float getLength(float4 a);

/*****************************************
                Matrix3x3
/*****************************************/
typedef struct
{
    float4 m_row[3];
}Matrix3x3_d;

__device__
    void setZero(Matrix3x3_d& m);

__device__
    Matrix3x3_d getZeroMatrix3x3();

__device__ 
    void setIdentity(Matrix3x3_d& m);

__device__
    Matrix3x3_d getIdentityMatrix3x3();

__device__
    Matrix3x3_d getTranspose(const Matrix3x3_d m);

__device__
    Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b);

__device__
    float4 MatrixVectorMul(Matrix3x3_d a, float4 b);


#endif  // UNIFIED_MATH_CUDA_H

对应的实现文件如下:

#include "UnifiedMathCUDA.cuh"

/*****************************************
                Vector
/*****************************************/

__device__
    float fastDiv(float numerator, float denominator)
{
    return __fdividef(numerator, denominator);        
    //return numerator/denominator;        
}

__device__
    float getSqrtf(float f2)
{
    return sqrtf(f2);
}

__device__
    float getReverseSqrt(float f2)
{
    return rsqrtf(f2);
}

__device__
    float3 getCrossProduct(float3 a, float3 b)
{
    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 
}

__device__
    float4 getCrossProduct(float4 a, float4 b)
{
    float3 v1 = make_float3(a.x, a.y, a.z);
    float3 v2 = make_float3(b.x, b.y, b.z);
    float3 v3 = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 

    return make_float4(v3.x, v3.y, v3.z, 0.0f);
}

__device__
    float getDotProduct(float3 a, float3 b)
{
     return a.x * b.x + a.y * b.y + a.z * b.z;
}

__device__
    float getDotProduct(float4 a, float4 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}

__device__ float3 getNormalizedVec(const float3 v)
{
    float invLen = 1.0f / sqrtf(getDotProduct(v, v));
    return make_float3(v.x * invLen, v.y * invLen, v.z * invLen);
}

__device__ float4 getNormalizedVec(const float4 v)
{
    float invLen = 1.0f / sqrtf(getDotProduct(v, v));
    return make_float4(v.x * invLen, v.y * invLen, v.z * invLen, v.w * invLen);
}

__device__
    float dot3F4(float4 a, float4 b)
{
    float4 a1 = make_float4(a.x, a.y, a.z,0.f);
    float4 b1 = make_float4(b.x, b.y, b.z,0.f);
    return getDotProduct(a1, b1);
}

__device__
    float getLength(float3 a)
{
    return sqrtf(getDotProduct(a, a));
}

__device__
    float getLength(float4 a)
{
    return sqrtf(getDotProduct(a, a));
}

/*****************************************
                Matrix3x3
/*****************************************/

__device__
    void setZero(Matrix3x3_d& m)
{
    m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}

__device__
    Matrix3x3_d getZeroMatrix3x3()
{
    Matrix3x3_d m;
    m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    return m;
}

__device__ 
    void setIdentity(Matrix3x3_d& m)
{
    m.m_row[0] = make_float4(1,0,0,0);
    m.m_row[1] = make_float4(0,1,0,0);
    m.m_row[2] = make_float4(0,0,1,0);
}

__device__
    Matrix3x3_d getIdentityMatrix3x3()
{
    Matrix3x3_d m;
    m.m_row[0] = make_float4(1,0,0,0);
    m.m_row[1] = make_float4(0,1,0,0);
    m.m_row[2] = make_float4(0,0,1,0);
    return m;
}

__device__
    Matrix3x3_d getTranspose(const Matrix3x3_d m)
{
    Matrix3x3_d out;
    out.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
    out.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
    out.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
    return out;
}

__device__
    Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b)
{
    Matrix3x3_d transB = getTranspose( b );
    Matrix3x3_d ans;
    //        why this doesn't run when 0ing in the for{}
    a.m_row[0].w = 0.f;
    a.m_row[1].w = 0.f;
    a.m_row[2].w = 0.f;
    for(int i=0; i<3; i++)
    {
        //        a.m_row[i].w = 0.f;
        ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
        ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
        ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
        ans.m_row[i].w = 0.f;
    }
    return ans;
}

__device__
    float4 MatrixVectorMul(Matrix3x3_d a, float4 b)
{
    float4 ans;
    ans.x = dot3F4( a.m_row[0], b );
    ans.y = dot3F4( a.m_row[1], b );
    ans.z = dot3F4( a.m_row[2], b );
    ans.w = 0.f;
    return ans;
}

现在,问题解决了!

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接