我尝试使用cuda math api,例如sqrtf(),__ fdividef(),并出现如下错误:
error C3861: '__fdividef':Identifier not found
error C3861: 'rsqrtf':Identifier not found
似乎“NVIDIA CUDA Math API”没有指定当我们想要使用这些api时我们应该包含哪个标头。在helper_math.h中,它看起来像函数,例如
inline __host__ __device__ float length(float4 v)
{
return sqrtf(dot(v, v));
}
使用c库math.h中的sqrtf实现,不是没有设备标识符的主机功能吗?我们可以直接在设备功能中使用它吗?
请指正。感谢。
答案 0 :(得分:2)
以前,我实现了这样的代码:
// This implementation follows the code from
// https://github.com/erwincoumans/experiments/blob/master/opencl/primitives/AdlPrimitives/Math/MathCL.h
#ifndef UNIFIED_MATH_CUDA_H
#define UNIFIED_MATH_CUDA_H
#include "vector_functions.h"
/*****************************************
Vector
/*****************************************/
__device__
float fastDiv(float numerator, float denominator)
{
return __fdividef(numerator, denominator);
//return numerator/denominator;
}
__device__
float getSqrtf(float f2)
{
return sqrtf(f2);
}
__device__
float getReverseSqrt(float f2)
{
return rsqrtf(f2);
}
__device__
float3 getCrossProduct(float3 a, float3 b)
{
return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}
__device__
float4 getCrossProduct(float4 a, float4 b)
{
float3 v1 = make_float3(a.x, a.y, a.z);
float3 v2 = make_float3(b.x, b.y, b.z);
float3 v3 = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
return make_float4(v3.x, v3.y, v3.z, 0.0f);
}
__device__
float getDotProduct(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
__device__
float getDotProduct(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
__device__ float3 getNormalizedVec(const float3 v)
{
float invLen = 1.0f / sqrtf(getDotProduct(v, v));
return make_float3(v.x * invLen, v.y * invLen, v.z * invLen);
}
__device__ float4 getNormalizedVec(const float4 v)
{
float invLen = 1.0f / sqrtf(getDotProduct(v, v));
return make_float4(v.x * invLen, v.y * invLen, v.z * invLen, v.w * invLen);
}
__device__
float dot3F4(float4 a, float4 b)
{
float4 a1 = make_float4(a.x, a.y, a.z,0.f);
float4 b1 = make_float4(b.x, b.y, b.z,0.f);
return getDotProduct(a1, b1);
}
__device__
float getLength(float3 a)
{
return sqrtf(getDotProduct(a, a));
}
__device__
float getLength(float4 a)
{
return sqrtf(getDotProduct(a, a));
}
/*****************************************
Matrix3x3
/*****************************************/
typedef struct
{
float4 m_row[3];
}Matrix3x3_d;
__device__
void setZero(Matrix3x3_d& m)
{
m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
__device__
Matrix3x3_d getZeroMatrix3x3()
{
Matrix3x3_d m;
m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
return m;
}
__device__
void setIdentity(Matrix3x3_d& m)
{
m.m_row[0] = make_float4(1,0,0,0);
m.m_row[1] = make_float4(0,1,0,0);
m.m_row[2] = make_float4(0,0,1,0);
}
__device__
Matrix3x3_d getIdentityMatrix3x3()
{
Matrix3x3_d m;
m.m_row[0] = make_float4(1,0,0,0);
m.m_row[1] = make_float4(0,1,0,0);
m.m_row[2] = make_float4(0,0,1,0);
return m;
}
__device__
Matrix3x3_d getTranspose(const Matrix3x3_d m)
{
Matrix3x3_d out;
out.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
out.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
out.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
return out;
}
__device__
Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b)
{
Matrix3x3_d transB = getTranspose( b );
Matrix3x3_d ans;
// why this doesn't run when 0ing in the for{}
a.m_row[0].w = 0.f;
a.m_row[1].w = 0.f;
a.m_row[2].w = 0.f;
for(int i=0; i<3; i++)
{
// a.m_row[i].w = 0.f;
ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
ans.m_row[i].w = 0.f;
}
return ans;
}
__device__
float4 MatrixVectorMul(Matrix3x3_d a, float4 b)
{
float4 ans;
ans.x = dot3F4( a.m_row[0], b );
ans.y = dot3F4( a.m_row[1], b );
ans.z = dot3F4( a.m_row[2], b );
ans.w = 0.f;
return ans;
}
#endif // UNIFIED_MATH_CUDA_H
由于cuda会自动内联所有设备代码,我忽略了内联标识符。但它出现了我在问题中所说的错误:
error C3861: '__fdividef':Identifier not found
error C3861: 'rsqrtf':Identifier not found
然后我按照传统的c / c ++样式重新实现如下。
UnifiedMathCUDA.cuh:
// This implementation follows the code from
// https://github.com/erwincoumans/experiments/blob/master/opencl/primitives/AdlPrimitives/Math/MathCL.h
#ifndef UNIFIED_MATH_CUDA_H
#define UNIFIED_MATH_CUDA_H
#include "vector_functions.h"
/*****************************************
Vector
/*****************************************/
__device__
float fastDiv(float numerator, float denominator);
__device__
float getSqrtf(float f2);
__device__
float getReverseSqrt(float f2);
__device__
float3 getCrossProduct(float3 a, float3 b);
__device__
float4 getCrossProduct(float4 a, float4 b);
__device__
float getDotProduct(float3 a, float3 b);
__device__
float getDotProduct(float4 a, float4 b);
__device__ float3 getNormalizedVec(const float3 v);
__device__ float4 getNormalizedVec(const float4 v);
__device__
float dot3F4(float4 a, float4 b);
__device__
float getLength(float3 a);
__device__
float getLength(float4 a);
/*****************************************
Matrix3x3
/*****************************************/
typedef struct
{
float4 m_row[3];
}Matrix3x3_d;
__device__
void setZero(Matrix3x3_d& m);
__device__
Matrix3x3_d getZeroMatrix3x3();
__device__
void setIdentity(Matrix3x3_d& m);
__device__
Matrix3x3_d getIdentityMatrix3x3();
__device__
Matrix3x3_d getTranspose(const Matrix3x3_d m);
__device__
Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b);
__device__
float4 MatrixVectorMul(Matrix3x3_d a, float4 b);
#endif // UNIFIED_MATH_CUDA_H
和对应的实现文件是这样的:
#include "UnifiedMathCUDA.cuh"
/*****************************************
Vector
/*****************************************/
__device__
float fastDiv(float numerator, float denominator)
{
return __fdividef(numerator, denominator);
//return numerator/denominator;
}
__device__
float getSqrtf(float f2)
{
return sqrtf(f2);
}
__device__
float getReverseSqrt(float f2)
{
return rsqrtf(f2);
}
__device__
float3 getCrossProduct(float3 a, float3 b)
{
return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}
__device__
float4 getCrossProduct(float4 a, float4 b)
{
float3 v1 = make_float3(a.x, a.y, a.z);
float3 v2 = make_float3(b.x, b.y, b.z);
float3 v3 = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
return make_float4(v3.x, v3.y, v3.z, 0.0f);
}
__device__
float getDotProduct(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
__device__
float getDotProduct(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
__device__ float3 getNormalizedVec(const float3 v)
{
float invLen = 1.0f / sqrtf(getDotProduct(v, v));
return make_float3(v.x * invLen, v.y * invLen, v.z * invLen);
}
__device__ float4 getNormalizedVec(const float4 v)
{
float invLen = 1.0f / sqrtf(getDotProduct(v, v));
return make_float4(v.x * invLen, v.y * invLen, v.z * invLen, v.w * invLen);
}
__device__
float dot3F4(float4 a, float4 b)
{
float4 a1 = make_float4(a.x, a.y, a.z,0.f);
float4 b1 = make_float4(b.x, b.y, b.z,0.f);
return getDotProduct(a1, b1);
}
__device__
float getLength(float3 a)
{
return sqrtf(getDotProduct(a, a));
}
__device__
float getLength(float4 a)
{
return sqrtf(getDotProduct(a, a));
}
/*****************************************
Matrix3x3
/*****************************************/
__device__
void setZero(Matrix3x3_d& m)
{
m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
__device__
Matrix3x3_d getZeroMatrix3x3()
{
Matrix3x3_d m;
m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
return m;
}
__device__
void setIdentity(Matrix3x3_d& m)
{
m.m_row[0] = make_float4(1,0,0,0);
m.m_row[1] = make_float4(0,1,0,0);
m.m_row[2] = make_float4(0,0,1,0);
}
__device__
Matrix3x3_d getIdentityMatrix3x3()
{
Matrix3x3_d m;
m.m_row[0] = make_float4(1,0,0,0);
m.m_row[1] = make_float4(0,1,0,0);
m.m_row[2] = make_float4(0,0,1,0);
return m;
}
__device__
Matrix3x3_d getTranspose(const Matrix3x3_d m)
{
Matrix3x3_d out;
out.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
out.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
out.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
return out;
}
__device__
Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b)
{
Matrix3x3_d transB = getTranspose( b );
Matrix3x3_d ans;
// why this doesn't run when 0ing in the for{}
a.m_row[0].w = 0.f;
a.m_row[1].w = 0.f;
a.m_row[2].w = 0.f;
for(int i=0; i<3; i++)
{
// a.m_row[i].w = 0.f;
ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
ans.m_row[i].w = 0.f;
}
return ans;
}
__device__
float4 MatrixVectorMul(Matrix3x3_d a, float4 b)
{
float4 ans;
ans.x = dot3F4( a.m_row[0], b );
ans.y = dot3F4( a.m_row[1], b );
ans.z = dot3F4( a.m_row[2], b );
ans.w = 0.f;
return ans;
}
现在,问题解决了!