我有一个CUDA模板库,其中一个函数实际上不是一个模板,但是在DECLARE @XML XML = '
<DTS:Executable xmlns:DTS="www.microsoft.com/SqlServer/Dts" DTS:refId="P" DTS:CreationDate="10/01/2015 12:00:00">
<DTS:ConnectionManagers>
<DTS:ConnectionManager DTS:refId="Package.ConnectionManagers[FF]" DTS:CreationName="FLATFILE" DTS:DTSID="{123}" DTS:ObjectName="FF">
<DTS:ObjectData>
<DTS:ConnectionManager DTS:Format="Delimited" DTS:LocaleID="1033" DTS:HeaderRowDelimiter="_x000D__x000A_" DTS:ColumnNamesInFirstDataRow="True" DTS:RowDelimiter="" DTS:TextQualifier="_x0022_" DTS:CodePage="1252" DTS:ConnectionString="Test.csv">
<DTS:FlatFileColumns>
<DTS:FlatFileColumn DTS:ColumnType="Delimited" DTS:ColumnDelimiter="_x002C_" DTS:DataType="11" DTS:TextQualified="True" DTS:ObjectName="TestCN" DTS:DTSID="{012}" DTS:CreationName="" />
</DTS:FlatFileColumns>
</DTS:ConnectionManager>
</DTS:ObjectData>
</DTS:ConnectionManager>
</DTS:ConnectionManagers>
</DTS:Executable>'
;
-- Lists every flat-file column of the package stored in @XML, one row per
-- column, together with the connection string of the connection manager that
-- owns the column.
--   y(vals): the inner DTS:ConnectionManager element (carries ConnectionString).
--   x(vals): a DTS:FlatFileColumn element below that connection manager.
-- Attributes that are absent from the XML (e.g. DTS:MaximumWidth in the sample
-- document) come back as NULL.
WITH XMLNAMESPACES (N'www.microsoft.com/SqlServer/Dts' as DTS )
SELECT
y.vals.query('.') AS NodesAsExtracted
, x.vals.value('@DTS:CreationName', 'Varchar(255)') AS CreationName
, x.vals.value('@DTS:ObjectName', 'Varchar(255)') AS ObjectName
, y.vals.value('@DTS:ConnectionString', 'Varchar(255)') AS ConnectionString
, x.vals.value('@DTS:ColumnType', 'Varchar(255)') AS ColumnType
, x.vals.value('@DTS:MaximumWidth', 'Varchar(255)') AS MaximumWidth
FROM @XML.nodes('/DTS:Executable/DTS:ConnectionManagers/DTS:ConnectionManager/DTS:ObjectData/DTS:ConnectionManager') AS y(vals)
-- Shred the columns relative to the current connection manager (y.vals) rather
-- than from the document root again; an absolute path here would pair every
-- column with every connection manager when the package has more than one.
CROSS APPLY y.vals.nodes('DTS:FlatFileColumns/DTS:FlatFileColumn') AS x(vals)
/*
The key point is that you are extracting data from XML that uses a namespace, which makes querying harder.
To get at repeating elements there is a method called, originally enough, 'nodes', which breaks up a hierarchical object like XML into multiple rows.
I call it once for the higher level and once for the lower level and then CROSS APPLY them (CROSS APPLY is a whole world in itself that I won't go into here).
Each 'nodes' result is aliased as a table ('x' or 'y') with a single column ('vals') that holds the nodes found.
I first show the extracted nodes as-is by querying '.', which in essence returns everything.
The namespace in my WITH XMLNAMESPACES declaration must match the namespace declared in the XML itself.
more on nodes https://msdn.microsoft.com/en-us/library/ms188282.aspx
more on query https://msdn.microsoft.com/en-us/library/ms191474.aspx
more on value https://msdn.microsoft.com/en-us/library/ms178030.aspx
*/
.cuh 标头中定义的。(即下面 kernel.cuh 中的 vector_add_kernel。)
如果多个 .cu 文件包含 kernel.cuh 并且调用 vector_add[_kernel],则会在链接时导致多重定义错误。在 C++ 中,可以使用 inline 限定符来避免此类错误。
但是,inline __global__ ... 虽然在我的系统上阻止了多重定义错误,却会发出警告,提示 inline 限定符已被忽略。
问:是否有更好的方法可以避免多重定义错误,或者仅针对此函数禁止此警告?inline __global__ 是否安全,还是其他主机编译器可能真的会忽略它?
我可以简单地将 vector_add_kernel 移动到单独的 .cu 文件,但那样它将是唯一的非头文件。我也可以将 vector_add_kernel 模板化,但这在我的库中没什么意义。
一个(不那么小,很抱歉)工作示例(使用CUDA 7.0测试,Debian上的gcc 4.7.2测试)如下所示。
澄清一下,main.cu 是某个用户的代码;lib.cu 是某个不属于我的外部库;而 kernel.cuh 是我的模板库的一部分。因此,外部的 lib 和用户的 main 都在使用我的模板库 kernel.cuh——但是各自独立使用。
kernel.cuh
:
main.cu
#include "lib.hpp"
#include "kernel.cuh"
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <cstddef>
#include <cstdlib>
#include <iostream>
// Demo driver: fills two host arrays with a[i] = b[i] = 2*i, adds them once
// through lib_vector_add (raw host pointers) and once through vector_add
// (thrust device vectors), and prints a message for every element whose sum
// differs from the expected 2*i + 2*i.
//
// Returns EXIT_FAILURE if a host buffer cannot be allocated, 0 otherwise
// (element mismatches are reported on stdout only).
int main(void)
{
    const size_t N = 1u << 7;

    float* a = (float*) malloc(N * sizeof(float));
    float* b = (float*) malloc(N * sizeof(float));
    float* c = (float*) malloc(N * sizeof(float));
    if (a == NULL || b == NULL || c == NULL) {
        std::cerr << "Error, host allocation failed" << std::endl;
        // free(NULL) is a no-op, so releasing all three is safe here.
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // size_t index: matches the type of N and avoids a signed/unsigned compare.
    for (size_t i = 0; i < N; ++i) {
        a[i] = b[i] = 2.0f * i;
    }

    // Path 1: the external library entry point working on raw host pointers.
    lib_vector_add(a, b, c, N);
    for (size_t i = 0; i < N; ++i) {
        if (c[i] != 2.0f * i + 2.0f * i)
            std::cout << "Error, lib, element " << i << std::endl;
    }

    // Path 2: calling the header-defined wrapper directly on device vectors.
    {
        thrust::device_vector<float> d_a(a, a + N);
        thrust::device_vector<float> d_b(b, b + N);
        thrust::device_vector<float> d_c(N);
        vector_add(d_a, d_b, d_c);
        thrust::host_vector<float> h_c = d_c;
        for (size_t i = 0; i < N; ++i) {
            if (h_c[i] != 2.0f * i + 2.0f * i)
                std::cout << "Error, element " << i << std::endl;
        }
    }

    // Release the host buffers that the original version leaked.
    free(a);
    free(b);
    free(c);
    return 0;
}
,
lib.cu
#include <kernel.cuh>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
// Adds two host arrays element-wise on the GPU: c[i] = a[i] + b[i] for
// i in [0, N).
//
// a, b : host pointers to at least N readable floats.
// c    : host pointer to at least N writable floats; receives the sums.
// N    : number of elements to process.
//
// The inputs are copied into thrust device vectors, summed with vector_add,
// and the result is copied back into c.
void lib_vector_add(float* a, float* b, float* c, size_t N)
{
    // Build the device vectors straight from the host ranges; an intermediate
    // host_vector copy of each input is not needed.
    thrust::device_vector<float> d_a(a, a + N);
    thrust::device_vector<float> d_b(b, b + N);
    thrust::device_vector<float> d_c(N);

    vector_add(d_a, d_b, d_c);

    // One bulk device-to-host transfer, then a plain host-side copy into the
    // caller's buffer.
    thrust::host_vector<float> h_c = d_c;
    for (size_t i = 0; i < N; ++i)  // size_t: same type as N
    {
        c[i] = h_c[i];
    }
}
,
lib.hpp
#pragma once
#include <cstddef>
// Element-wise sum of two host arrays: for the parameters (a, b, c, N) the
// implementation writes a[i] + b[i] into c[i] for every i in [0, N).
void lib_vector_add(float*, float*, float*, size_t);
kernel.cuh —— 此形式会导致链接器错误。取消注释第一个 inline 以获取可正常工作的代码。
#pragma once
#include <thrust/device_vector.h>
#include <cstddef>
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, N).
//
// a, b : device pointers to at least N readable floats.
// c    : device pointer to at least N writable floats.
// N    : number of elements.
//
// Each thread starts at its global index and advances by the total number of
// launched threads (blockDim.x * gridDim.x), so any 1D launch configuration
// covers the whole range. Only the x dimension of the grid/block is used.
//
// NOTE: this non-template kernel is defined in a header. Without a linkage
// qualifier, every .cu file that includes the header emits its own definition
// and the link fails with multiple-definition errors.
// inline keyword avoids multiple definition errors, but produces warnings.
// UNCOMMENT TO GET A WORKING EXECUTABLE.
// inline
__global__ void vector_add_kernel(
const float *const a,
const float *const b,
float *const c,
const size_t N)
{
    // Index and stride are computed in size_t (the type of N): an int index
    // would overflow, and compare signed against unsigned, once the element
    // count or the launched thread count exceeds INT_MAX.
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t tid = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x;
         tid < N;
         tid += stride)
    {
        c[tid] = a[tid] + b[tid];
    }
}
// Host wrapper that launches vector_add_kernel to compute d_c = d_a + d_b
// element-wise.
//
// d_a, d_b : input device vectors.
// d_c      : output device vector.
//
// The element count is taken from d_a.size(); d_b and d_c must hold at least
// that many elements, because the kernel indexes all three arrays up to that
// count. The launch uses blocks of 128 threads on the default stream and is
// not followed by an explicit synchronisation.
// inline produces no warnings.
inline
void vector_add(
const thrust::device_vector<float>& d_a,
const thrust::device_vector<float>& d_b,
thrust::device_vector<float>& d_c)
{
    const size_t N = d_a.size();
    // An empty input would yield a grid of 0 blocks, which is an invalid
    // launch configuration; there is nothing to compute, so return early.
    if (N == 0)
    {
        return;
    }

    const float *const a_ptr = thrust::raw_pointer_cast(d_a.data());
    const float *const b_ptr = thrust::raw_pointer_cast(d_b.data());
    float *const c_ptr = thrust::raw_pointer_cast(d_c.data());

    dim3 block(128);
    dim3 grid((N + 127) / 128);  // ceil(N / 128) blocks
    vector_add_kernel<<<grid, block>>>(a_ptr, b_ptr, c_ptr, N);
}
Makefile
答案 0 :(得分:4)
如果您想保留当前的代码组织,那么有一个非常简单的解决方案,即将您的内核声明为 static(代替您的 inline 关键字)。这将阻止链接器报错,但是有多少个包含 kernel.cuh 的编译单元(目标文件),就会生成多少份不同的内核副本。
另一个解决方案是将你的内核模板化(templatise)。我知道你已经否定了这种可能性,但你应该重新考虑它,因为你的内核天然适合以输入参数的 float 类型作为模板参数......