Question

我有一个CUDA模板库，其中一个函数实际上不一个模板，但是在DECLARE @XML XML = ' <DTS:Executable xmlns:DTS="www.microsoft.com/SqlServer/Dts" DTS:refId="P" DTS:CreationDate="10/01/2015 12:00:00"> <DTS:ConnectionManagers> <DTS:ConnectionManager DTS:refId="Package.ConnectionManagers[FF]" DTS:CreationName="FLATFILE" DTS:DTSID="{123}" DTS:ObjectName="FF"> <DTS:ObjectData> <DTS:ConnectionManager DTS:Format="Delimited" DTS:LocaleID="1033" DTS:HeaderRowDelimiter="_x000D__x000A_" DTS:ColumnNamesInFirstDataRow="True" DTS:RowDelimiter="" DTS:TextQualifier="_x0022_" DTS:CodePage="1252" DTS:ConnectionString="Test.csv"> <DTS:FlatFileColumns> <DTS:FlatFileColumn DTS:ColumnType="Delimited" DTS:ColumnDelimiter="_x002C_" DTS:DataType="11" DTS:TextQualified="True" DTS:ObjectName="TestCN" DTS:DTSID="{012}" DTS:CreationName="" /> </DTS:FlatFileColumns> </DTS:ConnectionManager> </DTS:ObjectData> </DTS:ConnectionManager> </DTS:ConnectionManagers> </DTS:Executable>' ; WITH XMLNAMESPACES (N'www.microsoft.com/SqlServer/Dts' as DTS ) SELECT y.vals.query('.') AS NodesAsExtracted , x.vals.value('@DTS:CreationName', 'Varchar(255)') AS CreationName , x.vals.value('@DTS:ObjectName', 'Varchar(255)') AS ObjectName , y.vals.value('@DTS:ConnectionString', 'Varchar(255)') AS ConnectionString , x.vals.value('@DTS:ColumnType', 'Varchar(255)') AS ColumnType , x.vals.value('@DTS:MaximumWidth', 'Varchar(255)') AS MaximumWidth FROM @XML.nodes('/DTS:Executable/DTS:ConnectionManagers/DTS:ConnectionManager/DTS:ObjectData/DTS:ConnectionManager') AS y(vals) CROSS APPLY @XML.nodes('/DTS:Executable/DTS:ConnectionManagers/DTS:ConnectionManager/DTS:ObjectData/DTS:ConnectionManager/DTS:FlatFileColumns/DTS:FlatFileColumn') AS x(vals) /* The key piece is you are extracting data with a namespace, which makes things harder when querying. 
You need to repeat certain 'nodes' so there is a syntax for that called originally enough 'nodes' that breaks up a 3d object like xml into multiple bits I do one for the high level and one for the lower and then cross apply them which really is a whole world into itself I won't mention here It should be represented as a parent 'x' and the values found 'vals' I showed an example as is first when I query '('.')' which is everything in essence. My namespace declaration must match on the xml that exists and the declaration. more on nodes https://msdn.microsoft.com/en-us/library/ms188282.aspx more on query https://msdn.microsoft.com/en-us/library/ms191474.aspx more on value https://msdn.microsoft.com/en-us/library/ms178030.aspx */.cuh 头文件中定义的。（即下面 kernel.cuh 中的 vector_add_kernel。）

如果多个 .cu 文件都包含 kernel.cuh 并调用 vector_add[_kernel]，就会在链接时出现多重定义错误。在 C++ 中，可以使用 inline 限定符来避免此类错误。

但是，inline __global__ ... 虽然在我的系统上消除了多重定义错误，却会产生一条警告：inline 限定符已被忽略。

问：是否有更好的方法可以避免多重定义错误，或者只针对这个函数抑制该警告？inline __global__ 是否安全，还是说其他主机编译器可能真的会忽略它？

我可以简单地把 vector_add_kernel 移到单独的 .cu 文件中，但那样它就成了库里唯一的非头文件。我也可以把 vector_add_kernel 写成模板，但这在我的库中没有什么意义。

下面是一个（不算很精简，抱歉）可运行的示例（在 Debian 上使用 CUDA 7.0 和 gcc 4.7.2 测试）。

澄清一下：main.cu 是某个用户的代码；lib.cu 是某个不属于我的外部库；而 kernel.cuh 是我的模板库的一部分。因此，外部的 lib 和用户的 main 都在使用我的模板库 kernel.cuh，但两者是各自独立使用的。

main.cu：

main.cu

#include "lib.hpp"
#include "kernel.cuh"

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>

// Test driver that reaches vector_add() through two separate translation
// units: once indirectly via lib_vector_add() (declared in lib.hpp) and once
// directly from this file via kernel.cuh.
//
// Fills a[i] = b[i] = 2*i for N = 128 elements, runs both paths and prints a
// line for every element whose sum differs from 2*i + 2*i.
//
// Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE otherwise.
int main(void)
{
    const size_t N = 1u << 7;

    // std::vector owns the host buffers, so nothing leaks on any exit path
    // (the buffers were previously malloc'ed and never freed or null-checked).
    std::vector<float> a(N), b(N), c(N);
    for (size_t i = 0; i < N; ++i) {
        a[i] = b[i] = 2.0f * i;
    }

    bool ok = true;

    // Path 1: through the separately compiled library wrapper.
    // &v[0] rather than v.data() keeps this valid before C++11; N > 0 here.
    lib_vector_add(&a[0], &b[0], &c[0], N);
    for (size_t i = 0; i < N; ++i) {
        // Exact comparison is intentional: 4*i is exactly representable in
        // float for every i < 128.
        if (c[i] != 2.0f * i + 2.0f * i) {
            std::cout << "Error, lib, element " << i << std::endl;
            ok = false;
        }
    }

    // Path 2: calling vector_add() directly from this translation unit.
    thrust::device_vector<float> d_a(a.begin(), a.end());
    thrust::device_vector<float> d_b(b.begin(), b.end());
    thrust::device_vector<float> d_c(N);

    vector_add(d_a, d_b, d_c);

    // Assigning a device_vector to a host_vector copies the result back.
    thrust::host_vector<float> h_c = d_c;
    for (size_t i = 0; i < N; ++i) {
        if (h_c[i] != 2.0f * i + 2.0f * i) {
            std::cout << "Error, element " << i << std::endl;
            ok = false;
        }
    }

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

lib.cu

#include "lib.hpp"  // own declaration, so a signature mismatch fails to compile

#include <kernel.cuh>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cstddef>

// Element-wise c[i] = a[i] + b[i] for i in [0, N), computed on the device.
//
// a, b : host pointers to at least N readable floats each.
// c    : host pointer to at least N writable floats.
// N    : element count; N == 0 is a no-op.
//
// The inputs are uploaded straight into device vectors and the result is
// copied straight back into c, without intermediate host_vector staging.
void lib_vector_add(float* a, float* b, float* c, size_t N)
{
    if (N == 0) {
        return;  // nothing to add; also avoids launching a zero-block grid
    }

    // Constructing a device_vector from a host pointer range performs the
    // host-to-device copy.
    thrust::device_vector<float> d_a(a, a + N);
    thrust::device_vector<float> d_b(b, b + N);
    thrust::device_vector<float> d_c(N);

    vector_add(d_a, d_b, d_c);

    // Device-to-host copy directly into the caller's buffer.
    thrust::copy(d_c.begin(), d_c.end(), c);
}

lib.hpp

#pragma once #include <cstddef> void lib_vector_add(float*, float*, float*, size_t);

kernel.cuh - 这种写法会导致链接器错误。取消第一个 inline 的注释即可得到可正常工作的代码。

#pragma once

#include <thrust/device_vector.h>

#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <string>

// Element-wise c[i] = a[i] + b[i] for i in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total thread count (blockDim.x * gridDim.x), so
// any grid/block size covers all N elements. Uses no shared memory.
//
// a, b, c must be device pointers to at least N floats each.
//
// NOTE: the `inline` below is deliberately left commented out. This header
// demonstrates the multiple-definition link error that occurs when several
// translation units include it.
//
// inline keyword avoids multiple definition errors, but produces warnings.
// UNCOMMENT TO GET A WORKING EXECUTABLE.
// inline
__global__
void vector_add_kernel(
    const float *const a,
    const float *const b,
    float *const c,
    const size_t N)
{
    // size_t indices: no signed/unsigned comparison against N and no int
    // overflow for large N.
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < N;
         i += stride) {
        c[i] = a[i] + b[i];
    }
}

// Host wrapper: d_c[i] = d_a[i] + d_b[i] for i in [0, d_a.size()).
//
// d_a : first operand; its size determines the element count N.
// d_b : second operand; must hold at least N elements.
// d_c : output; must hold at least N elements.
//
// Throws std::invalid_argument if d_b or d_c is shorter than d_a, and
// std::runtime_error if the kernel launch is rejected. An empty d_a is a
// no-op. The launch goes to the default stream and is not synchronized here.
//
// inline produces no warnings.
inline
void vector_add(
    const thrust::device_vector<float>& d_a,
    const thrust::device_vector<float>& d_b,
    thrust::device_vector<float>& d_c)
{
    const size_t N = d_a.size();
    if (d_b.size() < N || d_c.size() < N) {
        throw std::invalid_argument(
            "vector_add: d_b and d_c must hold at least d_a.size() elements");
    }
    if (N == 0) {
        return;  // a zero-block grid is an invalid launch configuration
    }

    const float *const a_ptr = thrust::raw_pointer_cast(d_a.data());
    const float *const b_ptr = thrust::raw_pointer_cast(d_b.data());
    float *const c_ptr = thrust::raw_pointer_cast(d_c.data());

    const size_t threads_per_block = 128;
    // 65535 is a grid x-dimension accepted by every compute capability; the
    // strided loop in the kernel covers whatever the capped grid leaves over.
    const size_t max_blocks = 65535;
    const size_t blocks_needed = (N + threads_per_block - 1) / threads_per_block;

    dim3 block(static_cast<unsigned int>(threads_per_block));
    dim3 grid(static_cast<unsigned int>(std::min(blocks_needed, max_blocks)));

    vector_add_kernel<<<grid, block>>>(a_ptr, b_ptr, c_ptr, N);

    // A launch does not return a status; configuration errors are only
    // visible through cudaGetLastError().
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        throw std::runtime_error(
            std::string("vector_add: kernel launch failed: ") +
            cudaGetErrorString(err));
    }
}

Makefile

Answer 1

如果您想保留当前的代码组织方式，有一个非常简单的解决方案：把内核声明为 static（代替您的 inline 关键字）。这样链接器就不会再报错，但有多少个编译单元（目标文件）包含了 kernel.cuh，就会生成多少份该内核的副本。

另一个解决方案是把您的内核模板化。我知道您已经排除了这种可能性，但您应该重新考虑一下，因为您的内核天然适合以输入参数的 float 类型作为模板参数……

'inline'用于__global__函数以避免多重定义错误

1 个答案:

'inline'用于__global__函数以避免多重定义错误

1 个答案:

'inline'用于__global__函数以避免多重定义错误