当我尝试编译以前工作过的代码时,我在VS2012中发现了这个奇怪的错误(在我看来)。我使用CUDA生成一个2D数据数组,我的目标是将其写入文本文件......但是当我在主函数末尾附加这个代码片段时
// basic file operations
#include <iostream>
#include <fstream>
using namespace std;
int main () {
ofstream myfile;
myfile.open ("example.txt");
myfile << "Writing this to a file.\n";
myfile.close();
return 0;
}
我得到了
1> C:\Users\Karsten Chu\New Google Drive\Research\Visual Studio 2012\Projects\Dynamic Parallelism Test\Dynamic Parallelism Test>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -dlink -o "x64\Debug\Dynamic Parallelism Test.device-link.obj" -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\x64" cuda.lib cudart.lib cudadevrt.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib -gencode=arch=compute_35,code=sm_35 -G --machine 64 "x64\Debug\CUDA Test 2.cu.obj" "x64\Debug\CUDA Test.cu.obj" "x64\Debug\RKF5 Prototype 2.cu.obj" x64\Debug\version.cu.obj
1>nvlink : error : Undefined reference to '_ZTVSo__St14basic_ofstreamIcSt11char_traitsIcEE' in 'x64/Debug/RKF5 Prototype 2.cu.obj'
1>nvlink : error : Undefined reference to '_ZTVSt9basic_iosIcSt11char_traitsIcEE__So__St14basic_ofstreamIcS1_E' in 'x64/Debug/RKF5 Prototype 2.cu.obj'
1>C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V110\BuildCustomizations\CUDA 5.5.targets(668,9): error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -dlink -o "x64\Debug\Dynamic Parallelism Test.device-link.obj" -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\x64" cuda.lib cudart.lib cudadevrt.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib -gencode=arch=compute_35,code=sm_35 -G --machine 64 "x64\Debug\CUDA Test 2.cu.obj" "x64\Debug\CUDA Test.cu.obj" "x64\Debug\RKF5 Prototype 2.cu.obj" x64\Debug\version.cu.obj" exited with code -1.
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========
现在我的理解是,nvlink与我的代码的那部分CUDA链接有关...为什么我的代码的这两个方面会干扰?我认为这些错误意味着需要添加一个不在我的项目设置中的库,或者函数定义的参数和原型不匹配。
修改
这里是我的代码的#includes和main()...所有的CUDA内容都发布在我之前的问题中。我的编译器选项我不确定如何从错误代码中获得。该项目只是一个Win32控制台应用程序,我只有一个源文件,这个RKF5 Prototype 2.cu文件。我尝试了一个单独的新项目,代码编译也很适合我。
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
//#include <stdio.h>
#include <iostream>
#include <fstream>
//#include <iomanip> //display 2 decimal places
#include <math.h>
using namespace std;
__global__ void rkf5(double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*, int*, size_t, double*, double*, double*);
__global__ void calcK(double*, double*, double*);
__global__ void k1(double*, double*, double*);
__global__ void k2(double*, double*, double*);
__global__ void k3(double*, double*, double*);
__global__ void k4(double*, double*, double*);
__global__ void k5(double*, double*, double*);
__global__ void k6(double*, double*, double*);
__global__ void arrAdd(double*, double*, double*);
__global__ void arrSub(double*, double*, double*);
__global__ void arrMult(double*, double*, double*);
__global__ void arrInit(double*, double);
__global__ void arrCopy(double*, double*);
__device__ void setup(double , double*, double*, double*, double*, int*);
__device__ double flux(int, double*) ;
__device__ double knowles_flux(int, double*);
__device__ void calcStepSize(double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*);
__global__ void storeConcs(double*, size_t, double*, int);
__global__ void takeFourthOrderStep(double*, double*, double*, double*, double*, double*, double*);
__global__ void takeFifthOrderStep(double*, double*, double*, double*, double*, double*, double*, double*);
//Error checking that I don't understand yet.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
//Main program.
int main(int argc, char** argv)
{
//std::cout << std::fixed; //display 2 decimal places
//std::cout << std::setprecision(12); //display 2 decimal places
const int maxlength = 1; //Number of discrete concentrations we are tracking.
double concs[maxlength]; //Meant to store the current concentrations
double temp1[maxlength]; //Used as a bin to store products of Butcher's tableau and k values.
double temp2[maxlength]; //Used as a bin to store products of Butcher's tableau and k values.
double tempsum[maxlength]; //Used as a bin to store cumulative sum of tableau and k values
double k1s[maxlength];
double k2s[maxlength];
double k3s[maxlength];
double k4s[maxlength];
double k5s[maxlength];
double k6s[maxlength];
const int numpoints = 40;
double to = 0;
double tf = 1;
//double dt = static_cast<double>(.5)/static_cast<double>(64);
double dt = (tf-to)/static_cast<double>(numpoints);
double mo = 1;
double concStorage[maxlength][numpoints]; //Stores concs vs. time
//Initialize all the arrays on the host to ensure arrays of 0's are sent to the device.
//Also, here is where we can seed the system.
std::cout<<dt;
std::cout<<"\n";
concs[0]=mo;
std::cout<<concs[0];
std::cout<<" ";
for (int i=0; i<maxlength; i++)
{
for (int j=0; j<numpoints; j++)
concStorage[i][j]=0;
concs[i]=0;
temp1[i]=0;
temp2[i]=0;
tempsum[i]=0;
k1s[i]=0;
k2s[i]=0;
k3s[i]=0;
k4s[i]=0;
k5s[i]=0;
k6s[i]=0;
std::cout<<concs[i];
std::cout<<" ";
}
concs[0]=mo;
std::cout<<"\n";
//Define all the pointers to device array memory addresses. These contain the on-GPU
//addresses of all the data we're generating/using.
double *d_concs;
double *d_temp1;
double *d_temp2;
double *d_tempsum;
double *d_k1s;
double *d_k2s;
double *d_k3s;
double *d_k4s;
double *d_k5s;
double *d_k6s;
double *d_dt;
int *d_maxlength;
int *d_numpoints;
double *d_to;
double *d_tf;
double *d_concStorage;
//Calculate all the sizes of the arrays in order to allocate the proper amount of memory on the GPU.
size_t size_concs = sizeof(concs);
size_t size_temp1 = sizeof(temp1);
size_t size_temp2 = sizeof(temp2);
size_t size_tempsum = sizeof(tempsum);
size_t size_ks = sizeof(k1s);
size_t size_maxlength = sizeof(maxlength);
size_t size_numpoints = sizeof(numpoints);
size_t size_dt = sizeof(dt);
size_t size_to = sizeof(to);
size_t size_tf = sizeof(tf);
size_t h_pitch = numpoints*sizeof(double);
size_t d_pitch;
//Calculate the "pitch" of the 2D array. The pitch is basically the length of a 2D array's row. IT's larger
//than the actual row full of data due to hadware issues. We thusly will use the pitch instead of the data
//size to traverse the array.
gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, numpoints * sizeof(double), maxlength));
//Allocate memory on the GPU for all the arrrays we're going to use in the integrator.
gpuErrchk(cudaMalloc((void**)&d_concs, size_concs));
gpuErrchk(cudaMalloc((void**)&d_temp1, size_temp1));
gpuErrchk(cudaMalloc((void**)&d_temp2, size_temp1));
gpuErrchk(cudaMalloc((void**)&d_tempsum, size_tempsum));
gpuErrchk(cudaMalloc((void**)&d_k1s, size_ks));
gpuErrchk(cudaMalloc((void**)&d_k2s, size_ks));
gpuErrchk(cudaMalloc((void**)&d_k3s, size_ks));
gpuErrchk(cudaMalloc((void**)&d_k4s, size_ks));
gpuErrchk(cudaMalloc((void**)&d_k5s, size_ks));
gpuErrchk(cudaMalloc((void**)&d_k6s, size_ks));
gpuErrchk(cudaMalloc((void**)&d_maxlength, size_maxlength));
gpuErrchk(cudaMalloc((void**)&d_numpoints, size_numpoints));
gpuErrchk(cudaMalloc((void**)&d_dt, size_dt));
gpuErrchk(cudaMalloc((void**)&d_to, size_to));
gpuErrchk(cudaMalloc((void**)&d_tf, size_tf));
//Copy all initial values of arrays to GPU.
gpuErrchk(cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_temp1, &temp1, size_temp1, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_temp2, &temp2, size_temp2, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_tempsum, &tempsum, size_tempsum, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_k1s, &k1s, size_ks, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_k2s, &k2s, size_ks, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_k3s, &k3s, size_ks, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_k4s, &k4s, size_ks, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_k5s, &k5s, size_ks, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_k6s, &k6s, size_ks, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_maxlength, &maxlength, size_maxlength, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_numpoints, &numpoints, size_numpoints, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_to, &to, size_to, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_tf, &tf, size_tf, cudaMemcpyHostToDevice));
//Run the integrator.
rkf5<<<1,1>>>(d_concs, d_concStorage, d_temp1, d_temp2, d_tempsum, d_k1s, d_k2s, d_k3s, d_k4s, d_k5s, d_k6s, d_maxlength, d_numpoints, d_pitch, d_dt, d_to, d_tf);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaDeviceSynchronize();
/*
//Sets all of concStorage to 1 after the kernel runs. Used to make sure that 2D array copied over the array.
std::cout << "\n";
for (int i=0; i<maxlength; i++)
for(int j=0; j<numpoints; j++)
concStorage[i][j]=1;
*/
//Copy concentrations from GPU to Host. Almost defunct now that transferring the 2D array works.
cudaMemcpy(concs, d_concs, size_concs, cudaMemcpyDeviceToHost);
//Copy 2D array of concentrations vs. time from GPU to Host.
gpuErrchk( cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyDeviceToHost) );
//Print concentrations after the integrator kernel runs. Used to test that data was transferring to and from GPU correctly.
std::cout << "\n";
for (int i=0; i<maxlength; i++)
{
std::cout<<concs[i];
std::cout<<" ";
}
double a[10];
double b[10];
double c[10];
for(int i = 0; i< 10; i++)
{
a[i]=0;
b[i]=0;
c[i]=0;
}
//Print out the concStorage array after the kernel runs. Used to test that the 2D array transferred correctly from host to GPU and back.
std::cout << "\n\n";
std::cout << "Calculated Array";
std::cout << "\n\n";
for (int i=0; i<maxlength; i++)
{
for(int j=0; j<numpoints; j++)
{
if (j%(numpoints/10)==0)
{
a[j/(numpoints/10)]=concStorage[i][j];
std::cout<<concStorage[i][j];
std::cout<<" ";
}
}
std::cout << "\n";
}
std::cout << "\n";
std::cout << "Exponential";
std::cout << "\n\n";
for (int i=0; i<10; i++)
{
b[i]=exp(-i*(tf-to)/10);
std::cout<<exp(-i*(tf-to)/10);
std::cout<<" ";
}
std::cout << "\n\n";
std::cout << "Error Array";
std::cout << "\n\n";
for (int i=0; i<10; i++)
{
c[i]=a[i]-b[i];
std::cout<<c[i];
std::cout<<" ";
}
std::cout << "\n\n";
cudaDeviceReset(); //Clean up all memory.
///*
ofstream myfile;
myfile.open ("example.txt");
myfile << "Writing.";
myfile.close();
//*/
return 0;
}
答案 0 :(得分:4)
您的简单代码
#include <iostream>
#include <fstream>
using namespace std;
int main () {
ofstream myfile;
myfile.open ("example.txt");
myfile << "Writing this to a file.\n";
myfile.close();
return 0;
}
使用-rdc=true
选项编译的给出了您报告的nvlink错误。这可能是由于未能使用静态加载的C运行时来匹配CUDA运行时。这可以通过
修复Project -> Properties -> Configuration Properties -> CUDA C/C++ -> Host -> Runtime Library
并选择
Multi-Threaded Debug (/MTd)
如果您处于调试模式或
Multi-Threaded (/MT)
如果您处于发布模式。
我希望这可以解决您的问题。