将.cpp文件中的cuda主机代码分开

时间:2014-03-25 11:46:52

标签: visual-c++ cuda

main.cpp

#include<iostream>
#include "cuda.h"


using namespace std;
void cuda_calculation();


// Program entry point: hands control to cuda_calculation() (forward-declared
// above) and then reports success to the operating system.
int main()
{
    const int exit_status = 0;  // conventional "success" exit code

    cuda_calculation();

    return exit_status;
}

cu.h

// Host-callable wrapper, defined in cu.cu, that launches the square_array kernel.
// Arguments in order, as supplied by the caller in cuda.cpp: number of blocks,
// threads per block, pointer to the float array handed to the kernel, and the
// number of elements in that array.
void call(int , int ,float*  , int  );

cuda.cpp

#include <stdio.h>
#include <cuda.h>
#include "cu.h"




// Host-side driver for the example: fills a 10-element float array with
// 0..9, copies it to the device, asks call() to run the kernel on it, copies
// the array back and prints one "index value" line per element.
// None of the CUDA return codes (cudaMalloc / cudaMemcpy / cudaFree) and not
// the malloc result are checked in this version.
void cuda_calculation()
{
  float *a_h, *a_d;  // Pointer to host & device arrays
  const int N = 10;  // Number of elements in arrays
  size_t size = N * sizeof(float);    // Size of each array in bytes
  a_h = (float *)malloc(size);        // Allocate array on host
  cudaMalloc((void **) &a_d, size);   // Allocate array on device
  // Initialize host array and copy it to CUDA device
  for (int i=0; i<N; i++) a_h[i] = (float)i;
  cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
  // Do calculation on device:
  int block_size = 4;
  // Number of blocks = N / block_size, rounded up so that every element is covered
  int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
  // NOTE(review): the leading `void` makes this line a (malformed) declaration
  // rather than a call, and `&a_d` is a float** while the prototype in cu.h
  // takes a float*. The intended statement appears to be
  // `call(n_blocks, block_size, a_d, N);` - confirm.
  void call(n_blocks, block_size,&a_d, N);
  /*square_array <<< n_blocks, block_size >>> (a_d, N);*/
  // Retrieve result from device and store it in host array
  cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
  // Print results
  for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
  // Cleanup
  free(a_h); cudaFree(a_d);
}

cu.cu

#include <stdio.h>
#include "cu.h"
#include <cuda.h>

// Kernel that executes on the CUDA device
// Device kernel: squares, in place, the element of `a` selected by this
// thread's global index.
//   a - array the kernel reads from and writes to
//   N - number of elements in `a`; threads whose index is >= N do nothing
// Uses only the x dimension of the block and thread indices.
__global__ void square_array(float *a, int N)
{
  const int i = threadIdx.x + blockDim.x * blockIdx.x;
  if (i >= N) return;  // surplus threads in the last block have no element

  const float v = a[i];
  a[i] = v * v;
}


//}


// Host-callable wrapper that launches the square_array kernel, so that code
// compiled as plain C++ (without the <<< >>> syntax) can start it.
//   a - number of blocks in the grid
//   b - number of threads per block
//   c - array passed to the kernel (squared in place)
//   d - number of elements in `c`
// The launch configuration now comes from `a` and `b`; previously it was
// hard-coded to 3 blocks of 4 threads and the two arguments were ignored.
// The launch result is not checked here; a caller can query it with
// cudaGetLastError() after this function returns.
void call(int a,int b,float* c,int d)
{
  square_array <<< a, b >>> (c, d);
}

我尝试将内核代码和主机代码分隔在cpp文件中,但是会出现以下错误:

Error: 'cudaMemcpy': identifier not found (the other CUDA-related identifiers are not recognized either).

How can I use the CUDA-related identifiers in a .cpp file and call the kernel functions from it?

2 个答案:

答案 0 :(得分:1)

有一些错误:void cuda_calculation();需要通过头文件(cu.h)对main.cpp可见。

同样,请确保使用 nvcc 来编译 .cu 文件,而不是把它当作标准 C++ 文件来编译。使用 CUDA 编译规则可以简化此过程(默认情况下作为 CUDA 工具包的一部分安装)

答案 1 :(得分:0)

经过长时间的试验,我得到了正确的输出,

要在cpp文件中包含cuda标识符,我们不仅需要包含 cuda.h ,还需要包含 cuda_runtime.h

cuda.cpp 如下:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include<cuda_runtime.h>

#include "cu.h"
#include "cud.h"





//void call(int , int ,float * , int  );

// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return true;
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Host-side driver for the example: fills a 10-element float array with
// 0..9, copies it to the device, asks call() (declared in cu.h) to run the
// kernel on it, copies the array back and prints one "index value" line per
// element.
// Every allocation and CUDA runtime call is checked. On the first failure a
// message is written to stderr, the results are not printed, and whatever
// was allocated is still released before returning.
void cuda_calculation()
{
  const int N = 10;                       // Number of elements in arrays
  const size_t size = N * sizeof(float);  // Size of each array in bytes

  float *a_h = (float *)malloc(size);     // Host array
  float *a_d = NULL;                      // Device array (NULL until allocated)
  if (a_h == NULL) {
    fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
    return;
  }

  // Initialize host array
  for (int i = 0; i < N; i++) a_h[i] = (float)i;

  // Allocate the device array and copy the input to it
  bool ok = cuda_ok(cudaMalloc((void **)&a_d, size), "cudaMalloc")
         && cuda_ok(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
                    "cudaMemcpy (host to device)");

  if (ok) {
    // Do calculation on device: enough blocks of block_size threads to cover N
    const int block_size = 4;
    const int n_blocks = (N + block_size - 1) / block_size;
    call(n_blocks, block_size, a_d, N);

    // A kernel launch returns no status itself: cudaGetLastError() reports a
    // rejected launch, and the blocking copy back reports execution errors.
    ok = cuda_ok(cudaGetLastError(), "kernel launch")
      && cuda_ok(cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost),
                 "cudaMemcpy (device to host)");
  }

  // Print results only when they are known to be valid
  if (ok) {
    for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);
  }

  // Cleanup (cudaFree on a null pointer performs no operation)
  free(a_h);
  cuda_ok(cudaFree(a_d), "cudaFree");
}

所以其他文件是

main.cpp

#include<iostream>
#include "cud.h"


using namespace std;



// Program entry point: hands control to cuda_calculation() (declared in
// cud.h) and then reports success to the operating system.
int main()
{
    const int exit_status = 0;  // conventional "success" exit code

    cuda_calculation();

    return exit_status;
}

cud.h

// Host-side driver defined in cuda.cpp: prepares the data, runs the kernel
// through call(), and prints the results. Declared here so main.cpp can call it.
void cuda_calculation();

cu.h

// Host-callable wrapper, defined in cu.cu, that launches the square_array kernel.
// Arguments in order, as supplied by the caller in cuda.cpp: number of blocks,
// threads per block, pointer to the float array handed to the kernel, and the
// number of elements in that array.
void call(int , int ,float*  , int  );

cu.cu

#include <stdio.h>
#include "cu.h"
#include <cuda.h>

// Kernel that executes on the CUDA device
// Device kernel: each thread squares, in place, the one element of `a` that
// matches its global index.
//   a - array the kernel reads from and writes to
//   N - number of elements in `a`; threads whose index is >= N do nothing
// Uses only the x dimension of the block and thread indices.
__global__ void square_array(float *a, int N)
{
  const int elem = threadIdx.x + blockDim.x * blockIdx.x;
  if (elem >= N) return;  // surplus threads in the last block have no element

  const float value = a[elem];
  a[elem] = value * value;
}


//}


// Host-callable wrapper that launches the square_array kernel, so that code
// compiled as plain C++ (without the <<< >>> syntax) can start it.
//   a - number of blocks in the grid
//   b - number of threads per block
//   c - array passed to the kernel (squared in place)
//   d - number of elements in `c`
// The launch configuration now comes from `a` and `b`; previously it was
// hard-coded to 3 blocks of 4 threads and the two arguments were ignored.
// The launch result is not checked here; a caller can query it with
// cudaGetLastError() after this function returns.
void call(int a,int b,float* c,int d)
{
  square_array <<< a, b >>> (c, d);
}