我试图将9个浮点数组从主机复制到常量内存。 我首先在头文件中声明了一个常量数组:
Header.h:
#ifndef __FUNCTIONS_H__
#define __FUNCTIONS_H__
void CudaKernelWrapper( int sample_indices_list_size,
int *gpu_sample_indices,//size:2*Warp_size * Number_multiprocessors=sample_indices_list_size
int localIterationCount,
float * _pts0_d,
float *_pts1_d);
__constant__ float K1_d[9];
__constant__ float K2_d[9];
#endif
在我的主文件中,数组K1_h[9]
和K2_h[9]
是从文本文件加载的。
main.cpp:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <helper_cuda.h>
#include "Header.h"
#define PARAMS_REQUIRED 5
#define DATA_PT_SIZE 4
#define HYPOTHESIS_SIZE 9
#define localIterationCount 32
extern "C" void ReadData(
int nmatches,
float *pts0_h,
float *pts1_h
);
extern "C" void LoadParameters(float* K1_h, float *K2_h);
// Main program
void main(int argc, const char **argv){
float *pts0_h, *pts1_h, *pts0_d,*pts1_d;
int nmatches=1437,i;
float K1_h[9], K2_h[9];
/*Allocate memory*/
pts0_h=(float*)malloc(nmatches*2*sizeof(float));
pts1_h=(float*)malloc(nmatches*2*sizeof(float));
/*Read Data from text Files*/
ReadData(nmatches, pts0_h,pts1_h);
/*allocate memory for data on device*/
checkCudaErrors( cudaMalloc((void **)&pts0_d, sizeof(float)*2*nmatches));
checkCudaErrors( cudaMalloc((void **)&pts1_d, sizeof(float)*2*nmatches));
int sample_indices_list_size=160;
int *sample_indices_list;
int *gpu_sample_indices;
sample_indices_list = (int*)malloc(sample_indices_list_size*sizeof(int));
checkCudaErrors(cudaMalloc(&gpu_sample_indices, sample_indices_list_size*sizeof(int)));
//Random indices
for( i=0; i < sample_indices_list_size; i++) {
sample_indices_list[i] = (int)(nmatches * (rand()/(1.0 + RAND_MAX)));
}
//Normalization Parameters
LoadParameters(K1_h, K2_h);
//Copy normalization parameters to constant memory
checkCudaErrors( cudaMemcpyToSymbol ( K1_d, K1_h, sizeof(float)*9));
checkCudaErrors( cudaMemcpyToSymbol ( K2_d, K2_h, sizeof(float)*9));
//Copy data to device
checkCudaErrors( cudaMemcpy(pts0_d, pts0_h, sizeof(float)*2*nmatches,cudaMemcpyHostToDevice));
checkCudaErrors( cudaMemcpy(pts1_d, pts1_h, sizeof(float)*2*nmatches,cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(gpu_sample_indices, sample_indices_list,sizeof(int)*sample_indices_list_size, cudaMemcpyHostToDevice));
//Launch kernel with a wrapper function
CudaKernelWrapper( sample_indices_list_size,
gpu_sample_indices,
localIterationCount,
pts0_d,
pts1_d);
getLastCudaError("GPU_Kernel failed\n");
free(pts0_h);
free(pts1_h);
free(sample_indices_list);
checkCudaErrors( cudaFree(pts0_d) );
checkCudaErrors( cudaFree(pts1_d) );
checkCudaErrors( cudaFree(gpu_sample_indices));
}
和内核文件。
kernel.cu:
#include <stdio.h>
#include <helper_cuda.h>
#include <float.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Header.h"
#define PARAMS_REQUIRED 5
#define DATA_PT_SIZE 4
__device__ void Normalization (float *src, float *src_norm);
__global__ void CUDA_Estimation_Kernel
(int sample_indices_list_size,
int *gpu_sample_indices,
int localIterationCount,
float *_pts0_d,
float *_pts1_d)
{ int i;
int rand_idx[PARAMS_REQUIRED];
float _src[PARAMS_REQUIRED*DATA_PT_SIZE];
float _src_norm[PARAMS_REQUIRED*DATA_PT_SIZE];
const int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx >= localIterationCount) {
return;
}
#pragma unroll
for (i=0; i<PARAMS_REQUIRED; ++i)
{
rand_idx[i] = gpu_sample_indices[idx*PARAMS_REQUIRED + i];
}
for (i=0; i < PARAMS_REQUIRED; i++) {
_src[DATA_PT_SIZE*i+0]=_pts0_d[(DATA_PT_SIZE/2)*rand_idx[i]+0];
_src[DATA_PT_SIZE*i+1]=_pts0_d[(DATA_PT_SIZE/2)*rand_idx[i]+1];
_src[DATA_PT_SIZE*i+2]=_pts1_d[(DATA_PT_SIZE/2)*rand_idx[i]+0];
_src[DATA_PT_SIZE*i+3]=_pts1_d[(DATA_PT_SIZE/2)*rand_idx[i]+1];
}
Normalization (_src, _src_norm);
}
__device__ void Normalization (float *src, float *src_norm)
{ int i;
for(i=0;i<17;i+=4)
{ src_norm[i]= K1_d[0]*src[i]+K1_d[1]*src[i+1]+K1_d[2];
src_norm[i+1]=K1_d[3]*src[i]+K1_d[4]*src[i+1]+K1_d[5];
src_norm[i+2]=K2_d[0]*src[i+2]+K2_d[1]*src[i+3]+K2_d[2];
src_norm[i+3]=K2_d[3]*src[i+2]+K2_d[4]*src[i+3]+K2_d[5];
}
}
//wrapper
void CudaKernelWrapper(int sample_indices_list_size,
int *gpu_sample_indices,
int localIterationCount,
float * _pts0_d,
float *_pts1_d)
{ CUDA_Estimation_Kernel <<<1,32>>> ( sample_indices_list_size,
gpu_sample_indices,
localIterationCount,
_pts0_d,
_pts1_d);
cudaDeviceSynchronize();
}
编译成功但在运行代码时出现以下错误:
CUDA error at c:\users\xxx\main.cpp:265 code=13(cudaErrorInvalidSymbol) "cudaMemcpyToSymbol ( K1d_inv, K1_inv, sizeof(float)*9)"
从其他类似问题的答案中,我试过了:
checkCudaErrors( cudaMemcpyToSymbol ( "K1_d", K1_h, sizeof(float)*9, 0, cudaMemcpyHostToDevice));
但总是一样的错误。 如何将1D阵列从主机复制到常量内存?