我在CUDA中遇到了一些内存访问问题。 我的代码的核心是
long long addr0,addr1;
addr0=(long long)my_array;
addr1 = ( addr0 ^ (1 << position));
long long *r_addr0, *r_addr1;
r_addr0 = (long long *)addr0;
r_addr1 = (long long *)addr1;
i = *r_addr0;
j = *r_addr1;
其中 my_array 是设备数组的地址。
我将 my_array 的地址存储在 r_addr0 中,然后我逐个翻转 r_addr0 的位。 e.g。
0000 0000 1011 0000 0011 1111 1110 0000 0000 0000 0000 0 addr of my_array
0000 0000 1011 0000 0011 1111 1110 0000 0000 0000 0001 1 flip last bit
0000 0000 1011 0100 0011 1111 1110 0000 0000 0000 0000 31 flip 31 bit.
我每次打印 r_addr0 和 r_addr1 的地址,它对前31位工作良好,但我在32位后遇到非法的存储器地址问题。我正在使用特斯拉K80,它有12GB内存。
有谁知道如何避免CUDA中的非法内存访问
完整代码见下文:
# include <stdio.h>
# include <stdint.h>
# include "cuda_runtime.h"
//compile nvcc test.cu -o test
__global__ void global_latency (int * my_array, int position, int *d_time);
int row_bits(int * h_a, long long N, int pos, int * h_time);
int main(){
cudaSetDevice(0);
long long i, N;
int *h_a;
int h_time0;
int h_time1;
int *h_time;
N = 2*1024*1024*1024L;//2G elements, 4 bytes per element, 8 GB memory used.
printf("\n=====%10.4f GB array with %d GB elements,discover row bits====\n", sizeof(int)*(float)N/1024/1024/1024,N/1024/1024/1024);
/* allocate arrays on CPU */
h_a = (int *)malloc(sizeof(int) * N);
h_time = (int *)malloc(sizeof(int)*N);
/* initialize array elements*/
for (i=0L; i<N; i++){
h_a[i] = i%(1024*1024);
}
for (int k=0;k<2;k++){
h_time[k]=0;
}
printf("... ... ...\n... ... ...\n");
for (int pos = 0; pos < 64; pos++){
h_time0=0;
h_time1=0;
for (int j=0;j<5;j++){
row_bits(h_a,N,pos,h_time);
h_time0 +=h_time[0];
h_time1 +=h_time[1];
}
printf("position = %d, time0 = %d, time1 = %d\n", pos+1,h_time0/5, h_time1/5);
}
printf("===============================================\n\n");
free(h_a);
return 0;
}
int row_bits(int * h_a, long long N, int pos, int * h_time) {
cudaError_t error_id;
int *d_a;
/* allocate arrays on GPU */
error_id = cudaMalloc ((void **) &d_a, sizeof(int) * N);
if (error_id != cudaSuccess) {
printf("Error 1.0 is %s\n", cudaGetErrorString(error_id));
}
/* copy array elements from CPU to GPU */
error_id = cudaMemcpy(d_a, h_a, sizeof(int) * N, cudaMemcpyHostToDevice);
if (error_id != cudaSuccess) {
printf("Error 1.1 is %s\n", cudaGetErrorString(error_id));
}
//int *h_time = (int *)malloc(sizeof(int));
int *d_time;
error_id = cudaMalloc ((void **) &d_time, 4*sizeof(int));
if (error_id != cudaSuccess)
printf("Error 1.2 is %s\n", cudaGetErrorString(error_id));
cudaThreadSynchronize ();
/* launch kernel*/
dim3 Db = dim3(1);
dim3 Dg = dim3(1,1,1);
global_latency <<<Dg, Db>>>(d_a, pos,d_time);
cudaThreadSynchronize ();
error_id = cudaGetLastError();
if (error_id != cudaSuccess) {
printf("Error kernel is %s\n", cudaGetErrorString(error_id));
}
/* copy results from GPU to CPU */
cudaThreadSynchronize ();
error_id = cudaMemcpy((void *)h_time, (void *)d_time, 4*sizeof(int), cudaMemcpyDeviceToHost);
if (error_id != cudaSuccess) {
printf("Error 2.0 is %s\n", cudaGetErrorString(error_id));
}
cudaThreadSynchronize ();
/* free memory on GPU */
cudaFree(d_a);
cudaFree(d_time);
cudaDeviceReset();
return 0;
}
__global__ void global_latency (int * my_array, int position, int *d_time) {
//int tid = blockIdx.x*blockDim.x+threadIdx.x;
int start_time=0;
int mid_time=0;
int end_time=0;
__shared__ int s_tvalue[2];//2: number of threads per block
int i, j;
s_tvalue[0]=0;
s_tvalue[1]=0;
long long addr0,addr1;
//printf("%p\n",my_array);
//int * p = (int *)0x0;
//addr0 = (long long)p;
addr0=(long long)my_array;
//printf("Address i :%p\n",addr0);
addr1 = ( addr0 ^ (1 << position));
//printf("Address i':%p\n",addr1);
//start_time = clock();
long long *r_addr0, *r_addr1;
r_addr0 = (long long *)addr0;
r_addr1 = (long long *)addr1;
start_time = clock();
i = *r_addr0;
s_tvalue[0] = i;
mid_time = clock();
j = *r_addr1;
s_tvalue[1] = j;
//printf("%p",p);
//k =(int)p;
//printf("%d\n",k);
//printf("%d",k);
//__syncthreads();
end_time = clock();
d_time[0] = mid_time-start_time;
d_time[1] = end_time-mid_time;
d_time[2] = s_tvalue[0];
//printf("[%p]=%lld\n",addr0,d_time[1]);
d_time[3] = s_tvalue[1];
//printf("[%p]=%lld\n",addr1,d_time[2]);
}
答案 0 :(得分:2)
当position=0
和原始地址位0为0时,您正在尝试设置
j=*(int*)&(((char*)my_array)[1]);
打破了int
类型的4字节对齐。这会使你的程序崩溃。
当position=3
和原始地址位3为1时,您尝试设置
j=*(int*)&(((char*)my_array)[-8]);
您尝试阅读的地址在my_array
之前。这绝对是一种非法的内存访问。事实上,翻转任何原本等于1的位意味着负数组索引。
另外,您最好使用unsigned long long
或size_t
代替long long
,1ull << position
代替1 << position
,以确保您不会受到打扰由符号位和溢出问题。