我写了一些小的cuda代码来了解共享内存传输事务的全局内存。代码如下:
#include <iostream>
using namespace std;
__global__ void readUChar4(uchar4* c, uchar4* o){
extern __shared__ uchar4 gc[];
int tid = threadIdx.x;
gc[tid] = c[tid];
o[tid] = gc[tid];
}
int main(){
string a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
uchar4* c;
cudaError_t e1 = cudaMalloc((void**)&c, 128*sizeof(uchar4));
if(e1==cudaSuccess){
uchar4* o;
cudaError_t e11 = cudaMalloc((void**)&o, 128*sizeof(uchar4));
if(e11 == cudaSuccess){
cudaError_t e2 = cudaMemcpy(c, a.c_str(), 128*sizeof(uchar4), cudaMemcpyHostToDevice);
if(e2 == cudaSuccess){
readUChar4<<<1,128, 128*sizeof(uchar4)>>>(c, o);
uchar4* oFromGPU = (uchar4*)malloc(128*sizeof(uchar4));
cudaError_t e22 = cudaMemcpy(oFromGPU, o, 128*sizeof(uchar4), cudaMemcpyDeviceToHost);
if(e22 == cudaSuccess){
for(int i =0; i < 128; i++){
cout << oFromGPU[i].x << " ";
cout << oFromGPU[i].y << " ";
cout << oFromGPU[i].z << " ";
cout << oFromGPU[i].w << " " << endl;
}
}
else{
cout << "Failed to copy from GPU" << endl;
}
}
else{
cout << "Failed to copy" << endl;
}
}
else{
cout << "Failed to allocate output memory" << endl;
}
}
else{
cout << "Failed to allocate memory" << endl;
}
return 0;
}
此代码只是将数据从设备内存复制到共享内存并返回到设备内存。我有以下三个问题:
答案 0 :(得分:2)
在您提供的代码中(此处重复),编译器将完全删除共享内存并加载,因为它们不会对代码执行任何必要或有益的操作。
__global__ void readUChar4(uchar4* c, uchar4* o){
extern __shared__ uchar4 gc[];
int tid = threadIdx.x;
gc[tid] = c[tid];
o[tid] = gc[tid];
}
假设您对共享内存做了一些事情,因此没有消除,那么:
uchar4
= 4 * 8位) (每个warp总共128个字节)。 cudaMalloc
连续分配内存。