#include <stdio.h>
#include <vector>
using namespace std;
#define THETA 10
// Error checking.
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
if (code != cudaSuccess)
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
// Pass 2-dim array to GPU and change it there.
void addArrays(double *twoDimArray, size_t pitch)
printf("\n\nOn GPU array : thread : %d\n", threadIdx.x);
int tidx = blockIdx.x * blockDim.x + threadIdx.x; //// tidx = Columns in CUDA
int tidy = blockIdx.y * blockDim.y + threadIdx.y; //// tidy = Rows In cuda
if ((tidx < THETA) && (tidy < THETA))
double tempval = 0;
for(int i=0; i < THETA ; i++)
tempval = 250;
twoDimArray[tidy * THETA + tidx]=tempval;
int main()
// 2-Dimensional Array
printf("\n*******************\n2-DIMENSIONAL ARRAY\n*******************\n\n");
// Create 2-dim array on the CPU.
double arrayOnCpu[THETA][THETA];
double arrayOnCpu2[THETA][THETA];
// Initialise the vector of vector on the CPU.
for (int i = 0; i < THETA; i++) // Aantal buitenste vectoren.
for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
arrayOnCpu2[i][j] = j;
// Print the vector of vectors.
for (int i = 0; i < THETA; i++)
for (int j = 0; j < THETA; j++)
printf("%2.2f\t", arrayOnCpu2[i][j]);
// Create corresponding double array on the GPU.
double *pToArrayOnGpu;
size_t pitch;
gpuErrorCheck( cudaMallocPitch((void **)&pToArrayOnGpu, &pitch, THETA * sizeof(double), THETA) );
// Copy CPU data to vector on GPU.
gpuErrorCheck( cudaMemcpy2D(pToArrayOnGpu, pitch, arrayOnCpu2, pitch, THETA * sizeof(double), THETA, cudaMemcpyHostToDevice) );
// Launch GPU code with THETA threads, one per vector element.
addArrays<<<1, THETA>>>(pToArrayOnGpu, pitch);
gpuErrorCheck( cudaDeviceSynchronize() );
// Copy array from GPU back to CPU.
gpuErrorCheck( cudaMemcpy2D(arrayOnCpu2, pitch, pToArrayOnGpu,pitch, THETA * sizeof(double), THETA, cudaMemcpyDeviceToHost) );
// Print the vector of vectors.
for (int i = 0; i < THETA; i++) // Aantal buitenste vectoren.
for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
printf("%2.2f\t", arrayOnCpu2[i][j]);
// Free up the array on the GPU.
gpuErrorCheck( cudaFree(pToArrayOnGpu) );
return 0;
答案 0 :(得分:1)
在主机中,一个[行] [列]存储在a+row*width_of_column*sizeof(element)+column*sizeof(element)
中。但是在cuda的全局内存访问中,从256字节对齐地址(addr = 0、256、512等)的连续访问效率最高。因此,为了提高内存访问效率,可以使用cudaMallocPitch函数。
会为每个行分配更多字节分配内存以确保width_of_column*sizeof(element)+extra allocated bytes
__host__ cudaError_t cudaMemcpy2D ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind )
gpuErrorCheck(cudaMemcpy2D(pToArrayOnGpu, pitch, arrayOnCpu2, THETA * sizeof(double), THETA * sizeof(double), THETA, cudaMemcpyHostToDevice));
返回,而最后一个音高({pitch)是THETA * sizeof(double)
env: Ubuntu 16.04 Tesla P100
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdio.h>
#include <vector>
using namespace std;
#define THETA 10
// Error checking.
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
if (code != cudaSuccess)
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
// Pass 2-dim array to GPU and change it there.
void addArrays(double *twoDimArray, size_t pitch){
int tidx = threadIdx.x;
if ((tidx < THETA) /*&& (tidy < THETA)*/){
double tempval = 250;
for (int i = 0; i < THETA; i++){
double* row = (double*)((char*)twoDimArray + i * pitch);
row[tidx] *= tempval;
int main(){
double arrayOnCpu2[THETA][THETA];
// Initialise the vector of vector on the CPU.
for (int i = 0; i < THETA; i++){ // Aantal buitenste vectoren.
for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
arrayOnCpu2[i][j] = j;
// Print the vector of vectors.
for (int i = 0; i < THETA; i++){
for (int j = 0; j < THETA; j++)
printf("%2.2f\t", arrayOnCpu2[i][j]);
// Create corresponding double array on the GPU.
double *pToArrayOnGpu;
size_t pitch;
gpuErrorCheck(cudaMallocPitch((void **)&pToArrayOnGpu, &pitch, THETA * sizeof(double), THETA));
// Copy CPU data to vector on GPU.
gpuErrorCheck(cudaMemcpy2D(pToArrayOnGpu, pitch, arrayOnCpu2, THETA * sizeof(double), THETA * sizeof(double), THETA, cudaMemcpyHostToDevice));
// Launch GPU code with THETA threads, one per vector element.
addArrays << <1, THETA >> >(pToArrayOnGpu, pitch);
// Copy array from GPU back to CPU.
gpuErrorCheck(cudaMemcpy2D(arrayOnCpu2, THETA * sizeof(double), pToArrayOnGpu, pitch, THETA * sizeof(double), THETA, cudaMemcpyDeviceToHost));
// Print the vector of vectors.
for (int i = 0; i < THETA; i++){ // Aantal buitenste vectoren.
for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
printf("%2.2f\t", arrayOnCpu2[i][j]);
// Free up the array on the GPU.
return 0;
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
0.00 250.00 500.00 750.00 1000.00 1250.00 1500.00 1750.00 2000.00 2250.00
答案 1 :(得分:0)
除此之外,关于不同音高应该意味着什么似乎有些困惑。通常,数组的音高是从一行的开始到下一行的开始必须跳过的字节数。驱动程序可以选择/需要在2D阵列的行之间添加填充,以使分配满足硬件的对齐要求和/或允许更优化的内存访问。这意味着,对于GPU上的2D阵列,间距可能会大于element size * width
但是,对于您的CPU阵列,间距仅为THETA * sizeof(double)
的调用中,您将GPU阵列的间距作为GPU和CPU的间距传递。为您的CPU阵列使用正确的间距。在设备代码中,您可以通过twoDimArray[tidy * THETA + tidx]
访问GPU阵列。在这里,您实际上是使用THETA * sizeof(double)
double* my_row = reinterpret_cast<double*>(reinterpret_cast<char*>(twoDimArray) + tidy * pitch);
my_row[tidx] = tempval;