#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
// Fills the first `size` entries of `ip` with their own position: 0, 1, 2, ...
// Does nothing when `size` is zero or negative.
void initialint(int *ip,int size)
{
    int *cursor = ip;
    int value = 0;
    while (value < size) {
        *cursor++ = value;
        ++value;
    }
}
// Prints a matrix stored in row-major order to stdout.
//   C  - pointer to nx*ny ints; element (column x, row y) is C[x + nx*y]
//   nx - number of columns (elements per printed row)
//   ny - number of rows
// A header line with the dimensions is printed first, then one text line per
// row, then a trailing blank line.
void printmatrix(int *C,const int nx,const int ny)
{
    printf("\n Matrix: (%d.%d) \n",nx,ny);
    for (int row = 0; row < ny; ++row) {
        const int base = nx * row;   // offset of the first element of this row
        for (int col = 0; col < nx; ++col) {
            printf("%3d", C[base + col]);
        }
        printf("\n");
    }
    printf("\n");
}
// Debug kernel: each thread of a 2D launch prints its thread index, block
// index, the matrix coordinate (ix, iy) it maps to, the flattened row-major
// offset idx = ix + iy*nx, and the value stored in A at that offset.
//   A  - device pointer to at least nx*ny ints, laid out row-major
//   nx - matrix width  (number of columns)
//   ny - matrix height (number of rows)
// Expected launch: 2D grid of 2D blocks covering at least nx columns and ny
// rows. Threads that fall outside the nx-by-ny matrix return without touching
// A, so the grid does not have to divide the matrix evenly.
__global__ void printthreadindex(int *A,const int nx,const int ny)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard against over-covering grids: without this, surplus threads would
    // read A out of bounds.
    if (ix >= nx || iy >= ny) {
        return;
    }

    // Row-major flattening: rows are nx elements long, so row iy starts at iy*nx.
    const int idx = ix + iy * nx;
    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index %2d ival %2d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]);
}
// Prints a diagnostic and terminates the program if a CUDA runtime call
// returned anything other than cudaSuccess. `what` names the failed step.
static void checkCuda(cudaError_t status,const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Builds an 8x6 int matrix on the host, prints it, copies it to the device and
// launches printthreadindex so every thread reports which element it maps to.
int main()
{
    const int nx=8,ny=6;
    const int nxy=nx*ny;
    // The buffer holds ints, so size it with sizeof(int) rather than sizeof(float).
    const size_t nBytes=nxy*sizeof(int);

    int *h_A=(int *)malloc(nBytes);
    if(h_A==NULL){
        fprintf(stderr,"host allocation of %lu bytes failed\n",(unsigned long)nBytes);
        return EXIT_FAILURE;
    }
    initialint(h_A,nxy);
    printmatrix(h_A,nx,ny);

    int *d_MatA=NULL;
    checkCuda(cudaMalloc((void **)&d_MatA,nBytes),"cudaMalloc");
    checkCuda(cudaMemcpy(d_MatA,h_A,nBytes,cudaMemcpyHostToDevice),"cudaMemcpy (host to device)");

    dim3 block(4,2);
    // Ceil-divide so the grid always covers the whole matrix; for 8x6 with a
    // 4x2 block this yields the same (2,3) grid as before.
    dim3 grid((nx+block.x-1)/block.x,(ny+block.y-1)/block.y);
    printthreadindex <<<grid,block>>> (d_MatA,nx,ny);
    // Launch-configuration errors are only visible through cudaGetLastError.
    checkCuda(cudaGetLastError(),"kernel launch");
    // Wait for the kernel so that execution errors are reported and the
    // device-side printf buffer is flushed before the memory is released.
    checkCuda(cudaDeviceSynchronize(),"kernel execution");

    checkCuda(cudaFree(d_MatA),"cudaFree");
    free(h_A);
    system("pause");
    return 0;
}
输出:
Matrix: (8.6)
0 1 2 3 4 5 6 7
8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47
thread_id (0,0) block_id (1,0) coordinate (4,0) global index 4 ival 4
thread_id (1,0) block_id (1,0) coordinate (5,0) global index 5 ival 5
thread_id (2,0) block_id (1,0) coordinate (6,0) global index 6 ival 6
thread_id (3,0) block_id (1,0) coordinate (7,0) global index 7 ival 7
thread_id (0,1) block_id (1,0) coordinate (4,1) global index 12 ival 12
thread_id (1,1) block_id (1,0) coordinate (5,1) global index 13 ival 13
thread_id (2,1) block_id (1,0) coordinate (6,1) global index 14 ival 14
thread_id (3,1) block_id (1,0) coordinate (7,1) global index 15 ival 15
thread_id (0,0) block_id (1,1) coordinate (4,2) global index 20 ival 20
thread_id (1,0) block_id (1,1) coordinate (5,2) global index 21 ival 21
thread_id (2,0) block_id (1,1) coordinate (6,2) global index 22 ival 22
thread_id (3,0) block_id (1,1) coordinate (7,2) global index 23 ival 23
thread_id (0,1) block_id (1,1) coordinate (4,3) global index 28 ival 28
thread_id (1,1) block_id (1,1) coordinate (5,3) global index 29 ival 29
thread_id (2,1) block_id (1,1) coordinate (6,3) global index 30 ival 30
thread_id (3,1) block_id (1,1) coordinate (7,3) global index 31 ival 31
thread_id (0,0) block_id (0,2) coordinate (0,4) global index 32 ival 32
thread_id (1,0) block_id (0,2) coordinate (1,4) global index 33 ival 33
thread_id (2,0) block_id (0,2) coordinate (2,4) global index 34 ival 34
thread_id (3,0) block_id (0,2) coordinate (3,4) global index 35 ival 35
thread_id (0,1) block_id (0,2) coordinate (0,5) global index 40 ival 40
thread_id (1,1) block_id (0,2) coordinate (1,5) global index 41 ival 41
thread_id (2,1) block_id (0,2) coordinate (2,5) global index 42 ival 42
thread_id (3,1) block_id (0,2) coordinate (3,5) global index 43 ival 43
thread_id (0,0) block_id (1,2) coordinate (4,4) global index 36 ival 36
thread_id (1,0) block_id (1,2) coordinate (5,4) global index 37 ival 37
thread_id (2,0) block_id (1,2) coordinate (6,4) global index 38 ival 38
thread_id (3,0) block_id (1,2) coordinate (7,4) global index 39 ival 39
thread_id (0,1) block_id (1,2) coordinate (4,5) global index 44 ival 44
thread_id (1,1) block_id (1,2) coordinate (5,5) global index 45 ival 45
thread_id (2,1) block_id (1,2) coordinate (6,5) global index 46 ival 46
thread_id (3,1) block_id (1,2) coordinate (7,5) global index 47 ival 47
thread_id (0,0) block_id (0,1) coordinate (0,2) global index 16 ival 16
thread_id (1,0) block_id (0,1) coordinate (1,2) global index 17 ival 17
thread_id (2,0) block_id (0,1) coordinate (2,2) global index 18 ival 18
thread_id (3,0) block_id (0,1) coordinate (3,2) global index 19 ival 19
thread_id (0,1) block_id (0,1) coordinate (0,3) global index 24 ival 24
thread_id (1,1) block_id (0,1) coordinate (1,3) global index 25 ival 25
thread_id (2,1) block_id (0,1) coordinate (2,3) global index 26 ival 26
thread_id (3,1) block_id (0,1) coordinate (3,3) global index 27 ival 27
thread_id (0,0) block_id (0,0) coordinate (0,0) global index 0 ival 0
thread_id (1,0) block_id (0,0) coordinate (1,0) global index 1 ival 1
thread_id (2,0) block_id (0,0) coordinate (2,0) global index 2 ival 2
thread_id (3,0) block_id (0,0) coordinate (3,0) global index 3 ival 3
thread_id (0,1) block_id (0,0) coordinate (0,1) global index 8 ival 8
thread_id (1,1) block_id (0,0) coordinate (1,1) global index 9 ival 9
thread_id (2,1) block_id (0,0) coordinate (2,1) global index 10 ival 10
thread_id (3,1) block_id (0,0) coordinate (3,1) global index 11 ival 11
嗨,上面的代码是CUDA书中的一个例子,它试图解释如何将2D数组映射到CUDA网格和块,并在每个线程的全局内存中打印矩阵坐标和偏移量。
我对线程的确切映射方式有点困惑,特别是语句 "idx = ix + iy * nx"。我试图交换 nx、ny 的索引值,并将此语句改为 "idx = iy + ix * ny",但这似乎不起作用。
目前矩阵元素到线程块的映射是:
Block(0,0)-0,1,2,3,8,9,10,11 Block(1,0)-4,5,6,7,12,13,14,15 .....
如果我想要下面这样的映射:
Block(0,0)-0,1,2,3,4,5,6,7 Block(0,1)-8,9,10,11,12,13,14,15 ....
应该如何修改参数并启动内核?
P.S-我在Windows 8.1上使用带有VS 2012的GTX 860M的i7处理器。
感谢。
答案 0 :(得分:0)
$('#SpaceAccommodation').change(function () {
var selectedText = $(this).text();
});
idx=ix+iy*nx
nx=8, ny=6.
例如坐标为(5,2)的线程,ix=5, iy=2,所以:
ival=5+2*8=21
为了使用不同的参数启动内核,您应该更改 dim3 block(a,b) 和 dim3 grid(c,d) 指令。
例如,要实现您的示例(每个块覆盖矩阵的一整行),您应该使用:
dim3 block(8,1);
dim3 grid(1,6);