为了练习,我正在使用CUDA编写一个简单的矩阵初始化程序。我编写了一个顺序版本作为参考。它只是创建一个n×m数组,并用双精度浮点数填充。我一直在阅读其他文章和文档,但是我很困惑,希望有人可以向我解释如何在CUDA中初始化类似下面这种n乘m大小的2D数组。如果有人愿意解释如何填充该CUDA矩阵,我也将不胜感激。
您好,关于它可能是重复的,我应该详细说明。链接的帖子并没有真正解释任何内容,它只是示例代码,它是我以前查看过的帖子之一,但由于没有解释而无法理解。谢谢。
顺序版本:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>
/* Matrix dimensions and the running fill value, shared with main(). */
int n, m, i, j;   /* i, j kept declared for source compatibility; update() no longer uses them */
double count;     /* last value written; next cell gets count + 1 */

/*
 * Fill the n-by-m matrix row by row with sequentially increasing values,
 * continuing from the global `count` (so a fresh run yields 1, 2, ..., n*m).
 *
 * n, m : matrix dimensions
 * arr  : C99 variable-length-array parameter; arr[r][c] is row r, column c
 *
 * Fix: loop indices are now locals instead of the globals i, j —
 * global loop counters are non-reentrant and error-prone.
 */
void update(int n, int m, double arr[][m]){
    for (int r = 0; r < n; r++) {
        for (int c = 0; c < m; c++) {
            count++;
            arr[r][c] = count;
        }
    }
}
/*
 * Usage: prog <m> <n> — note the original argument order is preserved:
 * argv[1] is m (columns), argv[2] is n (rows).
 */
int main(int argc, char * argv[]) {
    assert(argc==3);
    n = atoi(argv[2]);
    m = atoi(argv[1]);

    /* One contiguous allocation holding the entire n-by-m matrix. */
    double (*arr)[n][m] = malloc(sizeof *arr);
    if (arr == NULL) {
        fprintf(stderr, "allocation of %d x %d matrix failed\n", n, m);
        return EXIT_FAILURE;
    }

    /* BUG FIX: pass *arr, which decays to double (*)[m] as update() expects.
       The original passed arr itself — type double (*)[n][m] — which is a
       constraint violation (incompatible pointer type). */
    update(n, m, *arr);

    free(arr);   /* BUG FIX: the original leaked the allocation */
    return 0;
}
答案 0(得分:1)
您可以在1D中模拟2D数组,并逐行保留数据。这样的二维数组: [a,b] [c,d]变为[a,b,c,d]。为了使事情简单,您可以编写提供此类功能的包装器类。
这是该想法的演示(不是100%健壮可靠,但可以正常工作)
#pragma once
#include <cstdlib>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
typedef int TYPE;
// Minimal owning 2D matrix stored row-major in one contiguous 1D buffer:
// element (r, c) lives at data[r * cols + c], so the whole matrix can be
// handed to cudaMemcpy as a single flat array.
template<class T>
struct Matrix
{
    // Allocate uninitialized rows*cols storage.
    Matrix(int r, int c) : rows(r), cols(c) {
        data = new T[r*c];
    }

    // Owning a raw new[] buffer means a shallow copy would double-free on
    // destruction, so copying is forbidden (resolves the original
    // "copy constructor and = operator need to be overloaded" caveat).
    Matrix(const Matrix&) = delete;
    Matrix& operator=(const Matrix&) = delete;

    ~Matrix() {
        // As we allocated memory it needs to be freed upon destruction
        delete[] data;
        data = nullptr;
    }
    int rows, cols;
    T* data;

    // Returns a pointer to the start of `row`; a further [] on the result
    // retrieves the item at that column, enabling m[r][c] syntax.
    T* operator[](int row) {
        return data + (row*cols);
    }
};
// Element-wise matrix addition: c = a + b for row-major rows x cols buffers.
// Expects a 2D launch where the y dimension spans rows and the x dimension
// spans columns; the grid/block may be oversized — the guard below handles it.
// a and b are read-only, so they are marked const __restrict__ to allow the
// compiler to use the read-only data cache and reorder loads.
__global__ void add(const TYPE* __restrict__ a, const TYPE* __restrict__ b,
                    TYPE* __restrict__ c, int rows, int cols) {
    // Get element row and col from the 2D thread coordinates.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // If kernel block/grid is not sized perfectly make sure not to step
    // outside data bounds.
    if(row < rows && col < cols)
    {
        int idx = row*cols + col;   // row-major flattening
        c[idx] = a[idx] + b[idx];
    }
}
int main() {
    // m3 = m1 + m2 using CUDA.
    // Every CUDA API call returns cudaError_t; an unchecked failure is silent
    // and poisons every later call, so abort loudly on the first error.
    auto check = [](cudaError_t err) {
        if (err != cudaSuccess) {
            std::cerr << "CUDA error: " << cudaGetErrorString(err) << "\n";
            std::exit(EXIT_FAILURE);
        }
    };

    int rows = 5, cols = 5, total = rows * cols;
    Matrix<TYPE> m1{ rows,cols }, m2{ rows,cols }, m3{ rows,cols };

    // Initialization through the flat 1D buffer...
    for(int i = 0; i < total; i++) {
        m1.data[i] = i;
    }
    // ...or through the 2D [row][col] wrapper — both address the same storage.
    for(int r = 0; r < rows; r++)
        for(int c = 0; c < cols; c++)
            m2[r][c] = r*cols + c + 100;

    for(int i = 0; i < total; i++) std::cout << m1.data[i] << ", ";
    std::cout << "\n";
    for(int r = 0; r < rows; r++) {
        for(int c = 0; c < cols; c++)
            std::cout << m2[r][c] << ", ";
        std::cout << "\n";
    }

    // CUDA part: device buffers mirroring the host matrices.
    TYPE *d_m1, *d_m2, *d_m3;
    check(cudaMalloc((void **) &d_m1, total * sizeof(TYPE)));
    check(cudaMalloc((void **) &d_m2, total * sizeof(TYPE)));
    check(cudaMalloc((void **) &d_m3, total * sizeof(TYPE)));

    // Copy m1 and m2 to the GPU.
    check(cudaMemcpy(d_m1, m1.data, total * sizeof(TYPE), cudaMemcpyHostToDevice));
    check(cudaMemcpy(d_m2, m2.data, total * sizeof(TYPE), cudaMemcpyHostToDevice));

    // Oversized on purpose to show the row/col guard in the add kernel.
    dim3 grid(5, 5);
    dim3 block(5, 5);
    add <<< grid, block >>> (d_m1, d_m2, d_m3, rows, cols);
    check(cudaGetLastError());   // launch errors (bad config) surface here

    // Copy result to m3; a blocking D2H memcpy also waits for the kernel.
    check(cudaMemcpy(m3.data, d_m3, total * sizeof(TYPE), cudaMemcpyDeviceToHost));

    check(cudaFree(d_m1));
    check(cudaFree(d_m2));
    check(cudaFree(d_m3));

    for(int r = 0; r < rows; r++) {
        for(int c = 0; c < cols; c++)
            std::cout << m3[r][c] << ", ";
        std::cout << "\n";
    }
    system("pause");   // NOTE: Windows-only convenience; no-op elsewhere
    return 0;
}