标签: cuda

我的程序创建3个矩阵A,B和C。然后将A点B的点积存储在C中。我试图比较在CPU和GPU上执行此操作所需的时间。我编写了两个函数CPUDot和GPUDot。 CPUDot计算该函数中的点积,GPUDot调用内核函数。



所有矩阵的大小均为m x n。

我不确定这是否是由于我特定的GPU的内存大小所致。 我使用的是GTX 965m,我很确定它具有1024个CUDA内核和2gb VRam。

#include <stdio.h>

#define MAX_THREADS 1024
#define MAX_BLOCKS 65535

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
   if (code != cudaSuccess)
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);

__global__ void Dot(double *A, double *B, double *C, int m, int n, int h){

    int fi = blockIdx.x;
    int fm = gridDim.x;
    int si = threadIdx.x;
    int sm = blockDim.x;

    int index = fi * sm + si;
    int stride = fm * sm;
    //printf("Index: %d\nStride: %d\n", index, stride);
    for(int i = index; i < m*n; i += stride){
        // Pretend this is a two dimensional array. Get [x][y] parameters for C
        int x = i % n;
        int y = i / n;
        double counter = 0;
        for(int j = 0; j < h; ++j){
            // Pretending this is a t dimensional array get [x][y] parameters for A and B
            int ax = j;
            int ay = y;

            int bx = x;
            int by = j;

            // Convert [x][y] -> [i] since data is stored as single dimensional array for A and B
            int ai = ay * h + ax;
            int bi = by * n + bx;
            //printf("ai, bi, ci, %d, %d\n", ai, bi);
            double a = A[ai];
            double b = B[bi];
            counter += a * b;
        C[i] = counter;

class Matrix{
    int width;
    int height;
    double *elements;
    Matrix(int m, int n){
        this->width = n;
        this->height = m;
        gpuErrchk(cudaMallocManaged(&this->elements, m * n * sizeof(double)));
        for(int i = 0; i < m*n; ++i){
            this->elements[i] = i;
    void populate(double value){
        for(int i = 0; i < this->width * this->height; ++i){
            this->elements[i] = value;
    void print(){
        int i = 0;
        for(int y = 0; y < this->height; ++y){
            for(int x = 0; x < this->width; ++x){
                printf("%lf ", this->elements[i]);

void GPUDot(Matrix &A, Matrix &B, Matrix &C){
    double *aData = A.elements;
    double *bData = B.elements;
    double *cData = C.elements;
    int threads = A.height;
    if(threads > MAX_THREADS){
        threads = MAX_THREADS;
    int blocks = A.width;
    if(blocks > MAX_BLOCKS){
        blocks = MAX_BLOCKS;
    Dot<<<1, 1>>>(aData, bData, cData, A.height, B.width, A.width);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

void CPUDot(const Matrix &A, const Matrix &B, Matrix &C){
    // Rectangle matricies are stored as single dimension array
    // Pretending it is stored as a two dimensional array x, y would be the values array[y][x]
    for(int y = 0; y < C.height; ++y){
        for(int x = 0; x < C.width; ++x){
            // Calculate the actual index given two dimensional parameters
            int ci = y * C.width + x;
            // Get new x, y parameters for the A matrix and B matrix
            int ay = y;
            int bx = x;
            C.elements[ci] = 0;
            for(int c = 0; c < A.width; ++c){
                int ax = c;
                int by = c;
                // Calculate the actual index for A and B given x, y
                int ai = ay * A.height + ax;
                int bi = by * B.width + bx;
                C.elements[ci] += A.elements[ai] * B.elements[bi];

void subtract(const Matrix &A, const Matrix &B, Matrix &C){
    for(int i = 0; i < A.height * A.width; ++i){
        C.elements[i] = A.elements[i] - B.elements[i];

double sum(const Matrix &A){
    double sum = 0;
    for(int i = 0; i < A.height * A.width; ++i){
        sum += A.elements[i];
    return sum;

int main(){

    int size = 100;
    printf("Dotting two matricies of size: %d\n", size);
    Matrix A(size, size);
    Matrix B(size, size);
    Matrix C(size, size);
    Matrix D(size, size);
    Matrix diff(size, size);
    time_t t;


    t = clock();
    CPUDot(A, B, C);
    t = clock() - t;
    printf("CPU Dotting took %lf ms\n", ((float)t / CLOCKS_PER_SEC) * 1000);


    t = clock();
    GPUDot(A, B, D);
    t = clock() - t;
    printf("GPU Dotting took %lf ms\n", ((float)t / CLOCKS_PER_SEC) * 1000);


    subtract(C, D, diff);
    printf("Subtracted the matricies\n");
    double error = sum(diff);

    printf("Error: %lf\n", error);

    return 0;

我知道这不是解决此问题的最佳方法-我应该利用Dot <<< 1,threads >>>(参数)中的第一个参数;但是我只是想看看它是否有效,我也不知道为什么不行。




GPUassert:未指定的启动失败 98

我还在another post上读到,“未指定的启动失败”几乎总是一个段错误,但是我假设如果我遇到索引问题,那么它不仅会在矩阵大小为太大了。我也将线程更改为1,所以方括号内的数字始终始终为<<< 1、1 >>>,因此索引和跨度分别始终为0、1

