Question

我正在尝试编译一个基本的CUDA矩阵乘法程序，但我遇到了这个错误：

nvcc    -I. -I/usr/local/cuda/include -c matrixMult1.cu -o matrixMult1.o
make: nvcc: Command not found
make: *** [matrixMult1.o] Error 127

我最初收到了另一个错误，建议我使用nvcc，唯一的问题是我对nvcc一无所知。有人有想法吗？提前谢谢！

Makefile：

# Compiler/linker driver used by every rule below. Despite the variable
# name this is NVIDIA's nvcc; it is invoked by bare name, so it has to be
# discoverable through PATH.
GCC = nvcc
# Root directory of the CUDA toolkit installation.
CUDA_INSTALL_PATH := /usr/local/cuda
# Header search path: the current directory plus the toolkit headers.
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
# Linker flags for the CUDA runtime library (libcudart).
# NOTE(review): 64-bit installs often keep the libraries in lib64 - confirm.
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart

# Compile the CUDA source into an object file. This is the first rule in
# the file, so a bare `make` builds only this object file.
matrixMult1.o:      matrixMult1.cu
            $(GCC)  $(INCLUDES) -c matrixMult1.cu -o $@ 

# Link the object file into the matrixMult1 executable.
matrixMult1:        matrixMult1.o
        $(GCC)  -o $@ matrixMult1.o $(CUDA_LIBS)

# Remove object files and editor backup files.
clean:
        $(RM)   *.o *~

内核：

//********************************************************************
// matrixMul_kernel.cu
//
// Kernel for a basic matrix multiplication program.
//********************************************************************

#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_

#include <stdio.h>

/* Thread block size: the host code launches BLOCK_SIZE x BLOCK_SIZE
   threads per block */
#define BLOCK_SIZE 3

/*
 * Matrix dimensions in elements; W* is used as the row width (number of
 * columns) and, presumably, H* is the height (number of rows).
 * HB is tied to WA because the product A * B walks a row of A against a
 * column of B; C takes its width from B and its height from A.
 */
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA

/*
 * CUDA Kernel: computes C = A * B for row-major matrices stored in
 * device memory.
 *
 *   C  - output matrix, wB elements per row
 *   A  - left operand, wA elements per row
 *   B  - right operand, wA rows of wB elements each
 *   wA - width of A (and therefore the height of B)
 *   wB - width of B (and therefore the width of C)
 *
 * Launch layout: a 2D grid of 2D blocks, one thread per element of C.
 * The x dimension selects the column and the y dimension selects the row.
 * Threads whose column falls outside C exit early. The height of A is not
 * passed in, so the launch must not create threads whose row index is
 * beyond the last row of A.
 */
__global__ void matrixMul (float * C, float * A, float * B, int wA,
               int wB) {

    /* Global column and row of the element of C owned by this thread */
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    /* Threads past the right edge of C have nothing to compute */
    if (col >= wB) {
        return;
    }

    /* Computation holder variable */
    float value = 0.0f;

    /* Loop through row of A and column of B to compute cell of C */
    for (int i = 0; i < wA; ++i) {
        value += A[row * wA + i] * B[i * wB + col];
    }

    /* Write the result to C; C has wB columns, so its row stride is wB */
    C[row * wB + col] = value;
}

#endif

主程序：

//********************************************************************
// matrixMult1.cu
//
// A basic matrix multiplication program.
//********************************************************************

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMul_kernel.cu>

/*
 * Matrix dimensions in elements. These repeat, token for token, the
 * definitions already provided by the included matrixMul_kernel.cu.
 * HB is tied to WA so that the product A * B is defined; C takes its
 * width from B and its height from A.
 */
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA

/* Fills a matrix with random floats; defined after main. */
void initMatrix(float * matrix, int numIndices);

//--------------------------------------------------------------------
// checkCuda - Terminates the program with a diagnostic when a CUDA
//             runtime call did not succeed.
//
// PRE:  status is the value returned by a CUDA runtime call; what is
//       a short description of that call for the error message.
// POST: Returns normally only if status is cudaSuccess; otherwise the
//       error is written to stderr and the process exits.
//--------------------------------------------------------------------
static void checkCuda(cudaError_t status, const char * what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

//--------------------------------------------------------------------
// printMatrix - Prints a row-major matrix, one row per output line.
//
// PRE:  matrix holds numIndices floats; width is the number of
//       elements per row and is non-zero.
// POST: The title and every element have been written to stdout.
//--------------------------------------------------------------------
static void printMatrix(const char * title, const float * matrix,
                        unsigned int numIndices, unsigned int width) {
    printf("\n%s\n", title);
    for (unsigned int i = 0; i < numIndices; i++) {
        printf("%f ", matrix[i]);

        /* End the line after the last element of each row */
        if (((i + 1) % width) == 0) {
            printf("\n");
        } else {
            printf(" | ");
        }
    }
}

//*************
// Main Program
//*************
int main(int argc, char** argv) {

    /* Set random seed */
    srand(2013);

    /* Compute memory sizes for matrices A, B, and C */
    unsigned int sizeA = WA * HA;
    unsigned int sizeB = WB * HB;
    unsigned int sizeC = WC * HC;
    unsigned int memoryA = sizeof(float) * sizeA;
    unsigned int memoryB = sizeof(float) * sizeB;
    unsigned int memoryC = sizeof(float) * sizeC;

    /* Allocate memory for matrices A, B, and C */
    float * matrixA = (float *) malloc(memoryA);
    float * matrixB = (float *) malloc(memoryB);
    float * matrixC = (float *) malloc(memoryC);
    if (matrixA == NULL || matrixB == NULL || matrixC == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(matrixA);
        free(matrixB);
        free(matrixC);
        return EXIT_FAILURE;
    }

    /* Initialize matrices A and B */
    initMatrix(matrixA, sizeA);
    initMatrix(matrixB, sizeB);

    /* Print the inputs; each matrix wraps on its own width */
    printMatrix("Matrix A:", matrixA, sizeA, WA);
    printMatrix("Matrix B:", matrixB, sizeB, WB);

    /* Allocate device memory */
    float* deviceMemA;
    float* deviceMemB;
    float* deviceMemC;
    checkCuda(cudaMalloc((void**) &deviceMemA, memoryA), "cudaMalloc A");
    checkCuda(cudaMalloc((void**) &deviceMemB, memoryB), "cudaMalloc B");
    checkCuda(cudaMalloc((void**) &deviceMemC, memoryC), "cudaMalloc C");

    /* Copy host memory to device */
    checkCuda(cudaMemcpy(deviceMemA, matrixA, memoryA,
                         cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(deviceMemB, matrixB, memoryB,
                         cudaMemcpyHostToDevice), "copy B to device");

    /* One thread per element of C; the integer division assumes WC and
       HC are multiples of BLOCK_SIZE */
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(WC / threads.x, HC / threads.y);

    /* Execute kernel */
    matrixMul<<< grid, threads >>>(deviceMemC, deviceMemA,
                   deviceMemB, WA, WB);

    /* A launch returns no status; query it explicitly */
    checkCuda(cudaGetLastError(), "matrixMul launch");

    /* Copy the result back from the device to the host. This blocking
       copy also waits for the kernel and reports any execution error */
    checkCuda(cudaMemcpy(matrixC, deviceMemC, memoryC,
                         cudaMemcpyDeviceToHost), "copy C to host");

    /* Print matrix C */
    printMatrix("Matrix C:", matrixC, sizeC, WC);
    printf("\n");

    /* Free up memory */
    free(matrixA);
    free(matrixB);
    free(matrixC);
    checkCuda(cudaFree(deviceMemA), "cudaFree A");
    checkCuda(cudaFree(deviceMemB), "cudaFree B");
    checkCuda(cudaFree(deviceMemC), "cudaFree C");

    return 0;
}

//--------------------------------------------------------------------
// initMatrix - Fills a float buffer with pseudo-random values.
//
// PRE:  matrix points to storage for at least numIndices floats;
//       numIndices is the number of elements to fill.
// POST: Each of the first numIndices elements holds
//       rand() / (float)RAND_MAX, i.e. a value between 0 and 1.
//       Nothing is written when numIndices is zero or negative.
//--------------------------------------------------------------------
void initMatrix(float * matrix, int numIndices) {

    float * cursor = matrix;
    int remaining = numIndices;

    /* Walk the buffer front to back, drawing one random number per slot */
    while (remaining > 0) {
        *cursor = rand() / (float)RAND_MAX;
        ++cursor;
        --remaining;
    }
}

Answer 1

此错误：

nvcc: Command not found

表示 nvcc 不在你的 PATH 环境变量中。

要修复它，假设你使用的是 bash 或类似的 shell：

PATH=$PATH:/usr/local/cuda/bin
make

...或者将这条 PATH 设置添加到系统或用户的 shell 配置文件（profile）中。

CUDA Makefile nvcc错误

1 个答案: