Question

我很擅长使用c ++进行CUDA编程，很抱歉这个简单的问题。我根本无法弄清楚我在哪里出错了。我正在尝试进行矩阵乘法。我从几个来源找到了灵感，所以可能是我混淆了一些不同的方法。我试图将两个矩阵h_a和h_b相乘。我成功地生成了两个矩阵，但是当我为两个矩阵分配内存时，由于某种原因我失去了该矩阵中的值，即使在乘法之后，所有值都为零。以下是代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <ctime>
#include <stdio.h>
#include <iostream>
#include <math.h>

using namespace std;


__global__ void MulKernel(int *c, const int *a, const int *b, const int P)
{
    float tempsum;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    if (row < P && col < P){
        for (int i = 0; i < P; i++){
            tempsum += a[row*P + i] * b[i*P + col];
        }
    }
    c[row*P + col] = tempsum;
}


int main()
{

srand(time(NULL));
int *pointer;
int N = 16;
int SIZE = N*N;

int *h_a = new int[SIZE];
int *h_b = new int[SIZE];
int *h_c = new int[SIZE];

for (int i = 0; i < SIZE; i++) {
            h_a[i] = rand() % 1000;
            h_b[i] = rand() % 1000;
    } 
cout << "First values " << h_a[0] << " " << h_b[0] << endl;
    cudaMalloc(&h_a, sizeof(int)*SIZE);
    cudaMalloc(&h_b, sizeof(int)*SIZE);
    cudaMalloc(&h_c, sizeof(int)*SIZE);
    cudaMalloc(&pointer, sizeof(int));

    cout << "Second values " << h_a[0] << " " << h_b[0] << endl;

    cudaMemcpy(h_a, &h_a, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(h_b, &h_b, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(pointer, &N, sizeof(int), cudaMemcpyHostToDevice);

    cout << "Third values " << h_a[0] <<" "<< h_b[0] << endl;

    MulKernel <<<1, 256 >>>(h_c, h_a, h_b, N);

    cudaMemcpy(h_c, &h_c, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_a, &h_a, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_b, &h_b, sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 5; i++){
        cout << h_c[i] << "=" << h_a[i] << h_b[i] << endl;
    }
    cout << h_c[1] << endl;
    cudaFree(h_a);
    cudaFree(h_b);
    cudaFree(h_c);
    return 0;
}

终端输出显示：

First values 454 964
Second values 0 0
Third values 0 0
0=00
0=00
0=00
0=00
0=00
0
Press any key to continue . . .

我希望有人能发现错误

祝你好运

Answer 1

您的代码存在很多问题。

如果您在使用cuda代码时遇到问题，我建议proper cuda error checking以及使用cuda-memcheck运行您的代码。在这种情况下，您已经发生了实际导致seg错误的编程错误，因此这些方法没那么有用。
你的内核基本上是可行的。有3个问题。首先，您正在执行int乘法，但已将tempsum变量声明为float。这可能不是一个大问题，但与你的内核不一致。其次，您没有初始化tempsum（它应该设置为零）。第三，你的线程检查（即if - 声明）有点放错了位置。如果线程超出范围，您应该调整内核以使不写入c。
您可能对主机和设备变量感到困惑。我们不会使用new分配主机变量，然后对同一指针执行cudaMalloc操作。这不是事情的运作方式。我们需要创建一组等效的变量来在设备上存储数据。我们称之为*d_a等。我们会在cudaMalloc上调用cudaMemcpy来分配设备空间，然后我们将.x操作中的那些用作设备端变量。
您的内核需要一个2D线程数组（以便内核中的.y和dim3内置变量具有意义）。但是您使用1D变量定义线程数组。这需要在内核启动中修复（即使用d_a变量定义2D数组）。同样，内核启动应指定作为设备指针的N等变量。
在将变量传递给内核时，您可能会对如何处理pointer之类的变量感到困惑。我们可以直接（按价值）传递，而不用你创建的cudaMemcpy任何体操。
您的memcpy操作中的转移尺寸错误。与SIZE类似，您需要指定以字节为单位的传输大小，因此我们需要将大部分传输大小乘以$ cat t1073.cu #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <ctime> #include <stdio.h> #include <iostream> #include <math.h> using namespace std; __global__ void MulKernel(int *c, const int *a, const int *b, const int P) { int tempsum=0; int row = blockIdx.y*blockDim.y + threadIdx.y; int col = blockIdx.x*blockDim.x + threadIdx.x; if (row < P && col < P){ for (int i = 0; i < P; i++){ tempsum += a[row*P + i] * b[i*P + col]; } c[row*P + col] = tempsum; } } int main() { srand(time(NULL)); int N = 16; int SIZE = N*N; int *h_a = new int[SIZE]; int *h_b = new int[SIZE]; int *h_c = new int[SIZE]; for (int i = 0; i < SIZE; i++) { h_a[i] = rand() % 1000; h_b[i] = rand() % 1000; } cout << "First values " << h_a[0] << " " << h_b[0] << endl; int *d_a, *d_b, *d_c; cudaMalloc(&d_a, sizeof(int)*SIZE); cudaMalloc(&d_b, sizeof(int)*SIZE); cudaMalloc(&d_c, sizeof(int)*SIZE); cout << "Second values " << h_a[0] << " " << h_b[0] << endl; cudaMemcpy(d_a, h_a, sizeof(int)*SIZE, cudaMemcpyHostToDevice); cudaMemcpy(d_b, h_b, sizeof(int)*SIZE, cudaMemcpyHostToDevice); cout << "Third values " << h_a[0] <<" "<< h_b[0] << endl; MulKernel <<<1, dim3(N,N) >>>(d_c, d_a, d_b, N); cudaMemcpy(h_c, d_c, sizeof(int)*SIZE, cudaMemcpyDeviceToHost); cudaMemcpy(h_a, d_a, sizeof(int)*SIZE, cudaMemcpyDeviceToHost); cudaMemcpy(h_b, d_b, sizeof(int)*SIZE, cudaMemcpyDeviceToHost); for (int i = 0; i < 5; i++){ cout << h_c[i] << "=" << h_a[i] << h_b[i] << endl; } cout << h_c[1] << endl; cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); return 0; } $ nvcc -o t1073 t1073.cu $ cuda-memcheck ./t1073 ========= CUDA-MEMCHECK First values 698 173 Second values 698 173 Third values 698 173 5502745=698173 5866060=120710 3945532=646669 4432346=582703 4971909=746272 5866060 ========= ERROR SUMMARY: 0 errors $。

以下是代码的修改版本，其中包含上述问题：

就个人而言，我无法轻易解释输出，而且我不确定为什么你选择N符号。对于矩阵乘法，c [i]不等于a [i] * b [i]，如果这是你的想法。如果您想要一个易于理解的简单测试，请尝试将a和b矩阵设置为全部1.然后您可以轻松找到正确的输出，它应该全部为N。

另请注意，为简洁起见，我没有尝试在这个问题上教你CUDA编程的每个方面，只是修正一些错误。仅举一个例子，如果将import java.io.ByteArrayOutputStream; import javax.xml.soap.*; import biz.source_code.base64Coder.*; public class Client { private static String endpoint = "https://example.com/xxx.php", username = "xxx", password = "xxx"; private static SOAPMessage getRequest() throws Exception{ MessageFactory factory = MessageFactory.newInstance(); SOAPMessage message = factory.createMessage(); //set authorization as a HTTP header String authorization = Base64Coder.encodeString(username + ":" + password); MimeHeaders hd = message.getMimeHeaders(); hd.addHeader("Authorization", "Basic " + authorization); //Call getReportList operation return message; } public static void main(String[] args) throws Exception { SOAPConnectionFactory connFactory = SOAPConnectionFactory.newInstance(); SOAPConnection connection = connFactory.createConnection(); // create request message and give it content SOAPMessage request = Client.getRequest(); // call the API endpoint with the request SOAPMessage response = connection.call(request, endpoint); ByteArrayOutputStream out = new ByteArrayOutputStream(); response.writeTo(out); String strMsg = new String(out.toByteArray()); System.out.println(strMsg); } }设置为大于32的值，此代码将会中断。您可能需要了解有关CUDA编程的更多信息，以了解其原因。

CUDA c ++，简单矩阵乘法错误

1 个答案: