Question

我有以下C代码：

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

double enorm(double* v1, int length){
    cublasHandle_t handle;
    double result = 0;
    double* vector;
    cudaMalloc((void**) &vector, length * sizeof(double));
    cublasSetVector(length, sizeof(double), v1, 1, vector,1);
    cublasCreate(&handle);
    cublasDnrm2(handle, length, vector, 1, &result);
    cudaFree(vector);
    return result;
}

double testnorm(double* v1, int len){
    double tmp = 0;
    for(int i = 0; i < len; i++){
        tmp += v1[i]*v1[i];
    }
    return sqrt(tmp);
}

/*
 * Build the vector (3, 4) on the heap and print its norm twice: once via the
 * cuBLAS-backed enorm and once via the CPU reference testnorm.
 *
 * Returns EXIT_FAILURE if the host buffer cannot be allocated, 0 otherwise.
 */
int main() {
    double* a = malloc(2 * sizeof(double));
    if (a == NULL) {
        fprintf(stderr, "main: out of host memory\n");
        return EXIT_FAILURE;
    }

    a[0] = 3;
    a[1] = 4;
    printf("%.f\n", enorm(a, 2));
    printf("%.f\n", testnorm(a,2));

    free(a);
    return 0;
}

以及下面的Haskell代码，它通过FFI调用了上面的C函数：

import qualified Foreign.Ptr as P
import System.IO.Unsafe
import Foreign.C.Types
import qualified Data.Vector.Storable as SV
import Foreign.C.Types

-- | FFI binding to the C function @enorm@: takes a pointer to 'CDouble'
-- elements and a 'CInt' element count, and returns a 'CDouble'.
--
-- NOTE(review): the import has no 'IO' in its result type, so GHC treats the
-- call as a pure function even though the C side allocates GPU memory and
-- copies data to the device — confirm that this is intended.
foreign import ccall "enorm" c_enorm :: P.Ptr CDouble -> CInt -> CDouble

-- | FFI binding to the C function @testnorm@: takes a pointer to 'CDouble'
-- elements and a 'CInt' element count, and returns a 'CDouble'.
-- Imported without 'IO', i.e. as a pure function.
foreign import ccall "testnorm" c_testnorm :: P.Ptr CDouble -> CInt -> CDouble

-- | Euclidean norm of a storable vector, computed by the C function bound as
-- 'c_enorm'.
--
-- The foreign call is forced with '$!' inside the 'SV.unsafeWith' callback.
-- With a plain 'return' the callback would hand back an unevaluated thunk,
-- and the C function would only run once the result is demanded — after
-- 'SV.unsafeWith' has finished, when the pointer is no longer guaranteed to
-- refer to live memory.
enorm :: SV.Vector CDouble -> CDouble
enorm v1 = unsafePerformIO $
  SV.unsafeWith v1 $ \ptr -> return $! c_enorm ptr len
  where
    -- Element count, converted to the integer type expected by the C side.
    len = fromIntegral (SV.length v1)

-- | Euclidean norm of a storable vector, computed by the C function bound as
-- 'c_testnorm'.
--
-- The foreign call is forced with '$!' inside the 'SV.unsafeWith' callback.
-- With a plain 'return' the callback would hand back an unevaluated thunk,
-- and the C function would only run once the result is demanded — after
-- 'SV.unsafeWith' has finished, when the pointer is no longer guaranteed to
-- refer to live memory.
testnorm :: SV.Vector CDouble -> CDouble
testnorm v1 = unsafePerformIO $
  SV.unsafeWith v1 $ \ptr -> return $! c_testnorm ptr len
  where
    -- Element count, converted to the integer type expected by the C side.
    len = fromIntegral (SV.length v1)

-- | Print the norm of the vector (3, 4), first as computed by 'enorm' and
-- then as computed by 'testnorm', one result per line.
main :: IO ()
main = mapM_ (print . ($ sample)) [enorm, testnorm]
  where
    sample :: SV.Vector CDouble
    sample = SV.fromList [3, 4]

出于某种原因，C程序如预期输出5和5，而等效的Haskell程序却输出0和5。这说明数组虽然已成功传递给C函数，但在把数据复制到GPU时出了问题。造成这种行为的原因是什么？应该如何修改才能得到预期的结果？

C编译为：

gcc -o cmain blas.c -I/opt/cuda/include -L/opt/cuda/lib64 -lcublas -lcudart -lm

Haskell编译为：

# Compile blas.c into a position-independent object file. With -c nothing is
# linked, so the -L/-l options on this line have no effect.
gcc -c -fPIC -o testblas.o blas.c -I/opt/cuda/include -L/opt/cuda/lib64 -lcublas -lcudart -lm
# Wrap the object file in a shared library. No CUDA libraries are named on
# this line; they are supplied on the ghc link line below instead.
gcc -shared -o libtestblas.so  testblas.o
# Build the Haskell program, linking cuBLAS, the CUDA runtime, libm and
# libtestblas from the current directory.
ghc -o main Main.hs -I/opt/cuda/include -L/opt/cuda/lib64 -lcublas -lcudart -lm -L./ -ltestblas
# Let the dynamic loader search the current directory for libtestblas.so,
# then run the program.
export LD_LIBRARY_PATH=./; ./main

编辑：从Haskell调用时，cudaMalloc返回cudaErrorMemoryAllocation（无法分配足够的内存）；而在纯C程序中，所有调用都成功返回。

将Storable Vector传递给C函数，在GPU上分配存储的数据

0 个答案: