Question

我正在尝试将 CUSP 集成到现有的 Fortran 代码中。目前我只是想搭建 Thrust/CUSP 的基本框架：从 Fortran 传入数组，并用它们构建一个 CUSP 矩阵（暂时使用 COO 格式）。借助下面这个帖子，我已经能够把一个类似 C 例程的包装函数编译成库，并将它与 Fortran 代码链接起来：unresolved-references-using-ifort-with-nvcc-and-cusp

在上一个帖子的帮助下，我已经可以确认 Fortran 正确地传入了数组指针：Generating CUSP coo_matrix from passed FORTRAN arrays

不幸的是，我仍然无法让CUSP使用这些来生成和打印矩阵。代码和输出如下所示：

输出

$ ./fort_cusp_test
 testing 1 2 3
n: 3, nnz: 9
     i,  row_i,  col_j,        val_v
     0,      1,      1,   1.0000e+00
     1,      1,      2,   2.0000e+00
     2,      1,      3,   3.0000e+00
     3,      2,      1,   4.0000e+00
     4,      2,      2,   5.0000e+00
     5,      2,      3,   6.0000e+00
     6,      3,      1,   7.0000e+00
     7,      3,      2,   8.0000e+00
     8,      3,      3,   9.0000e+00
initialized row_i into thrust
initialized col_j into thrust
initialized val_v into thrust
defined CUSP integer array view for row_i and col_j
defined CUSP float array view for val_v
loaded row_i into a CUSP integer array view
loaded col_j into a CUSP integer array view
loaded val_v into a CUSP float array view
defined CUSP coo_matrix view
Built matrix A from CUSP device views
sparse matrix <3, 3> with 9 entries
libc++abi.dylib: terminating with uncaught exception of type thrust::system::system_error: invalid argument

Program received signal SIGABRT: Process abort signal.

Backtrace for this error:
#0  0x10d0fdff6
#1  0x10d0fd593
#2  0x7fff8593ff19
Abort trap: 6

fort_cusp_test.f90

! Minimal driver: stores a dense 3x3 matrix as COO triplets (row index,
! column index, value) and passes the arrays to the C/CUDA wrapper
! test_coo_mat_print_, which prints them.
program fort_cuda_test

   use, intrinsic :: iso_c_binding, only: c_int, c_float

   implicit none

   ! Explicit interface to the C wrapper. The binding name carries the
   ! trailing underscore used by the C definition, so the linked symbol is
   ! the same one the previous implicit-interface call resolved to.
   ! Every argument is passed by reference, matching the pointer parameters
   ! of the C function.
   interface
      subroutine test_coo_mat_print(row_i, col_j, val_v, n, nnz) &
            bind(C, name="test_coo_mat_print_")
         use, intrinsic :: iso_c_binding, only: c_int, c_float
         implicit none
         integer(c_int), intent(in) :: row_i(*), col_j(*)
         real(c_float),  intent(in) :: val_v(*)
         integer(c_int), intent(in) :: n, nnz
      end subroutine test_coo_mat_print
   end interface

   integer(c_int) :: n       ! matrix dimension (n x n)
   integer(c_int) :: nnz     ! number of stored entries

   integer(c_int) :: rowI(9), colJ(9)
   real(c_float)  :: valV(9)

   n     =  3
   nnz   =  9
   ! NOTE(review): the indices below are 1-based and are passed on unchanged;
   ! CUSP presumably expects 0-based indices -- confirm before using the
   ! matrix for anything other than printing.
   rowI =  (/ 1, 1, 1, 2, 2, 2, 3, 3, 3/)
   colJ =  (/ 1, 2, 3, 1, 2, 3, 1, 2, 3/)
   valV =  (/ 1, 2, 3, 4, 5, 6, 7, 8, 9/)

   write(*,*) "testing 1 2 3"

   call test_coo_mat_print (rowI,colJ,valV,n,nnz)

end program fort_cuda_test

cusp_runner.cu

#include <stdio.h>
#include <cusp/coo_matrix.h>
#include <iostream>
// #include <cusp/krylov/cg.h>
#include <cusp/print.h>

#if defined(__cplusplus)
extern "C" {
#endif

/*
 * Fortran-callable routine (failing version from the question): prints the
 * incoming COO triplets, wraps the three raw arrays in thrust::device_ptr /
 * cusp::array1d_view objects, assembles an n x n cusp::coo_matrix_view over
 * them and prints it with cusp::print.
 *
 * row_i, col_j : arrays holding the row / column index of each entry
 * val_v        : array holding the value of each entry
 * N            : pointer to the matrix dimension
 * NNZ          : pointer to the number of entries
 *
 * NOTE(review): no memory is allocated or copied here; the pointers are used
 * as received. The Fortran driver passes ordinary host arrays, so treating
 * them as device pointers is presumably what raises the
 * "thrust::system_error: invalid argument" shown in the captured output --
 * confirm against the working version, which copies into
 * thrust::device_vector first.
 */
void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ ) {

   int n, nnz;

   // All arguments arrive as pointers; dereference the two scalars once.
   n = *N;
   nnz = *NNZ;

   printf("n: %d, nnz: %d\n",n,nnz);

   // Echo the input triplets. The loop bound is n, so only the first n of
   // the nnz entries are printed here.
   printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v");
   for(int i=0;i<n;i++) {
      printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]);
   }
   // Disabled debugging guard, kept from the original listing:
   //if ( false ) {
   // Wrap the raw input pointers with thrust::device_ptr. This only
   // re-labels the addresses; nothing is transferred.
   thrust::device_ptr<int> wrapped_device_I(row_i);
   printf("initialized row_i into thrust\n");
   thrust::device_ptr<int> wrapped_device_J(col_j);
   printf("initialized col_j into thrust\n");
   thrust::device_ptr<float> wrapped_device_V(val_v);
   printf("initialized val_v into thrust\n");

   // View types used to wrap the individual arrays (index and value flavours).
   typedef typename cusp::array1d_view< thrust::device_ptr<int> > DeviceIndexArrayView;
   printf("defined CUSP integer array view for row_i and col_j\n");
   typedef typename cusp::array1d_view< thrust::device_ptr<float> > DeviceValueArrayView;
   printf("defined CUSP float array view for val_v\n");

   // Each view spans nnz elements starting at the wrapped pointer.
   DeviceIndexArrayView row_indices(wrapped_device_I, wrapped_device_I + nnz);
   printf("loaded row_i into a CUSP integer array view\n");
   DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + nnz);
   printf("loaded col_j into a CUSP integer array view\n");
   DeviceValueArrayView values(wrapped_device_V, wrapped_device_V + nnz);
   printf("loaded val_v into a CUSP float array view\n");

   // Combine the three array1d_views into a coo_matrix_view type.
   typedef cusp::coo_matrix_view<DeviceIndexArrayView,DeviceIndexArrayView,DeviceValueArrayView> DeviceView;
   printf("defined CUSP coo_matrix view\n");

   // Construct the n x n view with nnz entries from the array1d_views.
   DeviceView A(n,n,nnz,row_indices,column_indices,values);
   printf("Built matrix A from CUSP device views\n");

   // In the captured run the header line is printed and then the exception
   // is thrown, so "Printed matrix A" is never reached.
   cusp::print(A);
   printf("Printed matrix A\n");
 // End of the disabled debugging guard:
 //}
}
#if defined(__cplusplus)
}
#endif

Makefile

# Neither target produces a file named after itself, so mark both as phony.
.PHONY: Test clean

# Build the CUDA wrapper as a shared library, compile the Fortran driver,
# then link the two together with the CUDA runtime.
Test:
	nvcc -Xcompiler="-fPIC" -shared cusp_runner.cu -o cusp_runner.so -I/Developer/NVIDIA/CUDA-6.5/include/cusp
	gfortran -c fort_cusp_test.f90
	gfortran fort_cusp_test.o cusp_runner.so -L/Developer/NVIDIA/CUDA-6.5/lib -lcudart -o fort_cusp_test

# -f keeps clean from failing when some artifacts were never built.
clean:
	rm -f *.o *.so fort_cusp_test

cusp_runner.cu 的可运行版本：

#include <stdio.h>

#include <exception>
#include <iostream>

#include <cusp/coo_matrix.h>
// #include <cusp/krylov/cg.h>
#include <cusp/print.h>

#if defined(__cplusplus)
extern "C" {
#endif

/*
 * Fortran-callable routine: echoes a COO triplet list, copies it into
 * thrust::device_vector storage, loads it into an n x n cusp::coo_matrix in
 * device memory and prints the matrix with cusp::print.
 *
 * row_i, col_j : arrays of *NNZ row / column indices
 * val_v        : array of *NNZ values
 * N            : pointer to the matrix dimension
 * NNZ          : pointer to the number of entries
 *
 * Nothing is returned. Invalid arguments and any exception raised by
 * Thrust/CUSP are reported on stderr and the routine returns normally, so
 * that no C++ exception unwinds into the (non-C++) caller.
 *
 * NOTE(review): indices are copied exactly as given. The Fortran driver
 * passes 1-based indices; CUSP presumably expects 0-based ones -- confirm
 * before using the matrix for anything other than printing.
 */
void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ ) {

   if (!row_i || !col_j || !val_v || !N || !NNZ) {
      fprintf(stderr, "test_coo_mat_print_: null argument\n");
      return;
   }

   const int n = *N;
   const int nnz = *NNZ;

   printf("n: %d, nnz: %d\n",n,nnz);

   // A negative count would make the [ptr, ptr + nnz) ranges below invalid.
   if (n < 0 || nnz < 0) {
      fprintf(stderr, "test_coo_mat_print_: n and nnz must be non-negative\n");
      return;
   }

   printf("printing input (row_i, col_j, val_v)\n");
   printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v");
   for(int i=0;i<nnz;i++) {
      printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]);
   }

   try {
      // Constructing a device_vector from a pointer range allocates device
      // storage and copies the elements into it; the storage is released
      // automatically when the vectors go out of scope.
      printf("initializing thrust device vectors\n");
      thrust::device_vector<int> device_I(row_i,row_i+nnz);
      printf("device_I initialized\n");
      thrust::device_vector<int> device_J(col_j,col_j+nnz);
      printf("device_J initialized\n");
      thrust::device_vector<float> device_V(val_v,val_v+nnz);
      printf("device_V initialized\n");

      cusp::coo_matrix<int, float, cusp::device_memory> A(n,n,nnz);
      printf("initialized empty CUSP coo_matrix on device\n");

      A.row_indices = device_I;
      printf("loaded device_I into A.row_indices\n");
      A.column_indices = device_J;
      printf("loaded device_J into A.column_indices\n");
      A.values = device_V;
      printf("loaded device_V into A.values\n");

      cusp::print(A);
      printf("Printed matrix A\n");
   } catch (const std::exception & e) {
      fprintf(stderr, "test_coo_mat_print_: Thrust/CUSP error: %s\n", e.what());
   } catch (...) {
      fprintf(stderr, "test_coo_mat_print_: unknown Thrust/CUSP error\n");
   }
}
#if defined(__cplusplus)
}
#endif

Answer 1

Thrust/CUSP 一侧处理指针的代码完全不正确。下面这行：

thrust::device_ptr<int> wrapped_device_I(row_i);

并没有做你以为它会做的事情。你实际上是把一个主机地址直接当作设备地址来使用。除非使用的是 CUDA 托管内存（managed memory），否则这是非法的，而我在这段代码中看不到任何使用托管内存的迹象。你需要做的是先在 GPU 上分配内存，并在开始计算之前把 Fortran 数组复制到 GPU 上。例如：

thrust::device_ptr<int> wrapped_device_I = thrust::device_malloc<int>(nnz);
thrust::copy(row_i, row_i + nnz, wrapped_device_I);

[免责声明：完全未经测试，自担风险使用]

对每个 COO 向量都这样处理。不过，我建议用 thrust::device_vector 实例来替换 test_coo_mat_print_ 中 GPU 设置部分的大部分代码。除了更易于使用之外，这些向量在超出作用域时会自动释放内存，从而大大降低出现内存泄漏的可能性。如下所示：

thrust::device_vector<int> device_I(row_i, row_i + nnz);

只需一次调用就能完成所有工作（分配设备内存并复制数据）。

最后一点建议：如果你在开发多语言代码库，请把每种语言的代码设计成完全独立的部分，并各自配有本语言的测试代码。如果你在这里这样做了，就会发现 C++ 部分本身就无法正常工作，与你遇到的任何 Fortran 问题都无关。这会让调试简单得多。

如何从传递的数组中正确构造CUSP coo矩阵

1 个答案: