Question

我正在尝试将一些推力调用替换为arrayfire来检查性能。

我不确定我是否使用了正确的阵列火，因为我所采取的结果完全不匹配。

所以，我使用的推力代码是：

cudaMalloc( (void**) &devRow, N * sizeof(float) );
...//devRow is filled

thrust::device_ptr<float> SlBegin( devRow );
for ( int i = 0; i < N; i++, SlBegin += PerSlElmts )
{
    thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts, SlBegin );
}

cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
//use theRow...

Arrayfire：

af::array SlBegin( N , devRow );
for ( int i = 0;i < N; i++,SlBegin += PerSlElmts )
{
    accum( SlBegin );
}

cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
//use theRow..

我不确定arrayfire如何处理副本：af::array SlBegin( N , devRow );。在推文中我们有设备指针，它指向devRow到SlBegin，但是在arrayfire中？

另外，我想问一下使用gfor。在arrayfire webpage 中，它声明了

请勿直接使用此功能;见GFOR：Parallel For-Loops。

然后是GFOR：

当前版本的ArrayFire
中禁用了GFOR

那么，我们不能使用gfor？

--------- UPDATE ---------------------------

我有一个小的运行示例，显示了不同的结果：

#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

#include "arrayfire.h"

#include <thrust/scan.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

__global__ void Kernel( const int N ,float * const devRow )
{

   int i = threadIdx.x;
   if ( i < N )
        devRow[ i ] = i;

 }

int main(){

    int N = 6;
    int Slices = 2;
    int PerSlElmts = 3;

    float * theRow = (float*) malloc ( N * sizeof( float ));

    for ( int i = 0; i < N; i ++ )
        theRow[ i ] = 0;

    // raw pointer to device memory
    float * devRow;
    cudaMalloc( (void **) &devRow, N * sizeof( float ) );

    Kernel<<< 1,N >>>( N , devRow );
    cudaDeviceSynchronize();

    // wrap raw pointer with a device_ptr
    thrust::device_ptr<float> SlBegin( devRow );

    for ( int i = 0; i < Slices; i++ , SlBegin += PerSlElmts )
        thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts , SlBegin );

    cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );

    for ( int i = 0; i < N; i++ )
        printf("\n Thrust accum : %f",theRow[ i ] );


    //--------------------------------------------------------------------//
    Kernel<<< 1,N >>>( N , devRow );
    cudaDeviceSynchronize();

    af::array SlBeginFire( N, devRow );

    for ( int i = 0; i < Slices; i++ , SlBeginFire += PerSlElmts )
        af::accum( SlBeginFire );

    SlBeginFire.host( theRow );

    for ( int i = 0; i < N; i++ )
            printf("\n Arrayfire accum : %f",theRow[ i ] );

    cudaFree( devRow );
    free( theRow );


    return 0;

}

Answer 1

看起来您正在尝试在2D阵列上运行逐列（在ArrayFire中为0-dim）扫描。以下是您可以使用的一些代码：

af::array SlBegin(N, devRow);
af::array result = accum(SlBegin, 0);

这是一个示例输出

A [5 3 1 1]
0.7402     0.4464     0.7762 
0.9210     0.6673     0.2948 
0.0390     0.1099     0.7140 
0.9690     0.4702     0.3585 
0.9251     0.5132     0.6814 

accum(A, 0) [5 3 1 1]
0.7402     0.4464     0.7762 
1.6612     1.1137     1.0709 
1.7002     1.2236     1.7850 
2.6692     1.6938     2.1435 
3.5943     2.2070     2.8249

这会独立地对每列进行独立扫描。

至于gfor，它已被添加到ArrayFire的开源版本中。由于此代码库仍然是测试版，因此改进和修复程序正在迅速发生。所以请关注我们的github页面。

从推力到阵火 - gfor用法？

1 个答案: