Simpson将实值函数与CUDA集成的方法

时间:2013-04-21 17:39:54

标签: cuda integral

我正在尝试使用Simpson在CUDA中的方法对集成进行编码。

这是辛普森规则的公式

enter image description here

其中x_k = a + k*h

这是我的代码

    __device__ void initThreadBounds(int *n_start, int *n_end, int n, 
                                        int totalBlocks, int blockWidth)
    {
        int threadId = blockWidth * blockIdx.x + threadIdx.x;
        int nextThreadId = threadId + 1;

        int threads = blockWidth * totalBlocks;

        *n_start = (threadId * n)/ threads;
        *n_end =  (nextThreadId * n)/ threads;
    }

    __device__ float reg_func (float x)
    {
        return x;
    }

    typedef float (*p_func) (float);

    __device__ p_func integrale_f = reg_func;

    __device__ void integralSimpsonMethod(int totalBlocks, int totalThreads, 
                    double a, double b, int n, float p_function(float), float* result)
    {
        *result = 0;

        float h = (b - a)/n; 
        //*result = p_function(a)+p_function(a + h * n);
        //parallel
        int idx_start;
        int idx_end;
        initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
        //parallel_ends
        for (int i = idx_start; i < idx_end; i+=2) {
            *result +=  ( p_function(a + h*(i-1)) + 
                          4 * p_function(a + h*(i)) + 
                          p_function(a + h*(i+1)) ) * h/3;

        }   
    } 


    __global__ void integralSimpson(int totalBlocks, int totalThreads,  float* result)
    {
        float res = 0;

        integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
        result[(blockIdx.x*totalThreads + threadIdx.x)] = res;

        //printf ("Simpson method\n");
    }


    __host__ void inttest()
    {

        const int blocksNum = 32;
        const int threadNum = 32;

        float   *device_resultf; 
        float   host_resultf[threadNum*blocksNum]={0};


        cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
            integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
        cudaThreadSynchronize();

        cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum, 
                      cudaMemcpyDeviceToHost);

        float sum = 0;
        for (int i = 0; i != blocksNum*threadNum; ++i) {
            sum += host_resultf[i];
            //  printf ("result in %i cell = %f \n", i, host_resultf[i]);
        }
        printf ("sum = %f \n", sum);
        cudaFree(device_resultf);
    }

int main(int argc, char* argv[])
{


   inttest();


    int i;
    scanf ("%d",&i);

}

问题是:当n低于100000时,它会出错。对于从010的积分,结果为~99,但是n = 100000~50 1}}或更大它工作正常,结果是{{1}}。

怎么了,伙计们?

2 个答案:

答案 0 :(得分:5)

这里的基本问题是你不了解自己的算法。

您的integralSimpsonMethod()函数的设计使得每个线程在整数域中每个子区间至少采样3个正交点。因此,如果选择n使其小于内核调用中线程数的四倍,则每个子区间将不可避免地重叠,并且结果积分将不正确。您需要确保代码检查并缩放线程数或n,以便在计算积分时它们不会产生重叠。

如果您正在为自我修改以外的任何事情进行此操作,那么我建议您查看Simpson规则的复合版本。这更适合并行实现,如果正确实现,性能会更高。

答案 1 :(得分:3)

我建议使用CUDA Thrust来解决Simpson的集成问题。你基本上需要五个步骤:

  1. 生成Simpson的正交权重;
  2. 生成函数采样点;
  3. 生成函数值;
  4. 计算正交权重和函数值之间的元素乘积;
  5. 总结以上产品。
  6. 步骤#1需要创建一个重复多次元素的数组,即Simpson案例的1 4 2 4 2 4 ... 1。这可以通过在cuda thrust library repeat vector multiple times中借用Robert Crovella的方法来实现。

    步骤#2可以通过使用couting_iterators并在Purpose and usage of counting_iterators in CUDA Thrust library中借用talonmies方法来完成。

    步骤#3是thrust::transform的应用程序。

    步骤#4和#5可以通过thrust::inner_product一起完成。

    当其他正交积分规则感兴趣时,也可以利用这种方法。

    这是代码

    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/constant_iterator.h>
    #include <thrust/inner_product.h>
    #include <thrust/functional.h>
    
    #include <thrust/fill.h>
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    
    // for printing
    #include <thrust/copy.h>
    #include <ostream>
    
    #define STRIDE 2
    #define N 100
    
    #define pi_f  3.14159265358979f                 // Greek pi in single precision
    
    struct sin_functor
    {
        __host__ __device__
        float operator()(float x) const
        {
            return sin(2.f*pi_f*x);
        }
    };
    
    template <typename Iterator>
    class strided_range
    {
        public:
    
        typedef typename thrust::iterator_difference<Iterator>::type difference_type;
    
        struct stride_functor : public thrust::unary_function<difference_type,difference_type>
        {
            difference_type stride;
    
            stride_functor(difference_type stride)
                : stride(stride) {}
    
            __host__ __device__
            difference_type operator()(const difference_type& i) const
            {
                return stride * i;
            }
        };
    
        typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
        typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
        typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
    
        // type of the strided_range iterator
        typedef PermutationIterator iterator;
    
        // construct strided_range for the range [first,last)
        strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}
    
        iterator begin(void) const
        {
            return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
        }
    
        iterator end(void) const
        {
            return begin() + ((last - first) + (stride - 1)) / stride;
        }
    
        protected:
            Iterator first;
            Iterator last;
            difference_type stride;
    };
    
    int main(void)
    {
        // --- Generate the integration coefficients
        thrust::host_vector<float> h_coefficients(STRIDE);
        h_coefficients[0] = 4.f;
        h_coefficients[1] = 2.f;
    
        thrust::device_vector<float> d_coefficients(N);
    
        typedef thrust::device_vector<float>::iterator Iterator;
        strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
        strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
    
        thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
        thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
    
        d_coefficients[0]       = 1.f;
        d_coefficients[N-1]     = 1.f;
    
        // print the generated d_coefficients
        std::cout << "d_coefficients: ";
        thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " "));  std::cout << std::endl;
    
        // --- Generate sampling points
        float a     = 0.f;
        float b     = .5f;
    
        float Dx    = (b-a)/(float)(N-1);
    
        thrust::device_vector<float> d_x(N);
    
        thrust::transform(thrust::make_counting_iterator(a/Dx),
            thrust::make_counting_iterator((b+1.f)/Dx),
            thrust::make_constant_iterator(Dx),
            d_x.begin(),
            thrust::multiplies<float>());
    
        // --- Calculate function values
        thrust::device_vector<float> d_y(N);
    
        thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
    
        // --- Calculate integral
        float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
    
        printf("The integral is = %f\n", integral);
    
        getchar();
    
        return 0;
    }