Kepler / Fermi Arch上的CUDA破碎浮动数学

时间:2014-04-16 17:58:15

标签: cuda

我有一个程序可以完成很多单精度数学运算。如果我指定1.0体系结构但是针对2.X和3.X体系结构,它会产生正确的结果。会导致这种情况的原因是什么?

包括以下内容: 很长的代码示例。 编译命令和良好的输出。 编译命令和输出错误。

如果我使用gcc在CPU中运行相同的路由,我会得到与1.0体系结构相匹配的结果。

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

/* 
 * svdcomp - SVD decomposition routine. 
 * Takes an mxn matrix a and decomposes it into udv, where u,v are
 * left and right orthogonal transformation matrices, and d is a 
 * diagonal matrix of singular values.
 *
 * This routine is adapted from svdecomp.c in XLISP-STAT 2.1 which is 
 * code from Numerical Recipes adapted by Luke Tierney and David Betz.
 * Originally from: "Numerical Recipes in C: The Art of Scientific Computing",
 * Press, Flannery, Teukolosky, Vetterling. 1992.
 *
 * Input to dsvd is as follows:
 *   a = mxn matrix to be decomposed, gets overwritten with u
 *   m = row dimension of a
 *   n = column dimension of a
 *   w = returns the vector of singular values of a
 *   v = returns the right orthogonal transformation matrix
*/

#define SIGN(a, b) ((b) >= 0.0f ? fabsf(a) : -fabsf(a))
#define MIN(x,y) ( (x) < (y) ? (x) : (y) )
#define MAX(x,y) ((x)>(y)?(x):(y))

#define PERR(call) \
  if (call) {\
   fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\
      cudaGetErrorString(cudaGetLastError()));\
   exit(1);\
  }
#define ERRCHECK \
  if (cudaPeekAtLastError()) { \
    fprintf(stderr, "%s:%d Error [%s]\n", __FILE__, __LINE__,\
       cudaGetErrorString(cudaGetLastError()));\
    exit(1);\
  }

__device__ int 
svd(float *a, int m, int n, float *w, float *v, int skip_u)
{
    int flag, i, its, j, jj, k, l, nm;
    float c, f, h, s, x, y, z;
    float anorm = 0.0f, g = 0.0f, scale = 0.0f;

    float rv1[3];

    /* Householder reduction to bidiagonal form */
    for (i = 0; i < n; i++) 
    {
        /* left-hand reduction */
        l = i + 1;
        rv1[i] = scale * g;
        g = s = scale = 0.0f;
        if (i < m) 
        {
            for (k = i; k < m; k++) 
                scale += fabsf(a[k*n+i]);
            if (scale) 
            {
                for (k = i; k < m; k++) 
                {
                    a[k*n+i] /= scale;
                    s += powf(a[k*n+i], 2);
                }
                f = a[i*n+i];
                g = -SIGN(sqrtf(s), f);
                h = f * g - s;
                a[i*n+i] = f - g;
                if (i != n - 1) 
                {
                    for (j = l; j < n; j++) 
                    {
                        for (s = 0.0f, k = i; k < m; k++) 
                            s += a[k*n+i] * a[k*n+j];
                        f = s / h;
                        for (k = i; k < m; k++) 
                            a[k*n+j] += f * a[k*n+i];
                    }
                }
                for (k = i; k < m; k++) 
                    a[k*n+i] *= scale;
            }
        }
        w[i] = scale * g;

        /* right-hand reduction */
        g = s = scale = 0.0f;
        if (i < m && i != n - 1) 
        {
            for (k = l; k < n; k++) 
                scale += fabsf(a[i*n+k]);
            if (scale) 
            {
                for (k = l; k < n; k++) 
                {
                    a[i*n+k] /= scale;
                    s += powf(a[i*n+k], 2);
                }
                f = a[i*n+l];
                g = -SIGN(sqrtf(s), f);
                h = f * g - s;
                a[i*n+l] = f - g;
                for (k = l; k < n; k++) 
                    rv1[k] = a[i*n+k] / h;
                if (i != m - 1) 
                {
                    for (j = l; j < m; j++) 
                    {
                        for (s = 0.0f, k = l; k < n; k++) 
                            s += a[j*n+k] * a[i*n+k];
                        for (k = l; k < n; k++) 
                            a[j*n+k] += s * rv1[k];
                    }
                }
                for (k = l; k < n; k++) 
                    a[i*n+k] *= scale;
            }
        }
        anorm = MAX(anorm, fabsf(w[i]) + fabsf(rv1[i]));
    }

    /* accumulate the right-hand transformation */
    for (i = n - 1; i >= 0; i--) 
    {
        if (i < n - 1) 
        {
            if (g) 
            {
                for (j = l; j < n; j++)
                    v[j*n+i] = (a[i*n+j] / a[i*n+l]) / g;
                    /* float division to avoid underflow */
                for (j = l; j < n; j++) 
                {
                    for (s = 0.0f, k = l; k < n; k++) 
                        s += a[i*n+k] * v[k*n+j];
                    for (k = l; k < n; k++) 
                        v[k*n+j] += s * v[k*n+i];
                }
            }
            for (j = l; j < n; j++) 
                v[i*n+j] = v[j*n+i] = 0.0f;
        }
        v[i*n+i] = 1.0f;
        g = rv1[i];
        l = i;
    }

    /* accumulate the left-hand transformation */
    if (!skip_u) {
        for (i = n - 1; i >= 0; i--) 
        {
            l = i + 1;
            g = w[i];
            if (i < n - 1) 
                for (j = l; j < n; j++) 
                    a[i*n+j] = 0.0f;
            if (g) 
            {
                g = 1.0f / g;
                if (i != n - 1) 
                {
                    for (j = l; j < n; j++) 
                    {
                        for (s = 0.0f, k = l; k < m; k++) 
                            s += a[k*n+i] * a[k*n+j];
                        f = (s / a[i*n+i]) * g;
                        for (k = i; k < m; k++) 
                            a[k*n+j] += f * a[k*n+i];
                    }
                }
                for (j = i; j < m; j++) 
                    a[j*n+i] = a[j*n+i]*g;
            }
            else 
            {
                for (j = i; j < m; j++) 
                    a[j*n+i] = 0.0f;
            }
            ++a[i*n+i];
        }
    }

    /* diagonalize the bidiagonal form */
    for (k = n - 1; k >= 0; k--) 
    {                             /* loop over singular values */
        for (its = 0; its < 30; its++) 
        {                         /* loop over allowed iterations */
            flag = 1;
            for (l = k; l >= 0; l--) 
            {                     /* test for splitting */
                nm = l - 1;
                if (fabsf(rv1[l]) + anorm == anorm) 
                {
                    flag = 0;
                    break;
                }
                if (fabsf(w[nm]) + anorm == anorm) 
                    break;
            }
            if (flag) 
            {
                c = 0.0f;
                s = 1.0f;
                for (i = l; i <= k; i++) 
                {
                    f = s * rv1[i];
                    if (fabsf(f) + anorm != anorm) 
                    {
                        g = w[i];
                        h = hypotf(f, g);
                        w[i] = h; 
                        h = 1.0f / h;
                        c = g * h;
                        s = (- f * h);
                        if (!skip_u) {
                            for (j = 0; j < m; j++) 
                            {
                                y = a[j*n+nm];
                                z = a[j*n+i];
                                a[j*n+nm] = y * c + z * s;
                                a[j*n+i] = z * c - y * s;
                            }
                        }
                    }
                }
            }
            z = w[k];
            if (l == k) 
            {                  /* convergence */
                if (z < 0.0f) 
                {              /* make singular value nonnegative */
                    w[k] = -z;
                    for (j = 0; j < n; j++) 
                        v[j*n+k] = -v[j*n+k];
                }
                break;
            }
            if (its >= 30) {
            }

            /* shift from bottom 2 x 2 minor */
            x = w[l];
            nm = k - 1;
            y = w[nm];
            g = rv1[nm];
            h = rv1[k];
            f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0f * h * y);
            g = hypotf(f, 1.0f);
            f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;

            /* next QR transformation */
            c = s = 1.0f;
            for (j = l; j <= nm; j++) 
            {
                i = j + 1;
                g = rv1[i];
                y = w[i];
                h = s * g;
                g = c * g;
                z = hypotf(f, h);
                rv1[j] = z;
                c = f / z;
                s = h / z;
                f = x * c + g * s;
                g = g * c - x * s;
                h = y * s;
                y = y * c;
                for (jj = 0; jj < n; jj++) 
                {
                    x = v[jj*n+j];
                    z = v[jj*n+i];
                    v[jj*n+j] = x * c + z * s;
                    v[jj*n+i] = z * c - x * s;
                }
                z = hypotf(f, h);
                w[j] = z;
                if (z) 
                {
                    z = 1.0f / z;
                    c = f * z;
                    s = h * z;
                }
                f = (c * g) + (s * y);
                x = (c * y) - (s * g);
                if (!skip_u) {
                    for (jj = 0; jj < m; jj++) 
                    {
                        y = a[jj*n+j];
                        z = a[jj*n+i];
                        a[jj*n+j] = y * c + z * s;
                        a[jj*n+i] = z * c - y * s;
                    }
                }
            }
            rv1[l] = 0.0f;
            rv1[k] = f;
            w[k] = x;
        }
    }
    return(0);
}

__global__ void
svd_kernel(float *v)
{
    float a[9], w[3];

    a[0] = 8.0f;
    a[1] = 3.0f;
    a[2] = 7.0f;

    a[3] = 7.0f;
    a[4] = 9.0f;
    a[5] = 1.0f;

    a[6] = 3.0f;
    a[7] = 7.0f;
    a[8] = 2.0f;

    svd(a, 3, 3, w, v, 1);
}

int main()
{
  int i, j;
  float *v_d, v[9];

  PERR(cudaMalloc(&v_d, 9*sizeof(float)));
  svd_kernel<<<1,1>>>(v_d);
  cudaDeviceSynchronize();
  ERRCHECK;
  PERR(cudaMemcpy(v, v_d, 9*sizeof(float), cudaMemcpyDeviceToHost));

  for (i = 0; i < 3; i++) {
    for (j = 0; j < 3; j++) {
      printf("%6.3f\t", v[i*3+j]);
    }
    printf("\n");
  }

  return 0;
}

正确的结果:

$ nvcc -arch=sm_10 -o svd svd.cu
$ ./svd
-0.657  -0.685   0.314  
-0.668   0.337  -0.664  
-0.349   0.646   0.679

结果破碎:

$ nvcc -arch=sm_20 -o svd svd.cu
$ ./svd 
-0.661  -0.660   0.356  
-0.642   0.253  -0.724  
 0.019   0.460   0.888

1 个答案:

答案 0 :(得分:1)

似乎CUDA 6解决了这个问题。