有效地转置大(密集)二进制矩阵

时间:2017-10-07 01:00:11

标签: c matrix cuda linear-algebra transpose

我想要转置的二进制矩阵(由零和1组成的矩阵)。矩阵的每一行都是一个32位整数的一维数组,整个矩阵是一行数组。

这是128 x 128二进制矩阵的示例,由128128 / 32 32位整数组成。 (实际上,矩阵是一个N x N矩阵,成千上万的{{​​1}}。)

N

什么是转置这种矩阵的有效方法? (AVX2,CUDA,OpenCL ......一切顺利。)

Hacker's Delight有代码用于转置// gcc example0.c -std=c99 && ./a.out #include <stdlib.h> #include <stdio.h> #include <stdint.h> typedef uint32_t uint32; #define N (32 * 4) // Assume this to be divisible by 32. The matrix is intended to be an N x N matrix #define INTS_PER_ROW (N / 32) int main(int argc, char** argv){ uint32** matrix = malloc(N * sizeof(uint32*)); for(int i=0; i<N; ++i) matrix[i] = malloc(INTS_PER_ROW * sizeof(uint32)); for(int i=0; i<N; ++i) for(int j=0; j<INTS_PER_ROW; ++j) matrix[i][j] = rand(); for(int i=0; i<N; ++i){ for(int j=0; j<INTS_PER_ROW; ++j){ printf(" %08x", matrix[i][j]); } puts(""); } for(int i=0; i<N; ++i) free(matrix[i]); free(matrix); } 二进制矩阵,它建议递归使用它们的方法用于更大的矩阵,但我不知道它是否适用于大型矩阵。)< / p>

1 个答案:

答案 0 :(得分:3)

以下是@tera建议使用__ballot()的一种可行方法。

  1. 以合并方式,通过磁贴将数据加载到共享内存
  2. 在加载期间在共享内存中执行初始转置,以将列排列为行以方便__ballot()操作。
  3. 进行投票以建立&#34;最终&#34;列式数据中的行方式数据(已存储在行中)
  4. 将数据写回来
  5. 步骤1-3应该在GPU上相当有效,至少我知道如何最好。步骤4受到以下事实的阻碍:位块的逐位转置不会产生在存储器中相邻的一组32位量。因此,在一般情况下,全局存储上的内存访问模式是分散的。

    输出索引操作是可视化/安排最繁琐的。

    以下是一个包​​含4个测试用例的完整工作示例。对于测试用例,我还写了一个天真的CPU函数来执行&#34; golden&#34;测试,根据问题从another idea in the comments借用。

    $ cat t435.cu
    #include <stdio.h>
    #include <stdlib.h>
    
    #define IDX(d,x,y,ld) d[y*ld+x]
    
    #define cudaCheckErrors(msg) \
      do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
      } while (0)
    
    
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    unsigned long dtime_usec(unsigned long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    __global__ void bt(const unsigned * __restrict__ in, unsigned * __restrict__ out, const unsigned idim){
    
    // assumes "square" binary matrix transpose
    // assumes kernel is launched with 1024 threads per block, 2D, 32x32
    // assumes input matrix dimension idim is evenly divisible by 32 bits
    // assumes idim is specified in bits
    
    
      __shared__ unsigned smem[32][33];
      int idx = threadIdx.x+blockDim.x*blockIdx.x;
      int idy = threadIdx.y+blockDim.y*blockIdx.y;
    
      smem[threadIdx.x][threadIdx.y] = ((idx < idim/32)&&(idy < idim))?IDX(in,idx,idy,idim/32):0;
      __syncthreads();
      unsigned myval = smem[threadIdx.y][31-threadIdx.x];
      __syncthreads();
      smem[ 0][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<31)&myval));
      smem[ 1][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<30)&myval));
      smem[ 2][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<29)&myval));
      smem[ 3][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<28)&myval));
      smem[ 4][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<27)&myval));
      smem[ 5][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<26)&myval));
      smem[ 6][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<25)&myval));
      smem[ 7][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<24)&myval));
      smem[ 8][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<23)&myval));
      smem[ 9][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<22)&myval));
      smem[10][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<21)&myval));
      smem[11][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<20)&myval));
      smem[12][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<19)&myval));
      smem[13][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<18)&myval));
      smem[14][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<17)&myval));
      smem[15][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<16)&myval));
      smem[16][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<15)&myval));
      smem[17][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<14)&myval));
      smem[18][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<13)&myval));
      smem[19][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<12)&myval));
      smem[20][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<11)&myval));
      smem[21][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<10)&myval));
      smem[22][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 9)&myval));
      smem[23][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 8)&myval));
      smem[24][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 7)&myval));
      smem[25][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 6)&myval));
      smem[26][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 5)&myval));
      smem[27][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 4)&myval));
      smem[28][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 3)&myval));
      smem[29][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 2)&myval));
      smem[30][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 1)&myval));
      smem[31][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<< 0)&myval));
      __syncthreads();
      int indx = (idx*idim)+(gridDim.y*threadIdx.y)+blockIdx.y;
      if ((idx < (idim/32))&&(idy < idim)) out[indx] = smem[threadIdx.y][threadIdx.x];
    }
    
    void naive_cpu_bt(const unsigned *in, unsigned *out, const unsigned idim){
      memset(out, 0, idim*(idim/32)*sizeof(unsigned));
      for (int i = 0; i < (idim/32); i++)
        for (int j = 0; j < idim; j++){
          for (int bit = 0; bit < 32; bit++){
            if ((in[(j*(idim/32)) + i]>>bit)&1) out[(((i*32)+(31-bit))*(idim/32))+(j/32)] |=  1<<(31 - (j%32));
            }
          }
    }
    
    int main(){
    
    
      unsigned *h_idata, *h_odata, *d_idata, *d_odata, *c_odata;
      unsigned idim;
    // test case 1, 32x32, upper triangular
      idim = 32;
      h_idata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      h_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      c_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      cudaMalloc(&d_idata, idim*(idim/32)*sizeof(unsigned));
      cudaMalloc(&d_odata, idim*(idim/32)*sizeof(unsigned));
      unsigned data = 0x0FFFFFFFFU;
      for (int i = 0; i < 32; i++){
        h_idata[i] = data;
        data>>=1;}
      cudaMemcpy(d_idata, h_idata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyHostToDevice);
      bt<<<dim3(1,1),dim3(32,32)>>>(d_idata, d_odata, idim);
      cudaMemcpy(h_odata, d_odata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyDeviceToHost);
      naive_cpu_bt(h_idata, c_odata, idim);
      for (int i = 0; i < (idim/32)*idim; i++) if (c_odata[i] != h_odata[i]) {printf("mismatch at %u, was: %u, should be: %u\n", i, h_odata[i], c_odata[i]); return -1;}
      for (int i = 0; i < 32; i++) printf("0x%8x\n", h_odata[i]);
      free(h_idata);
      free(h_odata);
      free(c_odata);
      cudaFree(d_idata);
      cudaFree(d_odata);
    // test case 2, 64x64, opposite diagonal
      idim = 64;
      h_idata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      h_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      c_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      cudaMalloc(&d_idata, idim*(idim/32)*sizeof(unsigned));
      cudaMalloc(&d_odata, idim*(idim/32)*sizeof(unsigned));
      data = 0x01;
      for (int i = 0; i < 32; i++){
        h_idata[2*i] = 0; h_idata[(2*i)+1] = data;
        h_idata[64+(2*i)] = data; h_idata[65+(2*i)] = 0xFFFFFFFFU;
        data<<=1; data++;}
      cudaMemcpy(d_idata, h_idata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyHostToDevice);
      bt<<<dim3(1,2),dim3(32,32)>>>(d_idata, d_odata, idim);
      cudaMemcpy(h_odata, d_odata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyDeviceToHost);
      naive_cpu_bt(h_idata, c_odata, idim);
      for (int i = 0; i < (idim/32)*idim; i++) if (c_odata[i] != h_odata[i]) { printf("mismatch at %u, was: %u, should be: %u\n", i, h_odata[i], c_odata[i]); return -1;}
      for (int i = 0; i < 64; i++) printf("0x%8x 0x%8x\n", h_odata[i*2], h_odata[(i*2)+1]);
      free(h_idata);
      free(h_odata);
      free(c_odata);
      cudaFree(d_idata);
      cudaFree(d_odata);
    // test case 3, 96x96, ones in alternating columns
      idim = 96;
      h_idata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      h_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      c_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      cudaMalloc(&d_idata, idim*(idim/32)*sizeof(unsigned));
      cudaMalloc(&d_odata, idim*(idim/32)*sizeof(unsigned));
      data = 0x55555555U;
      for (int i = 0; i < idim*(idim/32); i++)
        h_idata[i] = data;
      cudaMemcpy(d_idata, h_idata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyHostToDevice);
      bt<<<dim3(1,3),dim3(32,32)>>>(d_idata, d_odata, idim);
      cudaMemcpy(h_odata, d_odata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyDeviceToHost);
      naive_cpu_bt(h_idata, c_odata, idim);
      for (int i = 0; i < (idim/32)*idim; i++) if (c_odata[i] != h_odata[i]) { printf("mismatch at %u, was: %u, should be: %u\n", i, h_odata[i], c_odata[i]); return -1;}
      for (int i = 0; i < 96; i++) printf("0x%8x 0x%8x 0x%8x\n", h_odata[i*3], h_odata[(i*3)+1], h_odata[(i*3)+2]);
      free(h_idata);
      free(h_odata);
      free(c_odata);
      cudaFree(d_idata);
      cudaFree(d_odata);
    // test case 4, 8kx8k random
      idim = 8192;
      int xblocks = (idim/1024)+((idim%1024)?1:0);
      h_idata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      h_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      c_odata = (unsigned *)malloc(idim*(idim/32)*sizeof(unsigned));
      cudaMalloc(&d_idata, idim*(idim/32)*sizeof(unsigned));
      cudaMalloc(&d_odata, idim*(idim/32)*sizeof(unsigned));
      for (int i = 0; i < idim*(idim/32); i++)
        h_idata[i] = rand();
      unsigned long gt = dtime_usec(0);
      cudaMemcpy(d_idata, h_idata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyHostToDevice);
      unsigned long gkt = dtime_usec(0);
      bt<<<dim3(xblocks,(idim/32)),dim3(32,32)>>>(d_idata, d_odata, idim);
      cudaDeviceSynchronize();
      gkt = dtime_usec(gkt);
      cudaMemcpy(h_odata, d_odata, idim*(idim/32)*sizeof(unsigned), cudaMemcpyDeviceToHost);
      gt = dtime_usec(gt);
      unsigned long ct = dtime_usec(0);
      naive_cpu_bt(h_idata, c_odata, idim);
      ct = dtime_usec(ct);
      for (int i = 0; i < (idim/32)*idim; i++) if (c_odata[i] != h_odata[i]) { printf("mismatch at %u, was: %u, should be: %u\n", i, h_odata[i], c_odata[i]); return -1;}
      printf("gputime: %fms, kerneltime: %fms, cputime: %fms\n", (gt*1000)/(float)USECPSEC, (gkt*1000)/(float)USECPSEC, (ct*1000)/(float)USECPSEC);
      printf("kernel bandwidth = %fMB/s\n", (idim*(idim/32)*4*2)/(float)gkt);
      printf("Success!\n");
      free(h_idata);
      free(h_odata);
      free(c_odata);
      cudaFree(d_idata);
      cudaFree(d_odata);
    
      return 0;
    }
    $ nvcc -arch=sm_61 -o t435 t435.cu
    $ ./t435
    0x80000000
    0xc0000000
    0xe0000000
    0xf0000000
    0xf8000000
    0xfc000000
    0xfe000000
    0xff000000
    0xff800000
    0xffc00000
    0xffe00000
    0xfff00000
    0xfff80000
    0xfffc0000
    0xfffe0000
    0xffff0000
    0xffff8000
    0xffffc000
    0xffffe000
    0xfffff000
    0xfffff800
    0xfffffc00
    0xfffffe00
    0xffffff00
    0xffffff80
    0xffffffc0
    0xffffffe0
    0xfffffff0
    0xfffffff8
    0xfffffffc
    0xfffffffe
    0xffffffff
    0x       0 0x       1
    0x       0 0x       3
    0x       0 0x       7
    0x       0 0x       f
    0x       0 0x      1f
    0x       0 0x      3f
    0x       0 0x      7f
    0x       0 0x      ff
    0x       0 0x     1ff
    0x       0 0x     3ff
    0x       0 0x     7ff
    0x       0 0x     fff
    0x       0 0x    1fff
    0x       0 0x    3fff
    0x       0 0x    7fff
    0x       0 0x    ffff
    0x       0 0x   1ffff
    0x       0 0x   3ffff
    0x       0 0x   7ffff
    0x       0 0x   fffff
    0x       0 0x  1fffff
    0x       0 0x  3fffff
    0x       0 0x  7fffff
    0x       0 0x  ffffff
    0x       0 0x 1ffffff
    0x       0 0x 3ffffff
    0x       0 0x 7ffffff
    0x       0 0x fffffff
    0x       0 0x1fffffff
    0x       0 0x3fffffff
    0x       0 0x7fffffff
    0x       0 0xffffffff
    0x       1 0xffffffff
    0x       3 0xffffffff
    0x       7 0xffffffff
    0x       f 0xffffffff
    0x      1f 0xffffffff
    0x      3f 0xffffffff
    0x      7f 0xffffffff
    0x      ff 0xffffffff
    0x     1ff 0xffffffff
    0x     3ff 0xffffffff
    0x     7ff 0xffffffff
    0x     fff 0xffffffff
    0x    1fff 0xffffffff
    0x    3fff 0xffffffff
    0x    7fff 0xffffffff
    0x    ffff 0xffffffff
    0x   1ffff 0xffffffff
    0x   3ffff 0xffffffff
    0x   7ffff 0xffffffff
    0x   fffff 0xffffffff
    0x  1fffff 0xffffffff
    0x  3fffff 0xffffffff
    0x  7fffff 0xffffffff
    0x  ffffff 0xffffffff
    0x 1ffffff 0xffffffff
    0x 3ffffff 0xffffffff
    0x 7ffffff 0xffffffff
    0x fffffff 0xffffffff
    0x1fffffff 0xffffffff
    0x3fffffff 0xffffffff
    0x7fffffff 0xffffffff
    0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    gputime: 5.530000ms, kerneltime: 0.301000ms, cputime: 659.205994ms
    kernel bandwidth = 55738.257812MB/s
    Success!
    $
    

    对于前3个测试用例,我使用了3种不同的输入模式,并打印出结果,除了进行cpu验证。对于第4个测试用例(大型随机模式),我跳过了打印输出,但保留了cpu验证。

    另请注意,我使用了__ballot_sync()变体to be compliant with CUDA 9

    我还没有做过任何性能分析。像大多数转置操作一样,我希望这里的大部分性能源于有效使用全局内存。主要的抑制因素是上面步骤4中的分散写入。

    可以找到关于CUDA中矩阵转置的一些一般背景信息here,尽管这种按位情况与此大不相同。