我发现了分段故障发生的地方,但我现在不知道该怎么做

时间:2018-03-01 04:32:57

标签: c simd simd-library

基本上,我正在学习一门名为并行编程的课程。但是,我没有使用C编程的经验,也不太了解计算机体系结构。

我知道缓存比内存更快......但是,我不知道这些概念是如何与C编程相关的。

无论如何我的任务是制作矩阵乘法的快速版本。从我在网上搜索,我应该使用SIMD SSE,这是一个允许您一次执行多个操作并使用循环展开的界面。

然而,当我尝试它时,我不断得到段故障。我使用printf()来查找它发生的位置,但我仍然不明白为什么或之后要做什么。

#include <emmintrin.h>
const char* dgemm_desc = "Simple blocked dgemm.";

#if !defined(BLOCK_SIZE)
#define BLOCK_SIZE 41
#endif

#define min(a,b) (((a)<(b))?(a):(b))
void do_block_fast (int lda, int M, int N, int K, double* A, double* B, double* C)
{
 static double a[BLOCK_SIZE*BLOCK_SIZE] __attribute__ ((aligned (16)));
static double temp[1] __attribute__ ((aligned (16)));
  __m128d vec1;
  __m128d vec2;
  __m128d vec3;
  __m128d vec4;

    // make a local aligned copy of A's block
 for( int j = 0; j < K; j++ )
        for( int i = 0; i < M; i++ )
            a[i+j*BLOCK_SIZE] = A[i+j*lda];
    /* For each row i of A */
    for (int i = 0; i < M; ++i)
    /* For each column j of B */
      for (int j = 0; j < N; ++j)
        {
            /* Compute C(i,j) */
            double cij = C[i+j*lda];
            for (int k = 0; k < K; k += 2){
                printf("0");
                vec1 = _mm_load_pd(&a[i+k*BLOCK_SIZE]);
                printf("1");
                vec2 = _mm_loadu_pd (&B[k+j*lda]);
                printf("2");
                vec3 = _mm_mul_pd(vec1, vec2);
                printf("3"); 
                _mm_storeu_pd(&temp[0], vec3);

                printf("4"); // SEGMENTATION fault occurs right after 4 is printed
                cij += temp[0];
                printf("5");
            }
            printf("5");
            C[i+j*lda] = cij;
        }
}

printf(“4”)后发生错误;但是我不确定为什么。我尝试过temp []数组的不同(alligned(x))版本。我甚至尝试用常规变量替换它。但是分段错误仍然发生。 一世 这是调用do_block_fast()

的主程序
/* This routine performs a dgemm operation
 *  *  C := C + A * B
 *   * where A, B, and C are lda-by-lda matrices stored in column-major format. 
 *    * On exit, A and B maintain their input values. */
void square_dgemm (int lda, double* A, double* B, double* C)
{
  /* For each block-row of A */
  for (int i = 0; i < lda; i += BLOCK_SIZE)
    /* For each block-column of B */
    for (int j = 0; j < lda; j += BLOCK_SIZE)
      /* Accumulate block dgemms into block of C */
      for (int k = 0; k < lda; k += BLOCK_SIZE)
      {
        /* Correct block dimensions if block "goes off edge of" the matrix */
        int M = min (BLOCK_SIZE, lda-i);
        int N = min (BLOCK_SIZE, lda-j);
        int K = min (BLOCK_SIZE, lda-k);

        /* Perform individual block dgemm */
        if((M % BLOCK_SIZE == 0) && (N % BLOCK_SIZE == 0) && (K % BLOCK_SIZE == 0))
        {
                do_block_fast(lda, M, N, K, A + i + k*lda, B + k + j*lda, C + i + j*lda);
         }else{
                do_block(lda, M, N, K, A + i + k*lda, B + k + j*lda, C + i + j*lda);
         }
      }
}

0 个答案:

没有答案