基本上,我正在学习一门名为并行编程的课程。但是,我没有使用C编程的经验,也不太了解计算机体系结构。
我知道缓存比内存更快......但是,我不知道这些概念是如何与C编程相关的。
无论如何我的任务是制作矩阵乘法的快速版本。从我在网上搜索,我应该使用SIMD SSE,这是一个允许您一次执行多个操作并使用循环展开的界面。
然而,当我尝试它时,我不断得到段故障。我使用printf()来查找它发生的位置,但我仍然不明白为什么或之后要做什么。
#include <emmintrin.h>
const char* dgemm_desc = "Simple blocked dgemm.";
#if !defined(BLOCK_SIZE)
#define BLOCK_SIZE 41
#endif
#define min(a,b) (((a)<(b))?(a):(b))
void do_block_fast (int lda, int M, int N, int K, double* A, double* B, double* C)
{
static double a[BLOCK_SIZE*BLOCK_SIZE] __attribute__ ((aligned (16)));
static double temp[1] __attribute__ ((aligned (16)));
__m128d vec1;
__m128d vec2;
__m128d vec3;
__m128d vec4;
// make a local aligned copy of A's block
for( int j = 0; j < K; j++ )
for( int i = 0; i < M; i++ )
a[i+j*BLOCK_SIZE] = A[i+j*lda];
/* For each row i of A */
for (int i = 0; i < M; ++i)
/* For each column j of B */
for (int j = 0; j < N; ++j)
{
/* Compute C(i,j) */
double cij = C[i+j*lda];
for (int k = 0; k < K; k += 2){
printf("0");
vec1 = _mm_load_pd(&a[i+k*BLOCK_SIZE]);
printf("1");
vec2 = _mm_loadu_pd (&B[k+j*lda]);
printf("2");
vec3 = _mm_mul_pd(vec1, vec2);
printf("3");
_mm_storeu_pd(&temp[0], vec3);
printf("4"); // SEGMENTATION fault occurs right after 4 is printed
cij += temp[0];
printf("5");
}
printf("5");
C[i+j*lda] = cij;
}
}
printf(“4”)后发生错误;但是我不确定为什么。我尝试过temp []数组的不同(alligned(x))版本。我甚至尝试用常规变量替换它。但是分段错误仍然发生。 一世 这是调用do_block_fast()
的主程序/* This routine performs a dgemm operation
* * C := C + A * B
* * where A, B, and C are lda-by-lda matrices stored in column-major format.
* * On exit, A and B maintain their input values. */
void square_dgemm (int lda, double* A, double* B, double* C)
{
/* For each block-row of A */
for (int i = 0; i < lda; i += BLOCK_SIZE)
/* For each block-column of B */
for (int j = 0; j < lda; j += BLOCK_SIZE)
/* Accumulate block dgemms into block of C */
for (int k = 0; k < lda; k += BLOCK_SIZE)
{
/* Correct block dimensions if block "goes off edge of" the matrix */
int M = min (BLOCK_SIZE, lda-i);
int N = min (BLOCK_SIZE, lda-j);
int K = min (BLOCK_SIZE, lda-k);
/* Perform individual block dgemm */
if((M % BLOCK_SIZE == 0) && (N % BLOCK_SIZE == 0) && (K % BLOCK_SIZE == 0))
{
do_block_fast(lda, M, N, K, A + i + k*lda, B + k + j*lda, C + i + j*lda);
}else{
do_block(lda, M, N, K, A + i + k*lda, B + k + j*lda, C + i + j*lda);
}
}
}