我正在使用此代码:
#include <stdlib.h>
#include <stdio.h>
#include <emmintrin.h>
#include <iostream>
// Matrix dimensions: res is N x M, mul1 is N x C, mul2 is C x M.
#define N 3
#define M 3
#define C 3
// Number of doubles that fit in 64 bytes (presumably one cache line);
// main uses it as the edge length of a tile.
#define SM (64 / sizeof (double))
// Result and operand matrices. Only the start of each array is guaranteed to
// sit on a 64-byte boundary; individual rows carry no alignment guarantee.
alignas(64) double res[N][M];
alignas(64) double mul1[N][C];
alignas(64) double mul2[C][M];
int main(void) {
for (int kk = 0; kk < N; kk++) {
for (int jj = 0; jj < C; jj++) {
mul1[kk][jj] = kk + jj;
}
}
for (int kk = 0; kk < C; kk++) {
for (int jj = 0; jj < M; jj++) {
mul2[kk][jj] = kk + jj + 1;
}
}
unsigned int i, i2, j, j2, k, k2;
double *__restrict rres;
double *__restrict rmul1;
double *__restrict rmul2;
for (i = 0; i < N; i += SM)
for (j = 0; j < N; j += SM)
for (k = 0; k < N; k += SM)
for (i2 = 0, rres = &res[i][j], rmul1 = &mul1[i][k]; i2 < SM;
++i2, rres += N, rmul1 += N) {
_mm_prefetch(&rmul1[8], _MM_HINT_NTA);
for (k2 = 0, rmul2 = &mul2[k][j]; k2 < SM;
++k2, rmul2 += N) {
__m128d m1d = _mm_load_sd(&rmul1[k2]);
m1d = _mm_unpacklo_pd(m1d, m1d);
for (j2 = 0; j2 < SM; j2 += 2) {
__m128d m2 = _mm_load_pd(&rmul2[j2]);<--crash here
__m128d r2 = _mm_load_pd(&rres[j2]);
_mm_store_pd(&rres[j2],
_mm_add_pd(_mm_mul_pd(m2, m1d), r2));
}
}
}
for (int kk = 0; kk < N; kk++) {
for (int jj = 0; jj < M; jj++) {
std::cout << "(" << kk << "," << jj << ")=" << res[kk][jj]
<< std::endl;
}
}
return 0;
}
但是这段代码无法正常运行：据我的理解，只有当数据为偶数个字节时才能使用 SIMD 指令，而矩阵是 3x3 的，因此一共有 9 个 double。有什么办法可以处理这种情况吗？