使用奇数字节的SSE指令

时间:2018-02-06 20:25:09

标签: c++ sse

我正在使用此代码:

#include <stdlib.h>
#include <stdio.h>
#include <emmintrin.h>
#include <iostream>
// Matrix dimensions: res is N x M, mul1 is N x C and mul2 is C x M.
#define N 3
#define M 3
#define C 3

// Number of doubles that fit in 64 bytes; main uses it as its loop step.
#define SM (64 / sizeof (double))

// Matrix storage, each array aligned to a 64-byte boundary. As namespace-scope
// objects without an initializer they start out zero-initialized.
alignas(64) double res[N][M];
alignas(64) double mul1[N][C];
alignas(64) double mul2[C][M];

int main(void) {
    for (int kk = 0; kk < N; kk++) {
        for (int jj = 0; jj < C; jj++) {
            mul1[kk][jj] = kk + jj;
        }
    }
    for (int kk = 0; kk < C; kk++) {
        for (int jj = 0; jj < M; jj++) {
            mul2[kk][jj] = kk + jj + 1;
        }
    }
    unsigned int i, i2, j, j2, k, k2;
    double *__restrict rres;
    double *__restrict rmul1;
    double *__restrict rmul2;
    for (i = 0; i < N; i += SM)
        for (j = 0; j < N; j += SM)
            for (k = 0; k < N; k += SM)
                for (i2 = 0, rres = &res[i][j], rmul1 = &mul1[i][k]; i2 < SM;
                        ++i2, rres += N, rmul1 += N) {
                    _mm_prefetch(&rmul1[8], _MM_HINT_NTA);
                    for (k2 = 0, rmul2 = &mul2[k][j]; k2 < SM;
                            ++k2, rmul2 += N) {
                        __m128d m1d = _mm_load_sd(&rmul1[k2]);
                        m1d = _mm_unpacklo_pd(m1d, m1d);
                        for (j2 = 0; j2 < SM; j2 += 2) {
                            __m128d m2 = _mm_load_pd(&rmul2[j2]);<--crash here
                            __m128d r2 = _mm_load_pd(&rres[j2]);
                            _mm_store_pd(&rres[j2],
                                    _mm_add_pd(_mm_mul_pd(m2, m1d), r2));
                        }
                    }
                }

    for (int kk = 0; kk < N; kk++) {
        for (int jj = 0; jj < M; jj++) {
            std::cout << "(" << kk << "," << jj << ")=" << res[kk][jj]
                    << std::endl;
        }
    }
    return 0;
}

但是代码不起作用(在上面标记的那一行崩溃),因为根据我的理解,只有当元素个数为偶数时才能使用 SIMD 指令。矩阵为 3x3,因此共有 9 个 double。有什么办法可以处理这种情况吗?

0 个答案:

没有答案