依赖性使用自动矢量化和sse加速数据大小

时间:2012-02-13 09:30:11

标签: caching sse vectorization icc

我正在尝试使用英特尔编译器中的自动矢量化并使用sse来加速某些代码。所有计算都是将struct node_t变换为另一个struct w_t(函数tr()和gen_tr())。但当我尝试对函数gen_tr()进行向量化时,没有产生任何效果。

如果更改数据存储格式,当每个结构组件存储在不同的浮点数组中,那么自动向量化效果很好,请参见函数genv_tr()。

使用sse的函数称为ssev_tr(N应该均匀地除以4)。

transform.c:

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>

/*
 * Read the CPU time-stamp counter (x86 RDTSC) as a 64-bit cycle count.
 * RDTSC places the low 32 bits in EAX and the high 32 bits in EDX;
 * the two halves are recombined below.
 *
 * FIX: use __asm__ __volatile__ instead of the bare `asm`/`volatile`
 * spelling — the bare keywords are GNU extensions that fail to compile
 * under strict ISO modes such as -std=c11; the double-underscore forms
 * work in every mode.
 */
static __inline__ unsigned long getCC(void)
{
    unsigned lo, hi;
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
    return ((unsigned long)lo) | (((unsigned long)hi) << 32);
}

/* Input element (array-of-structures layout): five packed floats.
   sizeof(node_t) is 20 bytes, so consecutive elements start at a
   non-power-of-two stride — this is the layout the question shows
   failing to auto-vectorize in gen_tr(). */
typedef struct {
    float x1, x2, x3, x4, x5;
} node_t;

/* Output element: four packed floats (16 bytes). */
typedef struct {
    float w1, w2, w3, w4;
} w_t;

/*
 * Transform one node into one output element:
 *   w1/w2 = x1 -/+ x3*c1
 *   w3/w4 = x2 -/+ x4*c2
 * (x5 is not read by the transform.)
 *
 * IMPROVEMENT: restrict documents that n and w never alias (in this
 * program they come from separate malloc() blocks), giving the
 * optimizer the no-overlap guarantee in standard C.
 */
void tr(node_t * restrict n, float c1, float c2, w_t * restrict w)
{
    const float nv = n->x1;
    const float N00T = n->x3 * c1;

    const float n1v = n->x2;
    const float N01T = n->x4 * c2;

    w->w1 = nv  - N00T;
    w->w2 = nv  + N00T;
    w->w3 = n1v - N01T;
    w->w4 = n1v + N01T;
}

/*
 * Benchmark kernel, array-of-structures version: applies tr() to each of
 * the N node_t elements, writing N w_t results.
 *
 * Each iteration reads 4 of the 5 floats of a 20-byte node_t and writes
 * a 16-byte w_t, so the loads advance with a stride of 5 floats — the
 * interleaved layout the question reports ICC refusing to vectorize,
 * despite the alignment/independence hints below.
 *
 * noinline keeps the benchmark loop in main() from being folded away.
 */
__attribute__ ((noinline))
void gen_tr(node_t *n, w_t *w, const int N, float c1, float c2)
{
    int i;
    /* ICC-specific hints: assume aligned accesses, ignore assumed
       cross-iteration dependences. */
    #pragma vector aligned
    #pragma ivdep
    for (i = 0; i < N; i++) {
        tr(n + i, c1, c2, w + i);
    }
}

/*
 * Structure-of-arrays version of gen_tr(): each field lives in its own
 * contiguous float array, giving the compiler unit-stride loads/stores —
 * the layout the question reports auto-vectorizing well.
 *
 * IMPROVEMENT: restrict tells the compiler the nine arrays do not alias
 * (they are separate malloc() blocks in main()); this is the standard-C
 * equivalent of the ICC-only "#pragma ivdep" hint and is fully
 * backward-compatible for callers.  x5 is accepted for signature parity
 * with the data layout but never read.
 */
__attribute__ ((noinline))
void genv_tr(float * restrict x1, float * restrict x2, float * restrict x3, float * restrict x4, float * restrict x5, float * restrict w1, float * restrict w2, float * restrict w3, float * restrict w4, const int N, float c1, float c2)
{
    int i;
    (void)x5;   /* unused; silences -W/-Wall unused-parameter warnings */
    #pragma vector aligned
    #pragma ivdep
    for (i = 0; i < N; i++) {
        const float N00T = x3[i] * c1;   /* scaled third component  */
        const float N01T = x4[i] * c2;   /* scaled fourth component */

        /* butterfly: w1/w2 = x1 -/+ N00T, w3/w4 = x2 -/+ N01T */
        w1[i] = x1[i] - N00T;
        w2[i] = x1[i] + N00T;
        w3[i] = x2[i] - N01T;
        w4[i] = x2[i] + N01T;
    }
}

/*
 * Hand-vectorized SSE version of genv_tr().  N is expected to be a
 * multiple of 4; any tail of fewer than 4 elements is left untouched
 * (identical to the original N/4-iteration loop).
 *
 * FIX: the original cast float* to __m128* and dereferenced it, which
 * (a) requires 16-byte alignment that malloc() does not portably
 * guarantee, and (b) violates strict aliasing.  _mm_loadu_ps /
 * _mm_storeu_ps make every access well-defined regardless of alignment;
 * results are bit-identical.  (Unaligned ops can be marginally slower
 * on older CPUs, but correctness comes first.)
 */
__attribute__ ((noinline))
void ssev_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    (void)x5;   /* unused; kept for signature parity with genv_tr() */

    const __m128 cs1 = _mm_set1_ps(c1);   /* c1 broadcast to 4 lanes */
    const __m128 cs2 = _mm_set1_ps(c2);   /* c2 broadcast to 4 lanes */

    int i;
    for (i = 0; i + 4 <= N; i += 4) {
        const __m128 v1   = _mm_loadu_ps(x1 + i);
        const __m128 v2   = _mm_loadu_ps(x2 + i);
        const __m128 N00T = _mm_mul_ps(_mm_loadu_ps(x3 + i), cs1);
        const __m128 N01T = _mm_mul_ps(_mm_loadu_ps(x4 + i), cs2);

        _mm_storeu_ps(w1 + i, _mm_sub_ps(v1, N00T));
        _mm_storeu_ps(w2 + i, _mm_add_ps(v1, N00T));
        _mm_storeu_ps(w3 + i, _mm_sub_ps(v2, N01T));
        _mm_storeu_ps(w4 + i, _mm_add_ps(v2, N01T));
    }
}

/*
 * test(func): benchmark an AoS-layout kernel called as
 * func(x, w, n, c1, c2).  Re-initializes the node_t array x, runs the
 * kernel `rep` times, and prints TSC cycles divided by (n * rep),
 * i.e. average cycles per array element per pass.
 * Expands in the caller's scope and uses its i, x, w, n, rep, c1, c2,
 * t1, t2 variables (declared in main()).
 */
#define test(func) \
    for (i = 0; i < n; i++) { \
        x[i].x1 = 1.0; \
        x[i].x2 = 2.0; \
        x[i].x3 = 2.0; \
        x[i].x4 = 2.0; \
        x[i].x5 = 2.0; \
    } \
    \
    t1 = getCC(); \
    for (i = 0; i < rep; i++) { \
        func(x, w, n, c1, c2); \
    } \
    t2 = getCC(); \
    printf("\t%f", ((double)(t2 - t1)) / n / rep);

/*
 * test1(func): benchmark an SoA-layout kernel called as
 * func(x1..x5, w1..w4, n, c1, c2).  Same protocol as test(): fill the
 * five input arrays, time `rep` kernel invocations with the TSC, and
 * print average cycles per element per pass.
 * Expands in the caller's scope and uses its i, x1..x5, w1..w4, n, rep,
 * c1, c2, t1, t2 variables (declared in main()).
 */
#define test1(func) \
    for (i = 0; i < n; i++) { \
        x1[i] = 1.0; \
        x2[i] = 2.0; \
        x3[i] = 2.0; \
        x4[i] = 2.0; \
        x5[i] = 2.0; \
    } \
    \
    t1 = getCC(); \
    for (i = 0; i < rep; i++) { \
        func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2); \
    } \
    t2 = getCC(); \
    printf("\t%f", ((double)(t2 - t1)) / n / rep);

/*
 * Driver: parse the vector size from argv[1], allocate AoS and SoA
 * buffers, and time the three kernel variants via the test/test1 macros.
 * Output: "<n>\t<gen_tr>\t<genv_tr>\t<ssev_tr>\n" (cycles per element).
 */
int main(int argc, char *argv[])
{
    /* BUG FIX: the original printed usage but fell through and called
       atoi(argv[1]) on a NULL pointer when no argument was given. */
    if (argc < 2) {
        printf("Usage %s vector_size\n", argv[0]);
        return 1;
    }
    int n = atoi(argv[1]);
    /* BUG FIX: n == 0 divided by zero in the rep computation below;
       negative n made the malloc sizes nonsensical. */
    if (n <= 0) {
        printf("vector_size must be a positive integer\n");
        return 1;
    }
    printf("%d", n);
    int rep = 100000000 / n;   /* keep total element-updates roughly constant */
    int i;
    float c1 = 2.0, c2 = 1.0;
    unsigned long t1, t2;

    /* Array-of-structures buffers for gen_tr(). */
    node_t *x = (node_t*)malloc(n * sizeof(node_t));
    w_t *w = (w_t*)malloc(n * sizeof(w_t));

    /* Structure-of-arrays buffers for genv_tr()/ssev_tr(). */
    float *x1 = (float*)malloc(n * sizeof(float));
    float *x2 = (float*)malloc(n * sizeof(float));
    float *x3 = (float*)malloc(n * sizeof(float));
    float *x4 = (float*)malloc(n * sizeof(float));
    float *x5 = (float*)malloc(n * sizeof(float));

    float *w1 = (float*)malloc(n * sizeof(float));
    float *w2 = (float*)malloc(n * sizeof(float));
    float *w3 = (float*)malloc(n * sizeof(float));
    float *w4 = (float*)malloc(n * sizeof(float));

    /* BUG FIX: the original never checked for allocation failure. */
    if (!x || !w || !x1 || !x2 || !x3 || !x4 || !x5 ||
        !w1 || !w2 || !w3 || !w4) {
        printf("out of memory\n");
        return 1;
    }

    test(gen_tr);
    test1(genv_tr);
    test1(ssev_tr);

    printf("\n");

    free(x);  free(w);
    free(x1); free(x2); free(x3); free(x4); free(x5);
    free(w1); free(w2); free(w3); free(w4);
    return 0;
}

编译选项:icc -O3 -Wall -W -vec-report6 transform.c -o transform

icc版本 - 12.1.2,操作系统 - Fedora 16 x86_64,CPU - Intel Core2 Quad CPU Q8200。

然后我用16到3000的不同大小运行它,步骤64,这里是脚本:

#!/bin/bash

echo "" > run.log

for ((c=16;c<3000;c+=64))
do
./transform $c | tee -a run.log
done

下面是这个脚本的部分运行结果(列依次为 size、gen_tr、genv_tr、ssev_tr),每列给出的是平摊到每个数组元素的时钟周期数:

16      7.710743        3.168577        3.253829
272     7.166493        1.983918        2.618569
528     7.121866        1.920195        2.567109
784     7.115007        1.899451        2.549645
1040    8.104026        2.481062        2.944317
1296    8.137537        5.105032        5.104614
1552    8.118534        5.068812        5.064211
1808    8.138309        5.077831        5.085015
2064    8.149699        5.107503        5.069958
2320    8.164556        5.080981        5.099313
2576    8.151524        5.086056        5.089294
2832    8.212946        5.061927        5.072261

为什么在使用矢量化版本的函数时,它的大小是如此显着的变化?是否因为缓存未命中?是否可以在所有数据范围内保存相同的速度?

1 个答案:

答案 0 :(得分:1)

你有8个浮点数组。当它们的大小为1000时,您的测试正在处理大约32kB的数据。即使您的L1缓存可能稍大(64kB),由于关联性,L1缓存很可能无法同时保存所有32kB数据。

您的测试会反复进行,一遍又一遍地处理相同的数据。考虑两种情况:

  • 大小= 528 :8个阵列可以方便地放入L1缓存中。每次测试迭代(第一次除外)都可以快速访问数据。
  • 大小= 1296 :8个数组无法同时装入L1缓存。每次测试迭代都会不断地把数据从L1中驱逐出去,因此所有的读写操作实际上都落到了L2。

因此,输入大小在1000附近出现的性能跳变,部分是您的测试方式造成的假象(artifact),但并非完全如此。在现实世界中,如果所需的全部数据恰好已经在L1缓存中,那么genv_tr会非常快。但对于大小>1000的输入,所有数据无法装入L1缓存,因此部分访问必然会落到L2。