我有以下C代码。第一部分只是从标准输入中读入一个复数矩阵,称为M。
有趣的部分是第二部分。
#include <stdio.h>
#include <complex.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
/*
 * Reads a square matrix of complex numbers from standard input, echoes it
 * to standard output, then prints one complex value derived from it
 * (see Part 2 below).
 *
 * Input format: two integers (rows and columns, which must be equal and
 * positive) followed by rows*columns entries written as "<re><+|-><im>i".
 *
 * Example input:
 *   2 2
 *   1+2i 2+3i
 *   3+4i 4+5i
 *
 * Returns EXIT_SUCCESS on success, or EXIT_FAILURE (with a message on
 * stderr) on malformed input or allocation failure.
 */
int main(void) {
    int status = EXIT_FAILURE;
    int n, m;

    /* Validate the header explicitly: assert() disappears under NDEBUG and
       is not meant for checking user input. */
    if (scanf("%d %d", &n, &m) != 2) {
        fprintf(stderr, "error: expected two integer dimensions\n");
        return EXIT_FAILURE;
    }
    if (n != m || n <= 0) {
        fprintf(stderr, "error: matrix must be square with positive size (got %d x %d)\n", n, m);
        return EXIT_FAILURE;
    }

    /* All working storage lives on the heap so that a large n cannot
       overflow the stack; calloc also guards the count*size product. */
    complex float (*M)[n] = calloc((size_t)n, sizeof *M);
    int *f = calloc((size_t)n, sizeof *f);
    complex float *delta = calloc((size_t)n, sizeof *delta);
    complex float *v = calloc((size_t)n, sizeof *v);
    if (M == NULL || f == NULL || delta == NULL || v == NULL) {
        fprintf(stderr, "error: out of memory for a %d x %d matrix\n", n, n);
        goto cleanup;
    }

    /* Part 1. Read the matrix, one "<re><+|-><im>i" token per entry. */
    for (int r = 0; r < n; r++) {
        for (int c = 0; c < n; c++) {
            float re, im;
            if (scanf("%f%fi", &re, &im) != 2) {
                fprintf(stderr, "error: malformed matrix entry at row %d, column %d\n", r, c);
                goto cleanup;
            }
            M[r][c] = re + im * I;
        }
    }

    /* Echo the matrix back, one row per line. */
    for (int r = 0; r < n; r++) {
        for (int c = 0; c < n; c++) {
            printf("%.2f%+.2fi ", creal(M[r][c]), cimag(M[r][c]));
        }
        printf("\n");
    }

    /* Part 2. M is now an n by n matrix of complex numbers.
       NOTE(review): this looks like a Glynn-style evaluation of the matrix
       permanent with Gray-code ordering of the sign vector (the build line
       names the file permanent-in-c.c) - confirm with the author. */
    int s = 1;
    complex float p = 1;

    /* v[i] starts as the sum of column i; p starts as the product of those
       sums. f is the index table that drives the loop below. */
    for (int i = 0; i < n; i++) {
        v[i] = 0;
        for (int j = 0; j < n; j++) {
            v[i] += M[j][i];
        }
        p *= v[i];
        f[i] = i;
        delta[i] = 1;
    }

    int j = 0;
    while (j < n - 1) {
        complex float prod = 1.;

        /* Flip the contribution of row j in every column sum and rebuild
           the product of the updated sums. */
        for (int i = 0; i < n; i++) {
            v[i] -= 2. * delta[j] * M[j][i];
            prod *= v[i];
        }
        delta[j] = -delta[j];
        s = -s;
        p += s * prod;

        /* Pick the next row to flip from the index table. */
        f[0] = 0;
        f[j] = f[j + 1];
        f[j + 1] = j + 1;
        j = f[0];
    }

    /* Scale the accumulated sum by 2^(n-1) once, then print both parts. */
    const complex double scaled = p / pow(2., (n - 1));
    printf("%f + i%f\n", creal(scaled), cimag(scaled));
    status = EXIT_SUCCESS;

cleanup:
    free(v);
    free(delta);
    free(f);
    free(M);
    return status;
}
我使用gcc -fopt-info-vec-all -O3 -ffast-math -march=bdver2 permanent-in-c.c -lm
进行编译。这向我解释了为什么几乎没有循环被矢量化。
对性能最重要的部分是第47-50行:
/* Performance-critical inner loop quoted from the program above: for each
   column i it subtracts 2*delta[j]*M[j][i] from v[i] in place, then folds
   the updated v[i] into the running product prod. */
for (i = 0; i < n; i++) {
v[i] -= 2.*delta[j]*M[j][i];
/* prod depends on its own previous value: a multiplicative reduction */
prod *= v[i];
}
gcc告诉我:
permanent-in-c.c:47:7: note: reduction used in loop.
permanent-in-c.c:47:7: note: Unknown def-use cycle pattern.
permanent-in-c.c:47:7: note: reduction used in loop.
permanent-in-c.c:47:7: note: Unknown def-use cycle pattern.
permanent-in-c.c:47:7: note: Unsupported pattern.
permanent-in-c.c:47:7: note: not vectorized: unsupported use in stmt.
permanent-in-c.c:47:7: note: unexpected pattern.
[...]
permanent-in-c.c:48:26: note: SLP: step doesn't divide the vector-size.
permanent-in-c.c:48:26: note: Unknown alignment for access: IMAGPART_EXPR <*M.4_40[j_202]{lb: 0 sz: pretmp_291 * 4}[i_200]>
permanent-in-c.c:48:26: note: SLP: step doesn't divide the vector-size.
permanent-in-c.c:48:26: note: Unknown alignment for access: REALPART_EXPR <*M.4_40[j_202]{lb: 0 sz: pretmp_291 * 4}[i_200]>
[...]
permanent-in-c.c:48:26: note: Build SLP failed: unrolling required in basic block SLP
permanent-in-c.c:48:26: note: Failed to SLP the basic block.
permanent-in-c.c:48:26: note: not vectorized: failed to find SLP opportunities in basic block.
如何解决阻止这部分被矢量化的问题?
奇怪的是这部分是矢量化的,但我不确定原因:
for (j = 0; j <n; j++) {
v[i] += M[j][i];
gcc -fopt-info-vec-all -O3 -ffast-math -march=bdver2 permanent-in-c.c -lm的完整输出位于https://bpaste.net/show/18ebc3d66a53。
答案 0 :(得分:3)
我想我可能已经弄明白了。经过大量的试验/错误后,很明显gcc内置的矢量化优化多少有些硬编码,它不能正确理解复数。我在代码中做了一些更改,并通过gcc输出确认了内部性能敏感循环的向量化(虽然我不确定所得结果在计算上与你想要的结果相同)。虽然我对您希望代码执行的操作理解有限,但结论是:如果您分别计算实部和虚部,它就可以正常工作。看看:
/* Variant of the main loop that updates v[i] through separate real and
   imaginary temporaries instead of one complex expression. It computes the
   same value as v[i] -= 2.*delta[j]*M[j][i]. */
float t_r = 0.0, t_im = 0.0; // two new temporaries
while (j < n-1) {
    prod = 1.;
    // delta[j] is not modified inside the inner loop, so split it once here
    const float d_r = creal(delta[j]);
    const float d_im = cimag(delta[j]);
    for (i = 0; i < n; i++) {
        const float m_r = creal(M[j][i]);
        const float m_im = cimag(M[j][i]);
        // Complex product expanded by parts:
        // (d_r + i*d_im)*(m_r + i*m_im)
        //   = (d_r*m_r - d_im*m_im) + i*(d_r*m_im + d_im*m_r)
        t_r = creal(v[i]) - 2. * (d_r * m_r - d_im * m_im);
        t_im = cimag(v[i]) - 2. * (d_r * m_im + d_im * m_r);
        // t_im is a plain float; multiply by I only when recombining,
        // otherwise the imaginary part is lost in the assignment
        v[i] = t_r + t_im * I;
        prod *= v[i];
    }
    delta[j] = -delta[j];
    s = -s;
    p += s*prod;
    f[0] = 0;
    f[j] = f[j+1];
    f[j+1] = j+1;
    j = f[0];
}
答案 1 :(得分:-1)
优化程序日志清楚地表明
访问的未知对齐方式:...
尝试向量化时
printf("%.2f%+.2fi ", creal(M[c][d]), cimag(M[c][d])); //24
v[i] += M[j][i]; //38
p *= v[i]; //40
v[i] -= 2.*delta[j]*M[j][i]; //48
您似乎需要在内存中强制对齐数组M、delta和v。
GCC中的自动矢量化
仅处理对齐的内存访问(不要尝试对包含未对齐访问的循环进行向量化)
正如之前的评论中提到的,我建议你为此目的使用posix_memalign。
complex float * restrict delta;
posix_memalign(&delta, 64, n * sizeof *delta); //to adapt
您的目标环境是什么? (OS,CPU)