我第一次尝试使用英特尔C编译器,但我得到了完全错误的答案。我做错了什么?
我有一些代码如下:
#include <stdint.h>
#include <stdio.h>
/* Number of rows whose factors main multiplies together in 64-bit
 * arithmetic before folding the result into its 128-bit products. */
#define CHUNK_SIZE 12
/* Thread count passed to the num_threads clause of the OpenMP loop in main. */
#define NUM_THREADS 8
/* Short alias for the GCC builtin that counts the set bits of an
 * unsigned long long. */
#define popcnt __builtin_popcountll
/* 10^9: main splits the final value into limbs of this size for printing. */
#define BILLION (1000 * 1000 * 1000)
/* Calls update_row_pprod with the caller's locals. The expansion site must
 * have row_pprod, row, rows, row_sums, mask and mask_popcnt in scope. */
#define UPDATE_ROW_PPROD() \
update_row_pprod(row_pprod, row, rows, row_sums, mask, mask_popcnt)
/* Signed 128-bit integer, provided by the compiler's __int128 extension. */
typedef __int128 int128_t;
/*
 * Multiply the four running products in row_pprod by the factors derived
 * from one row, then report the index of the next row.
 *
 * With t = 2 * popcnt(rows[row] & mask) - mask_popcnt, the factors are:
 *   row_pprod[0] *= t
 *   row_pprod[1] *= t - 1
 *   row_pprod[2] *= t - 1 - row_sums[row]
 *   row_pprod[3] *= t - row_sums[row]
 *
 * row_pprod    four 64-bit accumulators, updated in place
 * row          index of the row to process
 * rows         per-row bit patterns (read only)
 * row_sums     per-row values subtracted for accumulators 2 and 3 (read only)
 * mask         bit mask applied to rows[row] before counting bits
 * mask_popcnt  value subtracted from twice the bit count (callers pass
 *              the popcount of mask)
 *
 * Returns row + 1.
 */
static inline int64_t update_row_pprod(
    int64_t* row_pprod, int64_t row, int64_t* rows,
    int64_t* row_sums, int64_t mask, int64_t mask_popcnt)
{
    const int64_t full = 2 * popcnt(rows[row] & mask) - mask_popcnt;

    row_pprod[0] *= full;
    row_pprod[1] *= full - 1;

    /* row_sums is read only after the first two accumulators are updated,
     * matching the original evaluation order. */
    const int64_t reduced = full - 1 - row_sums[row];

    row_pprod[2] *= reduced;
    row_pprod[3] *= reduced + 1;

    return row + 1;
}
/*
 * Reads one string of '+'/'-' characters per command-line argument,
 * accumulates a signed 128-bit total into `permanent` over every odd bit
 * mask below 2^(size-1), and prints that total in decimal.
 *
 * NOTE(review): going by the variable name, this is presumably the
 * permanent of the +/-1 matrix whose rows are the arguments -- confirm
 * against the algorithm's source.
 *
 * Returns 0 from the single-argument shortcut; otherwise control reaches
 * the end of main.
 */
int main(int argc, char* argv[])
{
/* One row per argument; rows and row_sums are variable-length arrays.
 * NOTE(review): with no arguments (argc == 1) the arrays have length zero
 * and the shift count in the mask loop below is negative -- both are
 * undefined behavior, so at least one argument is required. */
int64_t size = argc - 1, rows[argc - 1];
int64_t row_sums[argc - 1];
/* sign starts at -1 for an odd number of rows and +1 for an even number. */
int128_t permanent = 0, sign = size & 1 ? -1 : 1;
/* Single-argument shortcut: print -1 if the argument starts with '-',
 * otherwise print 1. */
if (argc == 2)
{
printf("%d\n", argv[1][0] == '-' ? -1 : 1);
return 0;
}
for (int64_t row = 0; row < size; row++)
{
/* `positive` is the character that will be encoded as a 1 bit: '-' when
 * the argument starts with '+', otherwise '+'. */
char positive = argv[row + 1][0] == '+' ? '-' : '+';
/* In ASCII ',' lies between '+' and '-', so this multiplies sign by +1
 * when positive is '+' and by -1 when positive is '-'. */
sign *= ',' - positive;
rows[row] = row_sums[row] = 0;
/* Scan the characters after the first: shift each one into rows[row] as a
 * bit and count how many equal `positive`. */
for (char* p = &argv[row + 1][1]; *p; p++)
{
rows[row] <<= 1;
rows[row] |= *p == positive;
row_sums[row] += *p == positive;
}
/* Turn the count of matching characters into 2 * count - size. */
row_sums[row] = 2 * row_sums[row] - size;
}
/* Parallel sum over all odd masks, using a '+' reduction on the 128-bit
 * accumulator. The accompanying discussion reports that icc 17.0.1 with
 * -qopenmp mis-initializes the private copies of this reduction variable. */
#pragma omp parallel for reduction(+:permanent) num_threads(NUM_THREADS)
for (int64_t mask = 1; mask < 1LL << (size - 1); mask += 2)
{
int64_t mask_popcnt = popcnt(mask);
int64_t row = 0;
/* row_prod is +1 when the mask has an even number of set bits, -1 when
 * odd; the other three products start at the same value or its negation. */
int128_t row_prod = 1 - 2 * (mask_popcnt & 1);
int128_t row_prod_high = -row_prod;
int128_t row_prod_inv = row_prod;
int128_t row_prod_inv_high = -row_prod;
/* Process the rows in full chunks of CHUNK_SIZE: the per-row factors are
 * multiplied into 64-bit partial products, which are then folded into the
 * 128-bit products once per chunk. */
for (int64_t chunk = 0; chunk < size / CHUNK_SIZE; chunk++)
{
int64_t row_pprod[4] = {1, 1, 1, 1};
for (int64_t i = 0; i < CHUNK_SIZE; i++)
row = UPDATE_ROW_PPROD();
row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
}
/* Handle the rows left over after the last full chunk in the same way. */
int64_t row_pprod[4] = {1, 1, 1, 1};
while (row < size)
row = UPDATE_ROW_PPROD();
row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
/* This mask's contribution is the sum of the four products. */
permanent += row_prod + row_prod_high + row_prod_inv + row_prod_inv_high;
}
permanent *= sign;
/* printf has no 128-bit conversion, so print the sign by hand and continue
 * with the magnitude. */
if (permanent < 0)
printf("-"), permanent *= -1;
/* Split the magnitude into five base-10^9 limbs, least significant first;
 * `print` becomes 1 once a leading limb has been written. */
int32_t output[5], print = 0;
output[0] = permanent % BILLION, permanent /= BILLION;
output[1] = permanent % BILLION, permanent /= BILLION;
output[2] = permanent % BILLION, permanent /= BILLION;
output[3] = permanent % BILLION, permanent /= BILLION;
output[4] = permanent % BILLION;
/* Emit limbs from most to least significant: the first nonzero limb is
 * printed without padding, every limb after it is zero-padded to nine
 * digits, and the lowest limb is always printed. */
if (output[4])
printf("%u", output[4]), print = 1;
if (print)
printf("%09u", output[3]);
else if (output[3])
printf("%u", output[3]), print = 1;
if (print)
printf("%09u", output[2]);
else if (output[2])
printf("%u", output[2]), print = 1;
if (print)
printf("%09u", output[1]);
else if (output[1])
printf("%u", output[1]), print = 1;
if (print)
printf("%09u\n", output[0]);
else
printf("%u\n", output[0]);
}
如果我用以下命令编译它:
gcc -Wall -std=c99 -fopenmp -o permanent permanent.c
然后用以下命令运行它:
permanent -+ -+
并获取输出
-2
这是正确的。
如果我改用英特尔C编译器(17.0.1)进行编译:
icc -std=c99 -qopenmp -Wall permanent.c
然后再运行
a.out -+ -+
我得到了
11910984139051480114196905982
如评论中所述,如果删除-qopenmp,则icc会生成一个正确运行的版本,尽管只在一个核心上运行。
答案 0 :(得分:7)
这看起来像是英特尔编译器在处理__int128类型变量的OpenMP归约(reduction)时的一个错误。这很容易复现:
#include <stdio.h>
#include <inttypes.h>
/*
 * Minimal reproduction of the reduction problem: sums into a 128-bit
 * variable inside an OpenMP parallel-for whose loop never iterates, then
 * prints the low and high 64-bit halves of the result in hexadecimal.
 */
int main() {
__int128 sum = 0;
/* The loop condition 0 < 0 is false from the start, so the body never
 * runs and a correct reduction must leave sum at 0. */
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < 0; i++) {
sum += 1;
}
/* Low 64 bits first, then the high 64 bits. */
printf("%" PRIX64 " %" PRIX64 "\n", (uint64_t)sum, (uint64_t)(sum >> 64));
}
用英特尔编译器加-fopenmp编译时,输出为:
14000000000 78778300
不加-fopenmp,或者改用gcc -fopenmp编译时,则正确输出:
0 0
您可以使用一个简单的自定义归约(reduction)声明来绕过此问题(同时保留__int128类型):
#pragma omp declare reduction(add128: int128_t: omp_out = omp_out + omp_in) initializer(omp_priv = 0)
[...]
#pragma omp parallel for reduction(add128:permanent) num_threads(NUM_THREADS)
据我所知,OpenMP标准并不限制归约列表项的类型。如果某种类型不受支持,编译器至少应该给出提示。
答案 1 :(得分:5)
感谢提供测试用例,这确实是编译器在初始化归约(reduction)变量的私有副本时的一个错误。您当然可以自行报告该错误,不过我们也会尝试向相应的编译器开发人员提交内部错误报告。
因此,另一个(不太优雅的)解决方法是显式初始化私有副本,即替换
#pragma omp parallel for reduction(+:permanent) num_threads(NUM_THREADS)
为
#pragma omp parallel reduction(+:permanent) num_threads(NUM_THREADS)
{
permanent = 0; // that should be done by compiler actually
#pragma omp for
...
}