我正在尝试使用英特尔编译器(ICC)为Xeon Phi协处理器编写一个简单的缩减代码。但是,我的代码有两个问题:第一个问题是它产生错误的结果并且它比串行解决方案慢。我用这个选项编译了代码(mpi_reduce.c是文件名):
icc mpi_reduce.c -V -openmp -o mpi_reduce.out
这是我的代码:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
float * _Cilk_shared data; //pointer to “shared” memory
_Cilk_shared float MIC_OMPReduction(int size)
{
int i;
#ifdef __MIC__
float Result;
int nThreads = 32;
omp_set_num_threads(nThreads);
#pragma omp parallel for reduction(+:Result)
for (i=0; i<size; ++i)
{
Result += data[i];
}
return Result;
#else
printf("Intel(R) Xeon Phi(TM) Coprocessor not available\n");
#endif
return 0.0f;
}
float reduction_serial(int size)
{
float ret = 0;
int i;
for ( i=0; i<size; ++i)
{
ret += data[i];
}
return ret;
}
int main()
{
struct timeval tv1, tv2;
int sec, usec;
int i;
float result_serial, result_parallel;
size_t size = 1*1e6;
int n_bytes = size*sizeof(float);
data = (_Cilk_shared float *)_Offload_shared_malloc (n_bytes);
printf("begin computation for size: %d \n",size);
for (i=0; i<size; ++i)
{
data[i] = i%10;
}
gettimeofday(&tv1,NULL);
result_serial = reduction_serial(size);
gettimeofday(&tv2,NULL);
sec = (int) (tv2.tv_sec-tv1.tv_sec);
usec = (int) (tv2.tv_usec-tv1.tv_usec);
if (usec < 0){
sec--;
usec += 1000000;
}
printf("reduction_serial: %f sec\n",sec+usec/1000000.0);
printf("reduction_serial result = %f \n",result_serial);
gettimeofday(&tv1,NULL);
result_parallel = _Cilk_offload MIC_OMPReduction(size);
gettimeofday(&tv2,NULL);
sec = (int) (tv2.tv_sec-tv1.tv_sec);
usec = (int) (tv2.tv_usec-tv1.tv_usec);
if (usec < 0){
sec--;
usec += 1000000;
}
printf("reduction_parallel: %f sec\n",sec+usec/1000000.0);
printf("reduction_parallel result = %f \n",result_parallel);
_Offload_shared_free(data);
return 0;
}
这是我代码的输出:
begin computation for size: 1000000
reduction_serial: 0.000239 sec
reduction_serial result = 4500000.000000
reduction_parallel: 0.461872 sec
reduction_parallel result = 4513334.000000
我注意到当我在没有-openmp选项的情况下编译代码时,并行代码的结果是正确的,但是使用-openmp选项时结果是错误的。