Question

我正在使用Xeon Phi通过Stampede处理Collatz猜想问题。我已经测试了我的代码已经过测试，并且对于高达100,000的值可以正常工作，但测试值高达100万，我几乎立即收到了分段错误（“SIGSEV”）。几天我一直在撞墙，但根本无法找出错误。真的很感激任何帮助。

typedef unsigned long long bigInt;

// Number to test to (starting from 1)
   #define bigSize     100000

typedef struct {
    int numSteps;
    bigInt stopPoint;
} batcher;

typedef struct {
    bigInt num;
    batcher to_batch;
} to_ret;
int main () {
    //Stores values as [num][#steps to smaller val][smaller val]
    to_ret retlist[bigSize];
    //Stores values as [#steps to smaller val][smaller val], sorted by num
    batcher results[bigSize];
    ...

    #pragma offload target(mic:0) inout(retlist) shared(retlist)
    {
        #pragma omp parallel for
        for(i = 1; i < bigSize; i++){
            retlist[i].num = i + 1;
            bigInt next = retlist[i].num;
            int count = 0;

            do {
                count++;

                if (next%2 == 1)
                    next=(3*next+1)/2;
                else
                    next/=2;

            } while(next > retlist[i].num);

            retlist[i].to_batch.numSteps = count;
            retlist[i].to_batch.stopPoint = next;
        }
    }

    ///Organizes data into a sorted array
    #pragma omp parallel for
    for (i = 0; i < bigSize; i++){
        results[retlist[i].num - 1] = retlist[i].to_batch;
    }
    ...
}

我非常有信心这个问题会出现在上面的代码段中。

Answer 1

以下代码正确编译：

不会溢出堆栈
不会使用一堆结构的typedef来混淆代码
不会将bigNum隐藏为无符号long long int。
确实包含索引变量的声明＆＃39; i＆＃39;

我无权访问优化编译指示，所以暂时将其注释掉。

//typedef unsigned long long bigInt;

// Number to test to (starting from 1)
#define bigSize     (100000)

struct batcher
{
    int numSteps;
    //bigInt stopPoint;
    unsigned long long stopPoint;
};

struct to_ret
{
    //bigInt num;
    unsigned long long num;
    struct batcher to_batch;
};

//Stores values as [num][#steps to smaller val][smaller val]
static struct to_ret retlist[bigSize];
//Stores values as [#steps to smaller val][smaller val], sorted by num
static struct batcher results[bigSize];

int main ()
{
    int i;
    // more code here

    ////#pragma offload target(mic:0) inout(retlist) shared(retlist)
    {
        ////#pragma omp parallel for
        for(i = 1; i < bigSize; i++)
        {
            retlist[i].num = i + 1;
            //bigInt next = retlist[i].num;
            unsigned long long next = retlist[i].num;
            int count = 0;

            do
            {
                count++;

                if (next%2 == 1)
                    next=(3*next+1)/2;
                else
                    next/=2;

            } while(next > retlist[i].num);

            retlist[i].to_batch.numSteps = count;
            retlist[i].to_batch.stopPoint = next;
        }
    }

    ///Organizes data into a sorted array
    ////#pragma omp parallel for
    for (i = 0; i < bigSize; i++){
        results[retlist[i].num - 1] = retlist[i].to_batch;
    }
    // more code here

    return(0);
} // end function: main

Answer 2

完整的代码可以在github here上找到，虽然虽然它仍然存在很多效率问题（可以使用矢量化支持），但我目前所处的是这个（利用这个建议）由barak-manos）：

typedef unsigned long long bigInt;

/// Number to test up to (starting from 1)
#define bigSize     1000000000 //340282366920938463463374607431768211455

typedef struct {
    int numSteps;
    bigInt stopPoint;
} batcher;

typedef struct {
    bigInt num;
    batcher to_batch;
} to_ret;

__attribute__((target(mic))) to_ret retlist[bigSize]; ///Stores values as [num][#steps to smaller val][smaller val]
__attribute__((target(mic))) batcher results[bigSize]; ///Stores values as [#steps to smaller val][smaller val] & is sorted by num


int main () {
    bigInt j;
    double start, end;

    retlist[0].num = 1; retlist[0].to_batch.numSteps = 0; retlist[0].to_batch.stopPoint = 1;
    start = omp_get_wtime();

    #pragma offload target(mic:0) out(results)
    {
        int count;
        bigInt i, next;

    #pragma omp parallel for
        for(i = 1; i < bigSize; i++){
            next = retlist[i].num = i + 1;
            count = 0;

            do {
                count++;

                if (next%2 == 1)
                    next=(3*next+1)/2;
                else
                    next/=2;

            } while(next > retlist[i].num);

            retlist[i].to_batch.numSteps = count;
            retlist[i].to_batch.stopPoint = next;
        }

    ///Organizes data into a sorted array
    #pragma omp parallel for
    for (i = 0; i < bigSize; i++){
        results[i] = retlist[i].to_batch;
    }
  }
...

    for(j = 0; j < bigSize; j++){
        results[j].numSteps += results[results[j].stopPoint-1].numSteps;
    }

    return(0);
}

如果有人有兴趣，请随意创建我项目的分支。

具有高值的分段错误（Xeon Phi）

2 个答案: