The collapse clause on OpenACC's loop directive only works sometimes. Why?

Date: 2017-05-15 17:56:03

Tags: nested-loops openacc

I have an MD code with 4 nested loops. Assuming the loops are linear, the complexity of the code is O(N^4).

With OpenACC, I can tell the compiler to fuse all of them into one by putting collapse(4) on my outer loop. However, this does not work for all values of N. See the cases below:

  • When N = 20 and I use collapse(2), it works fine.
  • When N = 20 and I use collapse(4), it works fine.
  • When N = 80 and I use collapse(2), it works fine.
  • When N = 80 and I use collapse(4), it does not work, and I get the following error:

Out of memory allocating 4045842432 bytes of device memory
Failing in Thread:1
total/free CUDA memory: 4291493888/2951409664
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 5.2
host:0x18bebe0 device:0x703dc0000 size:12 presentcount:0+1 line:22 name:s_species
...

What I am trying to say here is that when I increase the system size, I can no longer collapse down to 4 levels. What could be the reason?

Based on this observation, beyond a certain value of N I may not be able to collapse the loops at all, right?

Is it true that "the number of loops you can collapse depends on the size of the data"?
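To make the question concrete, below is a minimal, self-contained sketch of the pattern I am describing (a toy 4-deep loop nest with made-up bounds and a dummy array, not my actual MD kernel). With collapse(4) the compiler fuses all four loops into a single iteration space of N^4 iterations, which for N = 80 is already 40,960,000 iterations:

#include <stdio.h>

#define N 80                    /* toy problem size, not my real box/atom counts */

static float a[N][N][N][N];     /* ~164 MB for N = 80, just to give the loops work */

int main(void)
{
    /* collapse(4) fuses the four loops into one iteration space of
       N*N*N*N iterations, scheduled across gangs and vector lanes */
#pragma acc parallel loop collapse(4) independent gang vector copyout(a)
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            for (int k = 0; k < N; k++)
                for (int l = 0; l < N; l++)
                    a[i][j][k][l] = (float)(i + j + k + l);

    printf("a[1][2][3][4] = %f\n", a[1][2][3][4]);
    return 0;
}

The only difference from my real code (shown below) is that the loop bounds here are compile-time constants, whereas in the MD kernel they come from the box and atom data structures.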

Update with the code and the compiler feedback:

#pragma acc parallel loop collapse(4) independent gang vector \
                present(s_boxes_nAtoms[0:s_boxes_nTotalBoxes], \
                        s_boxes_nbrBoxes_array[0:s_boxes_nLocalBoxes * nNbrBoxes], \
                        s_atoms_r[0:fsize][0:3],\
                        s_atoms_f[0:fsize][0:3],\
                        s_atoms_U[0:fsize],\
                        pot_rhobar[0:maxTotalAtoms],\
                        pot_dfEmbed[0:maxTotalAtoms],\
                        pot_rho_values[-1:1],\
                        pot_phi_values[-1:1]) \
                copyin(nNbrBoxes, rCut2)
    for (int iBox = 0; iBox < s_boxes_nLocalBoxes; iBox++) {

        // loop over neighbor boxes of iBox (some may be halo boxes)
#pragma acc loop independent
        for (int jTmp = 0; jTmp < nNbrBoxes; jTmp++) {

            // loop over atoms in iBox
#pragma acc loop independent
            for (int _iOff = 0; _iOff < MAXATOMS; _iOff++) {
                // loop over atoms in jBox
#pragma acc loop independent
                for (int _jOff = 0;_jOff < MAXATOMS; _jOff++) {

                    int nIBox = s_boxes_nAtoms[iBox];
                    int jBox = s_boxes_nbrBoxes_array[iBox * nNbrBoxes + jTmp];
                    int nJBox = s_boxes_nAtoms[jBox];

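                    // each box has MAXATOMS slots; skip slots beyond the real atom counts of iBox and jBox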
                    if(_iOff >= nIBox || _jOff >= nJBox)
                        continue;

                    int iOff = MAXATOMS * iBox + _iOff;
                    int jOff = MAXATOMS * jBox + _jOff;

                    real_t r2 = 0.0;
                    real3 dr;

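                    // displacement vector between atoms iOff and jOff and its squared length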
                    dr[0] = s_atoms_r[iOff][0] - s_atoms_r[jOff][0];
                    dr[1] = s_atoms_r[iOff][1] - s_atoms_r[jOff][1];
                    dr[2] = s_atoms_r[iOff][2] - s_atoms_r[jOff][2];
                    r2 += dr[0] * dr[0];
                    r2 += dr[1] * dr[1];
                    r2 += dr[2] * dr[2];

                    if (r2 <= rCut2 && r2 > 0.0) {

                        real_t r = sqrt(r2);

                        real_t rhoTmp, dRho;
                        interpolate_acc(pot_rho_values, pot_rho_x0,
                                pot_rho_invDx, pot_rho_n, r, &rhoTmp, &dRho);

                        r = 1.0f / r;

                        s_atoms_f[iOff][0] -= (pot_dfEmbed[iOff]
                                + pot_dfEmbed[jOff]) * dRho * dr[0] * r;
                        s_atoms_f[iOff][1] -= (pot_dfEmbed[iOff]
                                + pot_dfEmbed[jOff]) * dRho * dr[1] * r;
                        s_atoms_f[iOff][2] -= (pot_dfEmbed[iOff]
                                + pot_dfEmbed[jOff]) * dRho * dr[2] * r;

                    }

                } // loop over atoms in jBox
            } // loop over atoms in iBox
        } // loop over neighbor boxes
    } // loop over local boxes

The runtime feedback is:

Out of memory allocating 258933915648 bytes of device memory
Failing in Thread:1
total/free CUDA memory: 4291493888/2885210112
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 5.2
host:0x2502be0 device:0x703dc0000 size:12 presentcount:0+1 line:22 name:s_species
host:0x2504b00 device:0x733b81000 size:2012 presentcount:0+1 line:-1 name:(null)
host:0x25052f0 device:0x733b80800 size:2012 presentcount:1+1 line:-1 name:(null)
host:0x2505ae0 device:0x733b80000 size:2012 presentcount:1+1 line:-1 name:(null)
host:0x7f967d686010 device:0x7306c0000 size:55296000 presentcount:1+1 line:249 name:pot_rhobar
host:0x7f9680b43010 device:0x72d200000 size:55296000 presentcount:1+1 line:248 name:pot_dfEmbed
host:0x7f96883df010 device:0x703fc0000 size:23328000 presentcount:1+1 line:28 name:s_boxes_nbrBoxes
host:0x7f9694038010 device:0x7232c0000 size:55296000 presentcount:1+1 line:45 name:s_atoms_U
host:0x7f96974f5010 device:0x719480000 size:165888000 presentcount:1+1 line:45 name:s_atoms_f
host:0x7f96a132a010 device:0x70f640000 size:165888000 presentcount:0+1 line:45 name:s_atoms_p
host:0x7f96ab15f010 device:0x705800000 size:165888000 presentcount:1+1 line:45 name:s_atoms_r
host:0x7f96b4f94010 device:0x726880000 size:55296000 presentcount:0+1 line:45 name:s_atoms_iSpecies
host:0x7f96b8451010 device:0x729d40000 size:55296000 presentcount:0+1 line:45 name:s_atoms_gid
host:0x7f96bda84010 device:0x726780000 size:864000 presentcount:1+1 line:45 name:s_boxes_nAtoms
host:0x7ffe3146bc60 device:0x703dc0400 size:4 presentcount:1+0 line:418 name:nNbrBoxes
host:0x7ffe3146bd2c device:0x703dc0200 size:4 presentcount:1+0 line:418 name:rCut2
allocated block device:0x703dc0000 size:512 thread:1
allocated block device:0x703dc0200 size:512 thread:1
allocated block device:0x703dc0400 size:512 thread:1
allocated block device:0x703fc0000 size:23328256 thread:1
allocated block device:0x705800000 size:165888000 thread:1
allocated block device:0x70f640000 size:165888000 thread:1
allocated block device:0x719480000 size:165888000 thread:1
allocated block device:0x7232c0000 size:55296000 thread:1
allocated block device:0x726780000 size:864256 thread:1
allocated block device:0x726880000 size:55296000 thread:1
allocated block device:0x729d40000 size:55296000 thread:1
allocated block device:0x72d200000 size:55296000 thread:1
allocated block device:0x7306c0000 size:55296000 thread:1
allocated block device:0x733b80000 size:2048 thread:1
allocated block device:0x733b80800 size:2048 thread:1
allocated block device:0x733b81000 size:2048 thread:1
call to cuMemAlloc returned error 2: Out of memory

And here is the compiler feedback:

pgcc  -c99  -acc -ta=nvidia -Minfo=accel -DSINGLE -DNDEBUG -mp -O3 -fast -Minline -Munroll=c:4 -Mlre -Mpre -Mprefetch   -c eam.c -o eam.o
eamForce:
    248, Generating enter data copyin(pot_dfEmbed[:maxTotalAtoms])
    249, Generating enter data copyin(pot_rhobar[:maxTotalAtoms])
    302, Generating present(pot_dfEmbed[:maxTotalAtoms],pot_rhobar[:maxTotalAtoms],s_atoms_U[:fsize],s_atoms_f[:fsize][:])
         Accelerator kernel generated
         Generating Tesla code
        309, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    317, Generating copyin(nNbrBoxes)
         Generating present(pot_phi_values[-1],pot_rho_values[-1],pot_rhobar[:maxTotalAtoms])
         Generating copyin(rCut2)
         Generating present(s_atoms_U[:fsize],s_atoms_f[:fsize][:],s_atoms_r[:fsize][:],s_boxes_nAtoms[:s_boxes_nTotalBoxes],s_boxes_nbrBoxes_array[:nNbrBoxes*s_boxes_nLocalBoxes])
         Accelerator kernel generated
         Generating Tesla code
        317, Generating reduction(+:etot)
        327, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
        332, #pragma acc loop seq
        338, #pragma acc loop seq collapse(2)
        341,   collapsed */
    332, Scalar last value needed after loop for etot at line 485
    338, Scalar last value needed after loop for etot at line 485
    341, Scalar last value needed after loop for etot at line 485
    384, Generating present(pot_dfEmbed[:maxTotalAtoms],pot_f_values[-1],pot_rhobar[:maxTotalAtoms],s_atoms_U[:fsize],s_atoms_f[:fsize][:],s_boxes_nAtoms[:s_boxes_nTotalBoxes],s_boxes_nbrBoxes_array[:nNbrBoxes*s_boxes_nLocalBoxes])
         Accelerator kernel generated
         Generating Tesla code
        384, Generating reduction(+:etot)
        393, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
        398, #pragma acc loop seq
         398, Loop is parallelizable
    414, Generating update self(pot_dfEmbed[:maxTotalAtoms])
    422, Generating copyin(nNbrBoxes)
         Generating present(pot_dfEmbed[:maxTotalAtoms],pot_phi_values[-1],pot_rho_values[-1],pot_rhobar[:maxTotalAtoms])
         Generating copyin(rCut2)
         Generating present(s_atoms_U[:fsize],s_atoms_f[:fsize][:],s_atoms_r[:fsize][:],s_boxes_nAtoms[:s_boxes_nTotalBoxes],s_boxes_nbrBoxes_array[:nNbrBoxes*s_boxes_nLocalBoxes])
         Generating update device(pot_dfEmbed[:maxTotalAtoms])
         Accelerator kernel generated
         Generating Tesla code
        433, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
        438, #pragma acc loop seq
        444, #pragma acc loop seq
        448, #pragma acc loop seq
    438, Loop is parallelizable
    444, Loop is parallelizable
    448, Loop is parallelizable
pgcc  -c99  -acc -ta=nvidia -Minfo=accel -DSINGLE -DNDEBUG -mp -O3 -fast -Minline -Munroll=c:4 -Mlre -Mpre -Mprefetch   -o ../bin/CoMD-acc ljForce.o haloExchange.o linkCells.o yamlOutput.o performanceTimers.o timestep.o cmdLineParser.o decomposition.o mycommand.o initAtoms.o CoMD.o parallel.o eam.o random.o -lm 

0 Answers:

No answers yet.