模块化算术和NTT(有限域DFT)优化

时间:2013-09-02 16:01:29

标签: c++ performance optimization modular-arithmetic ntt

我想使用NTT进行快速平方(参见Fast bignum square computation),但即使对于非常大的数字,结果也很慢..超过12000位。

所以我的问题是:

  1. 有没有办法优化我的NTT变换? 我并不是说通过并行(线程)加速它;这只是低层。
  2. 有没有办法加速我的模块化算术?
  3. 这是我在C ++ for NTT中的(已经优化的)源代码(它是完整的,100%在C ++中工作,不需要任何第三方库,也应该是线程安全的。请注意源数组用作临时的!!!,它也无法将数组转换为自身。)

    //---------------------------------------------------------------------------
    class fourier_NTT                                    // Number theoretic transform
        {
    
    public:
        DWORD r,L,p,N;
        DWORD W,iW,rN;
        fourier_NTT(){ r=0; L=0; p=0; W=0; iW=0; rN=0; }
    
        // main interface
        void  NTT(DWORD *dst,DWORD *src,DWORD n=0);               // DWORD dst[n] = fast  NTT(DWORD src[n])
        void INTT(DWORD *dst,DWORD *src,DWORD n=0);               // DWORD dst[n] = fast INTT(DWORD src[n])
    
        // Helper functions
        bool init(DWORD n);                                       // init r,L,p,W,iW,rN
        void  NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD w);    // DWORD dst[n] = fast  NTT(DWORD src[n])
    
        // Only for testing
        void  NTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w);    // DWORD dst[n] = slow  NTT(DWORD src[n])
        void INTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w);    // DWORD dst[n] = slow INTT(DWORD src[n])
    
        // DWORD arithmetics
        DWORD shl(DWORD a);
        DWORD shr(DWORD a);
    
        // Modular arithmetics
        DWORD mod(DWORD a);
        DWORD modadd(DWORD a,DWORD b);
        DWORD modsub(DWORD a,DWORD b);
        DWORD modmul(DWORD a,DWORD b);
        DWORD modpow(DWORD a,DWORD b);
        };
    
    //---------------------------------------------------------------------------
    void fourier_NTT:: NTT(DWORD *dst,DWORD *src,DWORD n)
        {
        if (n>0) init(n);
        NTT_fast(dst,src,N,W);
    //    NTT_slow(dst,src,N,W);
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT::INTT(DWORD *dst,DWORD *src,DWORD n)
        {
        if (n>0) init(n);
        NTT_fast(dst,src,N,iW);
        for (DWORD i=0;i<N;i++) dst[i]=modmul(dst[i],rN);
           //    INTT_slow(dst,src,N,W);
        }
    
    //---------------------------------------------------------------------------
    bool fourier_NTT::init(DWORD n)
        {
        // (max(src[])^2)*n < p else NTT overflow can ocur !!!
        r=2; p=0xC0000001; if ((n<2)||(n>0x10000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x30000000/n; // 32:30 bit best for unsigned 32 bit
    //    r=2; p=0x78000001; if ((n<2)||(n>0x04000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x3c000000/n; // 31:27 bit best for signed 32 bit
    //    r=2; p=0x00010001; if ((n<2)||(n>0x00000020)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x00000020/n; // 17:16 bit best for 16 bit
    //    r=2; p=0x0a000001; if ((n<2)||(n>0x01000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x01000000/n; // 28:25 bit
         N=n;                // size of vectors [DWORDs]
         W=modpow(r,    L);    // Wn for NTT
        iW=modpow(r,p-1-L);    // Wn for INTT
        rN=modpow(n,p-2  );    // scale for INTT
        return true;
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT:: NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD w)
        {
        if (n<=1) { if (n==1) dst[0]=src[0]; return; }
        DWORD i,j,a0,a1,n2=n>>1,w2=modmul(w,w);
        // reorder even,odd
        for (i=0,j=0;i<n2;i++,j+=2) dst[i]=src[j];
        for (    j=1;i<n ;i++,j+=2) dst[i]=src[j];
        // recursion
        NTT_fast(src   ,dst   ,n2,w2);    // even
        NTT_fast(src+n2,dst+n2,n2,w2);    // odd
        // restore results
        for (w2=1,i=0,j=n2;i<n2;i++,j++,w2=modmul(w2,w))
            {
            a0=src[i];
            a1=modmul(src[j],w2);
            dst[i]=modadd(a0,a1);
            dst[j]=modsub(a0,a1);
            }
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT:: NTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w)
        {
        DWORD i,j,wj,wi,a,n2=n>>1;
        for (wj=1,j=0;j<n;j++)
            {
            a=0;
            for (wi=1,i=0;i<n;i++)
                {
                a=modadd(a,modmul(wi,src[i]));
                wi=modmul(wi,wj);
                }
            dst[j]=a;
            wj=modmul(wj,w);
            }
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT::INTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w)
        {
        DWORD i,j,wi=1,wj=1,a,n2=n>>1;
        for (wj=1,j=0;j<n;j++)
            {
            a=0;
            for (wi=1,i=0;i<n;i++)
                {
                a=modadd(a,modmul(wi,src[i]));
                wi=modmul(wi,wj);
                }
            dst[j]=modmul(a,rN);
            wj=modmul(wj,iW);
            }
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::shl(DWORD a) { return (a<<1)&0xFFFFFFFE; }
    DWORD fourier_NTT::shr(DWORD a) { return (a>>1)&0x7FFFFFFF; }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::mod(DWORD a)
        {
        DWORD bb;
        for (bb=p;(DWORD(a)>DWORD(bb))&&(!DWORD(bb&0x80000000));bb=shl(bb));
        for (;;)
            {
            if (DWORD(a)>=DWORD(bb)) a-=bb;
            if (bb==p) break;
            bb =shr(bb);
            }
        return a;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modadd(DWORD a,DWORD b)
        {
        DWORD d,cy;
        a=mod(a);
        b=mod(b);
        d=a+b;
        cy=(shr(a)+shr(b)+shr((a&1)+(b&1)))&0x80000000;
        if (cy) d-=p;
        if (DWORD(d)>=DWORD(p)) d-=p;
        return d;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modsub(DWORD a,DWORD b)
        {
        DWORD d;
        a=mod(a);
        b=mod(b);
        d=a-b; if (DWORD(a)<DWORD(b)) d+=p;
        if (DWORD(d)>=DWORD(p)) d-=p;
        return d;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modmul(DWORD a,DWORD b)
        {    // b bez orezania !
        int i;
        DWORD d;
        a=mod(a);
        for (d=0,i=0;i<32;i++)
            {
            if (DWORD(a&1))    d=modadd(d,b);
            a=shr(a);
            b=modadd(b,b);
            }
        return d;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modpow(DWORD a,DWORD b)
        {    // a,b bez orezania !
        int i;
        DWORD d=1;
        for (i=0;i<32;i++)
            {
            d=modmul(d,d);
            if (DWORD(b&0x80000000)) d=modmul(d,a);
            b=shl(b);
            }
        return d;
        }
    //---------------------------------------------------------------------------
    

    我的NTT课程的使用示例:

    fourier_NTT ntt;
    const DWORD n=32
    DWORD x[N]={0,1,2,3,....31},y[N]={32,33,34,35,...63},z[N];
    
    ntt.NTT(z,x,N);    // z[N]=NTT(x[N]), also init constants for N
    ntt.NTT(x,y);    // x[N]=NTT(y[N]), no recompute of constants, use last N
    // modular convolution y[]=z[].x[]
    for (i=0;i<n;i++) y[i]=ntt.modmul(z[i],x[i]);
    ntt.INTT(x,y);    // x[N]=INTT(y[N]), no recompute of constants, use last N
    // x[]=convolution of original x[].y[]
    

    优化前的一些测量(非NTT类):

    a = 0.98765588997654321000 | 389*32 bits
    looped 1x times
    sqr1[ 3.177 ms ] fast sqr
    sqr2[ 720.419 ms ] NTT sqr
    mul1[ 5.588 ms ] simpe mul
    mul2[ 3.172 ms ] karatsuba mul
    mul3[ 1053.382 ms ] NTT mul
    

    优化后的一些测量(当前代码,较低的递归参数大小/数量,以及更好的模块化算法):

    a = 0.98765588997654321000 | 389*32 bits
    looped 1x times
    sqr1[ 3.214 ms ] fast sqr
    sqr2[ 208.298 ms ] NTT sqr
    mul1[ 5.564 ms ] simpe mul
    mul2[ 3.113 ms ] karatsuba mul
    mul3[ 302.740 ms ] NTT mul
    

    检查NTT mul和NTT sqr次数(我的优化速度提高了3倍以上)。这只是循环的1倍,所以它不是很精确(错误~10%),但即使现在加速也很明显(通常我循环1000x甚至更多,但我的NTT太慢了)。

    你可以自由使用我的代码......只需保留我的昵称和/或链接到这个页面(代码中的rem,readme.txt,about或者其他)。我希望它有所帮助......(我没有在任何地方看到快速NTT的C ++源码,所以我必须自己编写)。对所有已接受的N进行了统一根测试,参见fourier_NTT::init(DWORD n)函数。

    P.S。:有关NTT的更多信息,请参阅 https://stackoverflow.com/a/18547575/2521214 。此代码基于该链接中的帖子。

    [edit1:]代码中的进一步更改

    我设法进一步优化我的模块算法,通过利用模数素数总是0xC0000001并消除不必要的调用。由此产生的加速是惊人的(超过40倍),并且在大约1500 * 32位阈值之后NTT乘法比karatsuba快。顺便说一下,我的NTT的速度现在和64位双精度的优化DFFT相同。

    一些测量:

    a = 0.98765588997654321000 | 1553*32bits
    looped 10x times
    mul2[ 28.585 ms ] karatsuba mul
    mul3[ 26.311 ms ] NTT mul
    

    模块化算术的新源代码:

    //---------------------------------------------------------------------------
    DWORD fourier_NTT::mod(DWORD a)
        {
        if (a>p) a-=p;
        return a;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modadd(DWORD a,DWORD b)
        {
        DWORD d,cy;
        if (a>p) a-=p;
        if (b>p) b-=p;
        d=a+b;
        cy=((a>>1)+(b>>1)+(((a&1)+(b&1))>>1))&0x80000000;
        if (cy ) d-=p;
        if (d>p) d-=p;
        return d;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modsub(DWORD a,DWORD b)
        {
        DWORD d;
        if (a>p) a-=p;
        if (b>p) b-=p;
        d=a-b;
        if (a<b) d+=p;
        if (d>p) d-=p;
        return d;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modmul(DWORD a,DWORD b)
        {
        DWORD _a,_b,_p;
        _a=a;
        _b=b;
        _p=p;
        asm    {
            mov    eax,_a
            mov    ebx,_b
            mul    ebx        // H(edx),L(eax) = eax * ebx
            mov    ebx,_p
            div    ebx        // eax = H(edx),L(eax) / ebx
            mov    _a,edx    // edx = H(edx),L(eax) % ebx
            }
        return _a;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modpow(DWORD a,DWORD b)
        {    // b bez orezania!
        int i;
        DWORD d=1;
        if (a>p) a-=p;
        for (i=0;i<32;i++)
            {
            d=modmul(d,d);
            if (DWORD(b&0x80000000)) d=modmul(d,a);
            b<<=1;
            }
        return d;
        }
    
    //---------------------------------------------------------------------------
    

    如您所见,不再使用函数shlshr。我认为modpow可以进一步优化,但它不是一个关键功能,因为它只被调用很少次。最关键的功能是modmul,这似乎是最好的形状。

    其他问题:

    • 还有其他加速NTT的选项吗?
    • 我对模块化算术的优化是否安全? (结果似乎是相同的,但我可能会错过一些东西。)

    [edit2]新优化

    a = 0.99991970486 | 2000*32 bits
    looped 10x
    sqr1[  13.908 ms ] fast sqr
    sqr2[  13.649 ms ] NTT sqr
    mul1[  19.726 ms ] simpe mul
    mul2[  31.808 ms ] karatsuba mul
    mul3[  19.373 ms ] NTT mul
    

    我实施了所有评论中的所有可用内容(感谢您的见解)。

    加速比:

    • + 2.5%通过删除不必要的安全模式(Mandalf The Beige)
    • 使用预先计算的W,iW权力(Mysticial)+ 34.9%
    • + 35%总计

    实际完整源代码:

    //---------------------------------------------------------------------------
    //--- Number theoretic transforms: 2.03 -------------------------------------
    //---------------------------------------------------------------------------
    #ifndef _fourier_NTT_h
    #define _fourier_NTT_h
    //---------------------------------------------------------------------------
    //---------------------------------------------------------------------------
    class fourier_NTT        // Number theoretic transform
        {
    public:
        DWORD r,L,p,N;
        DWORD W,iW,rN;        // W=(r^L) mod p, iW=inverse W, rN = inverse N
        DWORD *WW,*iWW,NN;    // Precomputed (W,iW)^(0,..,NN-1) powers
    
        // Internals
        fourier_NTT(){ r=0; L=0; p=0; W=0; iW=0; rN=0; WW=NULL; iWW=NULL; NN=0; }
        ~fourier_NTT(){ _free(); }
        void _free();                                            // Free precomputed W,iW powers tables
        void _alloc(DWORD n);                                    // Allocate and precompute W,iW powers tables
    
        // Main interface
        void  NTT(DWORD *dst,DWORD *src,DWORD n=0);                // DWORD dst[n] = fast  NTT(DWORD src[n])
        void iNTT(DWORD *dst,DWORD *src,DWORD n=0);               // DWORD dst[n] = fast INTT(DWORD src[n])
    
        // Helper functions
        bool init(DWORD n);                                          // init r,L,p,W,iW,rN
        void  NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD w);    // DWORD dst[n] = fast  NTT(DWORD src[n])
        void  NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD *w2,DWORD i2);
    
        // Only for testing
        void  NTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w);    // DWORD dst[n] = slow  NTT(DWORD src[n])
        void iNTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w);    // DWORD dst[n] = slow INTT(DWORD src[n])
    
        // Modular arithmetics (optimized, but it works only for p >= 0x80000000!!!)
        DWORD mod(DWORD a);
        DWORD modadd(DWORD a,DWORD b);
        DWORD modsub(DWORD a,DWORD b);
        DWORD modmul(DWORD a,DWORD b);
        DWORD modpow(DWORD a,DWORD b);
        };
    //---------------------------------------------------------------------------
    
    //---------------------------------------------------------------------------
    void fourier_NTT::_free()
        {
        NN=0;
        if ( WW) delete[]  WW;  WW=NULL;
        if (iWW) delete[] iWW; iWW=NULL;
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT::_alloc(DWORD n)
        {
        if (n<=NN) return;
        DWORD *tmp,i,w;
        tmp=new DWORD[n]; if ((NN)&&( WW)) for (i=0;i<NN;i++) tmp[i]= WW[i]; if ( WW) delete[]  WW;  WW=tmp;  WW[0]=1; for (i=NN?NN:1,w= WW[i-1];i<n;i++){ w=modmul(w, W);  WW[i]=w; }
        tmp=new DWORD[n]; if ((NN)&&(iWW)) for (i=0;i<NN;i++) tmp[i]=iWW[i]; if (iWW) delete[] iWW; iWW=tmp; iWW[0]=1; for (i=NN?NN:1,w=iWW[i-1];i<n;i++){ w=modmul(w,iW); iWW[i]=w; }
        NN=n;
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT:: NTT(DWORD *dst,DWORD *src,DWORD n)
        {
        if (n>0) init(n);
        NTT_fast(dst,src,N,WW,1);
    //    NTT_fast(dst,src,N,W);
    //    NTT_slow(dst,src,N,W);
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT::iNTT(DWORD *dst,DWORD *src,DWORD n)
        {
        if (n>0) init(n);
        NTT_fast(dst,src,N,iWW,1);
    //    NTT_fast(dst,src,N,iW);
        for (DWORD i=0;i<N;i++) dst[i]=modmul(dst[i],rN);
    //    iNTT_slow(dst,src,N,W);
        }
    
    //---------------------------------------------------------------------------
    bool fourier_NTT::init(DWORD n)
        {
        // (max(src[])^2)*n < p else NTT overflow can ocur!!!
        r=2; p=0xC0000001; if ((n<2)||(n>0x10000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x30000000/n; // 32:30 bit best for unsigned 32 bit
    //    r=2; p=0x78000001; if ((n<2)||(n>0x04000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x3c000000/n; // 31:27 bit best for signed 32 bit
    //    r=2; p=0x00010001; if ((n<2)||(n>0x00000020)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x00000020/n; // 17:16 bit best for 16 bit
    //    r=2; p=0x0a000001; if ((n<2)||(n>0x01000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x01000000/n; // 28:25 bit
         N=n;                // Size of vectors [DWORDs]
         W=modpow(r,    L);  // Wn for NTT
        iW=modpow(r,p-1-L);  // Wn for INTT
        rN=modpow(n,p-2  );  // Scale for INTT
        _alloc(n>>1);        // Precompute W,iW powers
        return true;
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT:: NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD w)
        {
        if (n<=1) { if (n==1) dst[0]=src[0]; return; }
        DWORD i,j,a0,a1,n2=n>>1,w2=modmul(w,w);
    
        // Reorder even,odd
        for (i=0,j=0;i<n2;i++,j+=2) dst[i]=src[j];
        for (    j=1;i<n ;i++,j+=2) dst[i]=src[j];
    
        // Recursion
        NTT_fast(src   ,dst   ,n2,w2);    // Even
        NTT_fast(src+n2,dst+n2,n2,w2);    // Odd
    
        // Restore results
        for (w2=1,i=0,j=n2;i<n2;i++,j++,w2=modmul(w2,w))
            {
            a0=src[i];
            a1=modmul(src[j],w2);
            dst[i]=modadd(a0,a1);
            dst[j]=modsub(a0,a1);
            }
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT:: NTT_fast(DWORD *dst,DWORD *src,DWORD n,DWORD *w2,DWORD i2)
        {
        if (n<=1) { if (n==1) dst[0]=src[0]; return; }
        DWORD i,j,a0,a1,n2=n>>1;
    
        // Reorder even,odd
        for (i=0,j=0;i<n2;i++,j+=2) dst[i]=src[j];
        for (    j=1;i<n ;i++,j+=2) dst[i]=src[j];
    
        // Recursion
        i=i2<<1;
        NTT_fast(src   ,dst   ,n2,w2,i);    // Even
        NTT_fast(src+n2,dst+n2,n2,w2,i);    // Odd
    
        // Restore results
        for (i=0,j=n2;i<n2;i++,j++,w2+=i2)
            {
            a0=src[i];
            a1=modmul(src[j],*w2);
            dst[i]=modadd(a0,a1);
            dst[j]=modsub(a0,a1);
            }
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT:: NTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w)
        {
        DWORD i,j,wj,wi,a;
        for (wj=1,j=0;j<n;j++)
            {
            a=0;
            for (wi=1,i=0;i<n;i++)
                {
                a=modadd(a,modmul(wi,src[i]));
                wi=modmul(wi,wj);
                }
            dst[j]=a;
            wj=modmul(wj,w);
            }
        }
    
    //---------------------------------------------------------------------------
    void fourier_NTT::iNTT_slow(DWORD *dst,DWORD *src,DWORD n,DWORD w)
        {
        DWORD i,j,wi=1,wj=1,a;
        for (wj=1,j=0;j<n;j++)
            {
            a=0;
            for (wi=1,i=0;i<n;i++)
                {
                a=modadd(a,modmul(wi,src[i]));
                wi=modmul(wi,wj);
                }
            dst[j]=modmul(a,rN);
            wj=modmul(wj,iW);
            }
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::mod(DWORD a)
        {
        if (a>p) a-=p;
        return a;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modadd(DWORD a,DWORD b)
        {
        DWORD d,cy;
        //if (a>p) a-=p;
        //if (b>p) b-=p;
        d=a+b;
        cy=((a>>1)+(b>>1)+(((a&1)+(b&1))>>1))&0x80000000;
        if (cy ) d-=p;
        if (d>p) d-=p;
        return d;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modsub(DWORD a,DWORD b)
        {
        DWORD d;
        //if (a>p) a-=p;
        //if (b>p) b-=p;
        d=a-b;
        if (a<b) d+=p;
        if (d>p) d-=p;
        return d;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modmul(DWORD a,DWORD b)
        {
        DWORD _a,_b,_p;
        _a=a;
        _b=b;
        _p=p;
        asm    {
            mov    eax,_a
            mov    ebx,_b
            mul    ebx        // H(edx),L(eax) = eax * ebx
            mov    ebx,_p
            div    ebx        // eax = H(edx),L(eax) / ebx
            mov    _a,edx    // edx = H(edx),L(eax) % ebx
            }
        return _a;
        }
    
    //---------------------------------------------------------------------------
    DWORD fourier_NTT::modpow(DWORD a,DWORD b)
        {    // b is not mod(p)!
        int i;
        DWORD d=1;
        //if (a>p) a-=p;
        for (i=0;i<32;i++)
            {
            d=modmul(d,d);
            if (DWORD(b&0x80000000)) d=modmul(d,a);
            b<<=1;
            }
        return d;
        }
    //---------------------------------------------------------------------------
    //---------------------------------------------------------------------------
    #endif
    //---------------------------------------------------------------------------
    //---------------------------------------------------------------------------
    

    通过将NTT_fast分隔为两个函数,仍然可以减少堆垃圾的使用。一个使用WW[],另一个使用iWW[],这会导致递归调用中的一个参数减少。但是我没想到它(仅限32位指针)而是有一个功能可以在将来更好地进行代码管理。许多函数现在处于休眠状态(用于测试)与慢变体mod和较旧的快速函数(使用w参数而不是*w2,i2)类似。

    为了避免大数据集溢出,请将输入数限制为p/4位。其中p是每 NTT 元素的位数,因此这个32位版本使用最大(32 bit/4 -> 8 bit)个输入值。

    [edit3]用于测试的简单字符串bigint乘法

    //---------------------------------------------------------------------------
    char* mul_NTT(const char *sx,const char *sy)
        {
        char *s;
        int i,j,k,n;
        // n = min power of 2 <= 2 max length(x,y)
        for (i=0;sx[i];i++); for (n=1;n<i;n<<=1);        i--;
        for (j=0;sx[j];j++); for (n=1;n<j;n<<=1); n<<=1; j--;
        DWORD *x,*y,*xx,*yy,a;
        x=new DWORD[n]; xx=new DWORD[n];
        y=new DWORD[n]; yy=new DWORD[n];
    
        // Zero padding
        for (k=0;i>=0;i--,k++) x[k]=sx[i]-'0'; for (;k<n;k++) x[k]=0;
        for (k=0;j>=0;j--,k++) y[k]=sy[j]-'0'; for (;k<n;k++) y[k]=0;
    
        //NTT
        fourier_NTT ntt;
        ntt.NTT(xx,x,n);
        ntt.NTT(yy,y);
    
        // Convolution
        for (i=0;i<n;i++) xx[i]=ntt.modmul(xx[i],yy[i]);
    
        //INTT
        ntt.iNTT(yy,xx);
    
        //suma
        a=0; s=new char[n+1]; for (i=0;i<n;i++) { a+=yy[i]; s[n-i-1]=(a%10)+'0'; a/=10; } s[n]=0;
        delete[] x; delete[] xx;
        delete[] y; delete[] yy;
    
        return s;
        }
    //---------------------------------------------------------------------------
    

    我使用AnsiString,所以我将它移植到char*希望,我没有做错。看起来它正常工作(与AnsiString版本相比)。

    • sx,sy是十进制整数
    • 返回已分配的字符串(char*)=sx*sy

    每32位数据字只有~4位,因此没有溢出的风险,但当然速度较慢。在我的bignum lib中,我使用二进制表示,并为每个32位WORD使用8 bit块用于 NTT 。如果N很大,那么风险更大......

    玩得开心

1 个答案:

答案 0 :(得分:3)

首先,非常感谢您发布并免费使用。我真的很感激。

我能够使用一些技巧消除一些分支,重新排列主循环,并修改了程序集,并且能够获得1.35倍的加速。

另外,我添加了一个64位的预处理器条件,因为Visual Studio不允许在64位模式下进行内联汇编(谢谢微软;随意自行解决)。

当我优化modsub()函数时发生了一些奇怪的事情。我使用像我做的modadd(更快)的比特黑客重写了它。但由于某种原因,modsub的位版本较慢。不知道为什么。可能只是我的电脑。

//
// Mandalf The Beige
// Based on:
// Spektre
// http://stackoverflow.com/questions/18577076/modular-arithmetics-and-ntt-finite-field-dft-optimizations
//
// This code may be freely used however you choose, so long as it is accompanied by this notice.
//




#ifndef H__OPTIMIZED_NUMBER_THEORETIC_TRANSFORM__HDR
#define H__OPTIMIZED_NUMBER_THEORETIC_TRANSFORM__HDR

#include <string.h>

#ifndef uint32
#define uint32 unsigned long int
#endif

#ifndef uint64
#define uint64 unsigned long long int
#endif


class fast_ntt                                   // number theoretic transform
{
    public:
    fast_ntt()
    {
        r = 0; L = 0;
        W = 0; iW = 0; rN = 0;
    }
    // main interface
    void  NTT(uint32 *dst, uint32 *src, uint32 n = 0);             // uint32 dst[n] = fast  NTT(uint32 src[n])
    void INTT(uint32 *dst, uint32 *src, uint32 n = 0);             // uint32 dst[n] = fast INTT(uint32 src[n])
    // helper functions

    private:
    bool init(uint32 n);                                     // init r,L,p,W,iW,rN
    void NTT_calc(uint32 *dst, uint32 *src, uint32 n, uint32 w);  // uint32 dst[n] = fast  NTT(uint32 src[n])

    void  NTT_fast(uint32 *dst, uint32 *src, uint32 n, uint32 w);  // uint32 dst[n] = fast  NTT(uint32 src[n])
    void NTT_fast(uint32 *dst, const uint32 *src, uint32 n, uint32 w);
    // only for testing
    void  NTT_slow(uint32 *dst, uint32 *src, uint32 n, uint32 w);  // uint32 dst[n] = slow  NTT(uint32 src[n])
    void INTT_slow(uint32 *dst, uint32 *src, uint32 n, uint32 w);  // uint32 dst[n] = slow INTT(uint32 src[n])
    // uint32 arithmetics


    // modular arithmetics
    inline uint32 modadd(uint32 a, uint32 b);
    inline uint32 modsub(uint32 a, uint32 b);
    inline uint32 modmul(uint32 a, uint32 b);
    inline uint32 modpow(uint32 a, uint32 b);

    uint32 r, L, N;//, p;
    uint32 W, iW, rN;

    const uint32 p = 0xC0000001;
};

//---------------------------------------------------------------------------
void fast_ntt::NTT(uint32 *dst, uint32 *src, uint32 n)
{
    if (n > 0)
    {
        init(n);
    }
    NTT_fast(dst, src, N, W);
    //  NTT_slow(dst,src,N,W);
}

//---------------------------------------------------------------------------
void fast_ntt::INTT(uint32 *dst, uint32 *src, uint32 n)
{
    if (n > 0)
    {
        init(n);
    }
    NTT_fast(dst, src, N, iW);
    for (uint32 i = 0; i<N; i++)
    {
        dst[i] = modmul(dst[i], rN);
    }
    //  INTT_slow(dst,src,N,W);
}

//---------------------------------------------------------------------------
bool fast_ntt::init(uint32 n)
{
    // (max(src[])^2)*n < p else NTT overflow can ocur !!!
    r = 2;
    //p = 0xC0000001;
    if ((n < 2) || (n > 0x10000000))
    {
        r = 0; L = 0; W = 0; // p = 0;
        iW = 0; rN = 0; N = 0;
        return false;
    }
    L = 0x30000000 / n; // 32:30 bit best for unsigned 32 bit
    //  r=2; p=0x78000001; if ((n<2)||(n>0x04000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x3c000000/n; // 31:27 bit best for signed 32 bit
    //  r=2; p=0x00010001; if ((n<2)||(n>0x00000020)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x00000020/n; // 17:16 bit best for 16 bit
    //  r=2; p=0x0a000001; if ((n<2)||(n>0x01000000)) { r=0; L=0; p=0; W=0; iW=0; rN=0; N=0; return false; } L=0x01000000/n; // 28:25 bit
    N = n;               // size of vectors [uint32s]
    W = modpow(r, L); // Wn for NTT
    iW = modpow(r, p - 1 - L); // Wn for INTT
    rN = modpow(n, p - 2); // scale for INTT
    return true;
}

//---------------------------------------------------------------------------

void fast_ntt::NTT_fast(uint32 *dst, uint32 *src, uint32 n, uint32 w)
{
    if(n > 1)
    {
        if(dst != src)
        {
            NTT_calc(dst, src, n, w);
        }
        else
        {
            uint32* temp = new uint32[n];
            NTT_calc(temp, src, n, w);
            memcpy(dst, temp, n * sizeof(uint32));
            delete [] temp;
        }
    }
    else if(n == 1)
    {
        dst[0] = src[0];
    }
}

void fast_ntt::NTT_fast(uint32 *dst, const uint32 *src, uint32 n, uint32 w)
{
    if (n > 1)
    {
        uint32* temp = new uint32[n];
        memcpy(temp, src, n * sizeof(uint32));
        NTT_calc(dst, temp, n, w);
        delete[] temp;
    }
    else if (n == 1)
    {
        dst[0] = src[0];
    }
}



void fast_ntt::NTT_calc(uint32 *dst, uint32 *src, uint32 n, uint32 w)
{
    if(n > 1)
    {
        uint32 i, j, a0, a1,
        n2 = n >> 1,
        w2 = modmul(w, w);

        // reorder even,odd
        for (i = 0, j = 0; i < n2; i++, j += 2)
        {
            dst[i] = src[j];
        }
        for (j = 1; i < n; i++, j += 2)
        {
            dst[i] = src[j];
        }
        // recursion
        if(n2 > 1)
        {
            NTT_calc(src, dst, n2, w2);  // even
            NTT_calc(src + n2, dst + n2, n2, w2);  // odd
        }
        else if(n2 == 1)
        {
            src[0] = dst[0];
            src[1] = dst[1];
        }

        // restore results

        w2 = 1, i = 0, j = n2;
        a0 = src[i];
        a1 = src[j];
        dst[i] = modadd(a0, a1);
        dst[j] = modsub(a0, a1);
        while (++i < n2)
        {
            w2 = modmul(w2, w);
            j++;
            a0 = src[i];
            a1 = modmul(src[j], w2);
            dst[i] = modadd(a0, a1);
            dst[j] = modsub(a0, a1);
        }
    }
}

//---------------------------------------------------------------------------
void fast_ntt::NTT_slow(uint32 *dst, uint32 *src, uint32 n, uint32 w)
{
    uint32 i, j, wj, wi, a,
        n2 = n >> 1;
    for (wj = 1, j = 0; j < n; j++)
    {
        a = 0;
        for (wi = 1, i = 0; i < n; i++)
        {
            a = modadd(a, modmul(wi, src[i]));
            wi = modmul(wi, wj);
        }
        dst[j] = a;
        wj = modmul(wj, w);
    }
}

//---------------------------------------------------------------------------
void fast_ntt::INTT_slow(uint32 *dst, uint32 *src, uint32 n, uint32 w)
{
    uint32 i, j, wi = 1, wj = 1, a, n2 = n >> 1;

    for (wj = 1, j = 0; j < n; j++)
    {
        a = 0;
        for (wi = 1, i = 0; i < n; i++)
        {
            a = modadd(a, modmul(wi, src[i]));
            wi = modmul(wi, wj);
        }
        dst[j] = modmul(a, rN);
        wj = modmul(wj, iW);
    }
}    


//---------------------------------------------------------------------------
uint32 fast_ntt::modadd(uint32 a, uint32 b)
{
    uint32 d;
    d = a + b;

    if(d < a)
    {
        d -= p;
    }
    if (d >= p)
    {
        d -= p;
    }
    return d;
}

//---------------------------------------------------------------------------
uint32 fast_ntt::modsub(uint32 a, uint32 b)
{
    uint32 d;
    d = a - b;
    if (d > a)
    {
        d += p;
    }
    return d;
}

//---------------------------------------------------------------------------
uint32 fast_ntt::modmul(uint32 a, uint32 b)
{
    uint32 _a = a;
    uint32 _b = b;

    // Original
    uint32 _p = p;
    __asm
    {
        mov eax, _a;
        mul _b;
        div _p;
        mov eax, edx;
    };
}


uint32 fast_ntt::modpow(uint32 a, uint32 b)
{
    //*
    uint64 D, M, A, P;

    P = p; A = a;
    M = 0llu - (b & 1);
    D = (M & A) | ((~M) & 1);

    while ((b >>= 1) != 0)
    {
        A = modmul(A, A);
        //A = (A * A) % P;

        if ((b & 1) == 1)
        {
            //D = (D * A) % P;
            D = modmul(D, A);
        }
    }
    return (uint32)D;
}

新modmul

uint32 fast_ntt::modmul(uint32 a, uint32 b)
{
    uint32 _a = a;
    uint32 _b = b;   

    __asm
    {
    mov eax, a;
    mul b;
    mov ebx, eax;
    mov eax, 2863311530;
    mov ecx, edx;
    mul edx;
    shld edx, eax, 1;
    mov eax, 3221225473;

    mul edx;
    sub ebx, eax;
    mov eax, 3221225473;
    sbb ecx, edx;
    jc addback;

            neg ecx;
            and ecx, eax;
            sub ebx, ecx;

    sub ebx, eax;
    sbb edx, edx;
    and eax, edx;
            addback:
    add eax, ebx;          
    };  
}

<强> [编辑] Spektre,根据您的反馈,我改变了modadd&amp; modsub回到他们原来的。我也意识到我对我不应该的递归NTT函数做了一些修改。

<强> [EDIT2] 删除了不需要的if语句和按位函数。

<强> [EDIT3] 添加了新的modmul内联汇编。