Question

这是我的问题：我想用 OpenMP 在 C 语言中并行化 AES-128 加密，但使用下面的 OpenMP 代码几乎没有获得任何加速。我的机器是四核 Intel i5。

这是代码。任何关于如何进一步并行化此代码的建议都会非常感激。请查看代码末尾的main函数。下面的AES代码包含一些实现其功能的函数。请建议如何最好地从中提取并行性。

非常感谢。

/*
******************************************************************
**       Advanced Encryption Standard implementation in C.      **
**       By Niyaz PK                                            **
**       E-mail: niyazpk@gmail.com                              **
**       Downloaded from Website: www.hoozi.com                 **
******************************************************************
This is the source code for encryption using the latest AES algorithm.
******************************************************************
*/

// Include stdio.h for standard input/output.
// Used for giving output to the screen.
#include<omp.h>
#include<stdio.h>
#include<time.h>
#include<stdlib.h>


// Number of 32-bit columns in the AES state. Fixed at 4.
#define Nb 4

// Number of cipher rounds. Starts at zero; encrypt() assigns the real value
// before calling KeyExpansion() and Cipher().
int Nr=0;

// Number of 32-bit words in the key. Starts at zero; encrypt() assigns the
// real value before calling KeyExpansion() and Cipher().
int Nk=0;

// in    - the plaintext block to be encrypted (filled by encrypt()).
// out   - the ciphertext block produced by Cipher().
// state - the working 4x4 byte matrix that the round functions modify in place.
// NOTE(review): these are file-scope and are rewritten on every encrypt()
// call, so concurrent callers would overwrite each other's data.
unsigned char in[16], out[16], state[4][4];

// The expanded round keys written by KeyExpansion(), which fills
// Nb*(Nr+1) four-byte words of this buffer.
unsigned char RoundKey[240];

// The cipher key; encrypt() copies Nk*4 bytes into it.
unsigned char Key[32];



// Look up the AES S-box substitution for one byte value.
//
// num - table index; must be in the range 0..255 (callers pass a byte).
// Returns the substituted byte value as an int.
//
// The table is `static const` so that it is laid out once at build time
// instead of being re-initialised on the stack on every call; this function
// is called for every state byte in every round, so that cost adds up.
// Being read-only, the table can also be read by several threads at once.
int getSBoxValue(int num)
{
    static const unsigned char sbox[256] =   {
    //0     1    2      3     4    5     6     7      8    9     A      B    C     D     E     F
    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, //0
    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, //1
    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, //2
    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, //3
    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, //4
    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, //5
    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, //6
    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, //7
    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, //8
    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, //9
    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, //A
    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, //B
    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, //C
    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, //D
    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, //E
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; //F
    return sbox[num];
}

// Round-constant table used by KeyExpansion(), which XORs Rcon[i/Nk] into the
// first byte of every Nk-th key word.
// For i >= 1, Rcon[i] is x to the power (i-1) in the field GF(2^8), where x is
// the element {02}: 0x01, 0x02, 0x04, ...
// Index 0 is not read by KeyExpansion(): its word counter starts at Nk, so
// i/Nk is at least 1.
int Rcon[255] = {
    0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 
    0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 
    0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 
    0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 
    0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 
    0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 
    0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 
    0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 
    0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 
    0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 
    0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 
    0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 
    0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 
    0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 
    0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 
    0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb  };

// This function produces Nb(Nr+1) round keys. The round keys are used in each round to encrypt the states. 
void KeyExpansion()
{
    int i,j;
    unsigned char temp[4],k;

    // The first round key is the key itself.
    for(i=0;i<Nk;i++)
    {
        RoundKey[i*4]=Key[i*4];
        RoundKey[i*4+1]=Key[i*4+1];
        RoundKey[i*4+2]=Key[i*4+2];
        RoundKey[i*4+3]=Key[i*4+3];
    }

    // All other round keys are found from the previous round keys.
    while (i < (Nb * (Nr+1)))
    {
        for(j=0;j<4;j++)
        {
            temp[j]=RoundKey[(i-1) * 4 + j];
        }
        if (i % Nk == 0)
        {
            // This function rotates the 4 bytes in a word to the left once.
            // [a0,a1,a2,a3] becomes [a1,a2,a3,a0]

            // Function RotWord()
            {
                k = temp[0];
                temp[0] = temp[1];
                temp[1] = temp[2];
                temp[2] = temp[3];
                temp[3] = k;
            }

            // SubWord() is a function that takes a four-byte input word and 
            // applies the S-box to each of the four bytes to produce an output word.

            // Function Subword()
            {
                temp[0]=getSBoxValue(temp[0]);
                temp[1]=getSBoxValue(temp[1]);
                temp[2]=getSBoxValue(temp[2]);
                temp[3]=getSBoxValue(temp[3]);
            }

            temp[0] =  temp[0] ^ Rcon[i/Nk];
        }
        else if (Nk > 6 && i % Nk == 4)
        {
            // Function Subword()
            {
                temp[0]=getSBoxValue(temp[0]);
                temp[1]=getSBoxValue(temp[1]);
                temp[2]=getSBoxValue(temp[2]);
                temp[3]=getSBoxValue(temp[3]);
            }
        }
        RoundKey[i*4+0] = RoundKey[(i-Nk)*4+0] ^ temp[0];
        RoundKey[i*4+1] = RoundKey[(i-Nk)*4+1] ^ temp[1];
        RoundKey[i*4+2] = RoundKey[(i-Nk)*4+2] ^ temp[2];
        RoundKey[i*4+3] = RoundKey[(i-Nk)*4+3] ^ temp[3];
        i++;
    }
}

// XOR the round key of the given round into the state, in place.
// Byte (col*Nb + row) of that round's key is combined with state[row][col].
void AddRoundKey(int round) 
{
    const unsigned char *rk = &RoundKey[round * Nb * 4];
    int col, row;

    for (col = 0; col < 4; col++)
    {
        for (row = 0; row < 4; row++)
        {
            state[row][col] ^= rk[col * Nb + row];
        }
    }
}

// Replace every byte of the state with its S-box substitution, in place.
void SubBytes()
{
    int row, col;

    for (row = 0; row < 4; row++)
    {
        unsigned char *cells = state[row];

        for (col = 0; col < 4; col++)
        {
            cells[col] = getSBoxValue(cells[col]);
        }
    }
}

// Rotate each row of the state to the left by its own row index:
// row 0 is left alone, row 1 moves one column, row 2 two, row 3 three.
void ShiftRows()
{
    unsigned char saved[4];
    int row, col;

    for (row = 1; row < 4; row++)
    {
        // Snapshot the row so the rotation can read the old values.
        for (col = 0; col < 4; col++)
        {
            saved[col] = state[row][col];
        }

        // A left rotation by `row`: column col takes the old column col+row.
        for (col = 0; col < 4; col++)
        {
            state[row][col] = saved[(col + row) % 4];
        }
    }
}

// xtime(x) multiplies the byte x by {02} in GF(2^8): shift left by one bit
// and, when bit 7 of x is set, XOR in the reduction constant 0x1b.
// Every use of the argument is parenthesised so that expression arguments
// such as xtime(a ^ b) expand with the intended precedence.
// The argument is evaluated twice, so it must not have side effects.
// The result is an int that may carry a ninth bit; store it in an
// unsigned char (as MixColumns does) to keep only the low 8 bits.
#define xtime(x)   (((x)<<1) ^ ((((x)>>7) & 1) * 0x1b))

// Mix each column of the state in place.
// For a column (a0,a1,a2,a3), every output byte is the input byte XORed with
// the XOR of the whole column and with xtime() of the input byte XORed with
// its cyclic successor in the column.
void MixColumns()
{
    int col;
    unsigned char a0, a1, a2, a3, all, d;

    for (col = 0; col < 4; col++)
    {
        // Read the whole column first so every output uses the old values.
        a0 = state[0][col];
        a1 = state[1][col];
        a2 = state[2][col];
        a3 = state[3][col];
        all = a0 ^ a1 ^ a2 ^ a3;

        // d is an unsigned char, so the xtime() result is cut to 8 bits.
        d = a0 ^ a1;
        d = xtime(d);
        state[0][col] = a0 ^ d ^ all;

        d = a1 ^ a2;
        d = xtime(d);
        state[1][col] = a1 ^ d ^ all;

        d = a2 ^ a3;
        d = xtime(d);
        state[2][col] = a2 ^ d ^ all;

        d = a3 ^ a0;
        d = xtime(d);
        state[3][col] = a3 ^ d ^ all;
    }
}

// Cipher is the main function that encrypts the PlainText.
void Cipher()
{
    int i,j,round=0;

    //Copy the input PlainText to state array.
    for(i=0;i<4;i++)
    {
        for(j=0;j<4;j++)
        {
            state[j][i] = in[i*4 + j];
        }
    }

    // Add the First round key to the state before starting the rounds.
    AddRoundKey(0); 

    // There will be Nr rounds.
    // The first Nr-1 rounds are identical.
    // These Nr-1 rounds are executed in the loop below.
    for(round=1;round<Nr;round++)
    {
        SubBytes();
        ShiftRows();
        MixColumns();
        AddRoundKey(round);
    }

    // The last round is given below.
    // The MixColumns function is not here in the last round.
    SubBytes();
    ShiftRows();
    AddRoundKey(Nr);

    // The encryption process is over.
    // Copy the state array to output array.
    for(i=0;i<4;i++)
    {
        for(j=0;j<4;j++)
        {
            out[i*4+j]=state[j][i];
        }
    }
}

void encrypt(int *K,int *PT,int *CT)
{
    int i;

    //    int ct;

    // Calculate Nk and Nr from the received value.
    Nr = 128;
    Nk = Nr / 32;
    Nr = Nk + 6;


     // Copy the Key and PlainText
    for(i=0;i<Nk*4;i++)
    {
        Key[i]=K[i];
        in[i]=PT[i];
    }

   /* 
   printf("\nKey for encryption:\n");
    for(i=0; i < Nk*4; i++)
      printf("%02x",Key[i]);
    printf("\n");
*/
/*
    printf("\nText before encryption:\n");
    for(i=0; i < Nk*4; i++)
      printf("%02x",in[i]);
    printf("\n");
*/    
    // The KeyExpansion routine must be called before encryption.
    KeyExpansion();

    // The next function call encrypts the PlainText with the Key using AES algorithm.
    Cipher();


    // Output the encrypted text.
    //io_printf("\nText after encryption:\n");
     for(i=0; i < Nk*4; i++)
    {
        CT[i] = out[i];
        printf("%02x",out[i]);
      }
    printf("\n");

    //  ct = out[15];
    // return ct;

}

// Benchmark driver: runs 650000 loop iterations; each one draws four rand()
// values, unpacks them into a 16-byte key, reuses the same bytes as the
// plaintext, and calls encrypt() on them.
// NOTE(review): rnd, key, pt, ct and j are declared outside the parallel loop
// and appear in no private clause, so the threads all work on the same copies.
// encrypt() also writes file-scope globals, so the iterations are not
// independent of each other.
int main()
{


  srand(time(NULL));
  unsigned int rnd[4];

  int key[16];
  int pt[16];
  int ct[16];

  unsigned int i,j;

  // Four threads; with schedule(dynamic) and no chunk size given, iterations
  // are handed out to the threads on request.
  #pragma omp parallel for num_threads(4) schedule(dynamic)
  for(i=0; i<65000*10; i++)
  {
   rnd[0]=rand();
   rnd[1]=rand();
   rnd[2]=rand();
   rnd[3]=rand();

   // Split each random word into four bytes, lowest byte first; the
   // plaintext is set to the same bytes as the key.
   for(j=0; j < 4; j++)
   {
    key[4*j]   = (rnd[j] & 0xff);
    pt[4*j]    = key[4*j];
    key[4*j+1] = ((rnd[j] >> 8)  & 0xff) ; 
    pt[4*j+1]  = key[4*j+1];
    key[4*j+2] = ((rnd[j] >> 16) & 0xff) ;
    pt[4*j+2]  = key[4*j+2];
    key[4*j+3] = ((rnd[j] >> 24) & 0xff) ;
    pt[4*j+3]  = key[4*j+3];
   }

   // Each encrypt() call is additionally wrapped in its own OpenMP task.
   #pragma omp task      
   encrypt(key,pt,ct);

  }

  return 0;

}

我已按照Hristo的建议修改了代码，感谢你的帮助。修改后的代码如下。我不明白如何让encrypt()函数只使用局部变量，你能解释一下吗？请把代码加到相应的位置。再次感谢你的帮助。其次，如果去掉printf语句，该如何检查输出是否正确？也就是说，有没有其他显示或保存输出的办法？最后，下面的代码仍然比串行执行（即不使用OpenMP）慢。为了公平比较，串行版本中同样没有printf。

void encrypt(int *K,int *PT,int *CT)
{
    int i;

    //    int ct;

    // Calculate Nk and Nr from the received value.
    Nr = 128;
    Nk = Nr / 32;
    Nr = Nk + 6;


     // Copy the Key and PlainText
    for(i=0;i<Nk*4;i++)
    {
        Key[i]=K[i];
        in[i]=PT[i];
    }

   /* 
   printf("\nKey for encryption:\n");
    for(i=0; i < Nk*4; i++)
      printf("%02x",Key[i]);
    printf("\n");
*/
/*
    printf("\nText before encryption:\n");
    for(i=0; i < Nk*4; i++)
      printf("%02x",in[i]);
    printf("\n");
*/    
    // The KeyExpansion routine must be called before encryption.
    KeyExpansion();

    // The next function call encrypts the PlainText with the Key using AES algorithm.
    Cipher();


    // Output the encrypted text.
    //io_printf("\nText after encryption:\n");
     for(i=0; i < Nk*4; i++)
    {
        CT[i] = out[i];
//        printf("%02x",out[i]);
      }
//    printf("\n");

    //  ct = out[15];
    // return ct;

}

// Benchmark driver, revised version: runs 65000 loop iterations; each one
// draws four rand() values, unpacks them into a 16-byte key, reuses the same
// bytes as the plaintext, and calls encrypt() on them.
// NOTE(review): key, pt and ct are now listed as private, but rnd and j are
// still declared outside the loop and appear in no private clause, and
// encrypt() still writes file-scope globals.
int main()
{


  srand(time(NULL));
  unsigned int rnd[4];

//  printf("rand_key = %2x%2x%2x%2x\n",rnd[0],rnd[1],rnd[2],rnd[3]);

  int key[16];
  int pt[16];
  int ct[16];

  unsigned int i,j;
  // Two threads, static schedule, each thread with its own key/pt/ct arrays.
  #pragma omp parallel for private(key,pt,ct) num_threads(2) schedule(static)
  for(i=0; i<65000; i++)
  {
   rnd[0]=rand();
   rnd[1]=rand();
   rnd[2]=rand();
   rnd[3]=rand();

   // Split each random word into four bytes, lowest byte first; the
   // plaintext is set to the same bytes as the key.
   for(j=0; j < 4; j++)
   {
    key[4*j]   = (rnd[j] & 0xff);
    pt[4*j]    = key[4*j];
    key[4*j+1] = ((rnd[j] >> 8)  & 0xff) ; 
    pt[4*j+1]  = key[4*j+1];
    key[4*j+2] = ((rnd[j] >> 16) & 0xff) ;
    pt[4*j+2]  = key[4*j+2];
    key[4*j+3] = ((rnd[j] >> 24) & 0xff) ;
    pt[4*j+3]  = key[4*j+3];
   }

   encrypt(key,pt,ct);


  }

  return 0;

}

Answer 1

您既不需要schedule(dynamic)，也不需要task构造。据我对AES内部机制的了解，这是一个完全规则的问题——每次加密所需的周期数完全相同，因此无论密钥是什么，耗费的挂钟时间都一样。这就完全排除了使用动态调度和任务的必要性。即使问题本身负载不均衡，简单地加上schedule(dynamic)也是一个非常糟糕的主意：dynamic的默认块大小为1，也就是说每个线程执行完一次迭代后，就要向OpenMP运行时请求下一次迭代。在您的情况下，这个开销要重复650000次。动态调度在适用的场合非常强大，但必须仔细选择块大小，而这通常需要反复试验才能找到最佳值。

除此之外，您还生成了650000个任务。每个任务的创建以及随后由工作线程取走执行都有一定的开销。鉴于AES在Pentium Pro上每字节大约需要18个周期（参考：Wikipedia），如果不是因为内部的printf()语句，每次调用encrypt()所花的时间可能与OpenMP运行时执行一个任务的开销大致相当。printf()输出到终端或文件流（如果被重定向），而通过同一个描述符执行I/O实际上是串行操作，也就是说它会使各线程串行化。请参阅这个回答（this answer），了解printf()对并行性能的影响有多大。

但代码中最严重的问题其实是大量的数据竞争。encrypt()依赖并修改了若干全局变量的值。这不仅会因为真共享（多个线程读写同一缓存行）导致速度下降，而且很可能产生完全错误的密文。这些全局变量应该全部改为encrypt()的局部变量；如果必须保持全局，则应声明为threadprivate。此外，并行循环还使用了几个共享变量，即key、pt和ct，它们应该设为private。

总结：让encrypt()只使用局部变量；将key、pt和ct设为private；把循环调度改为static；删除task构造；删除所有在每次迭代中输出信息的printf语句。

补充：rand()也将其状态保存在全局变量中。

全局变量太多了，只需将它们设为线程私有即可。在最后一个全局变量的定义之后添加以下OpenMP pragma：

...
// The Key input to the AES Program
unsigned char Key[32];

#pragma omp threadprivate(Nr,Nk,in,out,state,RoundKey,Key)

...

同时按如下方式修改main()函数：

unsigned int i;
#pragma omp parallel for num_threads(2) schedule(static)
for(i = 0; i < 65000; i++)
{
  unsigned int rnd[4];
  int key[16];
  int pt[16];
  int ct[16];
  unsigned int j;
  // Per-thread PRNG initialisation
  // It could be done better - this is for illustration purposes only
  unsigned int rand_state = time(NULL) + 1337*omp_get_thread_num();

  rnd[0] = rand_r(&rand_state);
  rnd[1] = rand_r(&rand_state);
  rnd[2] = rand_r(&rand_state);
  rnd[3] = rand_r(&rand_state);

  for(j = 0; j < 4; j++)
  {
    key[4*j]   = (rnd[j] & 0xff);
    pt[4*j]    = key[4*j];
    key[4*j+1] = ((rnd[j] >> 8)  & 0xff) ; 
    pt[4*j+1]  = key[4*j+1];
    key[4*j+2] = ((rnd[j] >> 16) & 0xff) ;
    pt[4*j+2]  = key[4*j+2];
    key[4*j+3] = ((rnd[j] >> 24) & 0xff) ;
    pt[4*j+3]  = key[4*j+3];
  }

  encrypt(key, pt, ct);
}

注意——key、pt、j等变量是在使用它们的作用域内定义的。这样就不必把它们全部写进private子句，因为在并行区域内部声明的变量本来就是私有的。此外，每个线程现在都有自己的PRNG状态。

使用Openmp并行模拟C语言中的AES

1 个答案: