转置 16 字数组的最快方法

时间:2016-07-28 20:05:51

标签: c++ arrays performance

我有以下代码:

/**
 * Permutes the first 16 words of `_state` according to a fixed table.
 *
 * Each source word `_state[i]` is written to the destination slot shown in
 * the assignments below (word 0 goes to slot 7, word 1 to slot 12, and so
 * on). The permuted words are assembled in a scratch array and then copied
 * back over `_state`.
 *
 * @param _state  Array of words to permute; indices 0..15 are read and all
 *                DATA_SIZE entries are overwritten. DATA_SIZE must be at
 *                least 16 for the scratch indices to be in range.
 */
void shuffle_words(WORD_TYPE* _state)
{
    WORD_TYPE temp[DATA_SIZE];

    temp[7]  = _state[0];
    temp[12] = _state[1];
    temp[14] = _state[2];
    temp[9]  = _state[3];
    temp[2]  = _state[4];
    temp[1]  = _state[5];
    temp[5]  = _state[6];
    temp[15] = _state[7];
    temp[11] = _state[8];
    temp[6]  = _state[9];
    temp[13] = _state[10];
    temp[0]  = _state[11];
    temp[4]  = _state[12];
    temp[8]  = _state[13];
    temp[10] = _state[14];
    temp[3]  = _state[15];

    // Copy the permuted words back with a typed, element-wise loop.
    // The earlier call passed three arguments to memcpy_s, whose standard
    // signature takes four (dest, destsz, src, count); it also depended on
    // WORD_SIZE being equal to sizeof(WORD_TYPE). A plain loop has neither
    // problem and compilers lower it to the same block copy.
    for (int i = 0; i < DATA_SIZE; ++i) {
        _state[i] = temp[i];
    }
}


/**
 * Runs one pass over `data`: a word shuffle followed by sixteen rounds.
 *
 * The rounds are applied to the index pairs (14,15), (13,14), ..., (0,1)
 * and finally the wrap-around pair (15,0). Each round receives the key
 * produced by the previous one, so the order of the calls matters.
 *
 * @param data  Word array that is shuffled and then updated in place by
 *              every round.
 * @param key   Initial key value fed into the first round.
 * @return      The key returned by the last round.
 *              NOTE(review): the return type is int while the key is a
 *              WORD_TYPE, so the value is narrowed on return — confirm this
 *              is intended.
 */
int prp(WORD_TYPE* data, WORD_TYPE key)
{
    shuffle_words(data);

    // Thread the running key through the rounds, highest pair first.
    WORD_TYPE chained = key;

    chained = round_function<14, 15>(data, chained);
    chained = round_function<13, 14>(data, chained);
    chained = round_function<12, 13>(data, chained);
    chained = round_function<11, 12>(data, chained);

    chained = round_function<10, 11>(data, chained);
    chained = round_function<9, 10>(data, chained);
    chained = round_function<8, 9>(data, chained);
    chained = round_function<7, 8>(data, chained);

    chained = round_function<6, 7>(data, chained);
    chained = round_function<5, 6>(data, chained);
    chained = round_function<4, 5>(data, chained);
    chained = round_function<3, 4>(data, chained);

    chained = round_function<2, 3>(data, chained);
    chained = round_function<1, 2>(data, chained);
    chained = round_function<0, 1>(data, chained);

    // Last round wraps around: last word paired with the first.
    chained = round_function<15, 0>(data, chained);

    return chained;
}

我想知道是否有更快的方法来执行shuffle_words操作。我已经看到有关矩阵转置的问题,但这些问题似乎集中在矩阵大或多维的情况下。

我的数组大小固定为 16 个字,而且 prp 函数会在同一个数组上连续应用多次。这让我觉得,只按转置后的顺序访问元素、而不实际移动它们,也许是一种可行的做法。

round_function 本来就会把数据写回数组,所以如果把 shuffle 挪到那里更合适,我也可以接受。下面是它的代码,以备参考:

/**
 * Mixes two words of `state` with `key` and writes both words back.
 *
 * The right word is combined with the key and the left index and rotated;
 * the left word is then updated from the new right word. The returned key
 * is the incoming key XORed with the old and new values of both words.
 *
 * @tparam left_index   Index of the left word; also added into the right
 *                      word and used to vary the second rotation amount.
 * @tparam right_index  Index of the right word.
 * @param state         Word array; entries left_index and right_index are
 *                      read and overwritten.
 * @param key           Running key from the previous round.
 * @return              The updated running key.
 */
template <int left_index, int right_index>
WORD_TYPE round_function(WORD_TYPE* state, WORD_TYPE key)
{
    const WORD_TYPE left_in = state[left_index];
    const WORD_TYPE right_in = state[right_index];

    // The right word is rotated after absorbing the key (already XORed with
    // the old right word) and the left index.
    const WORD_TYPE keyed = key ^ right_in;
    const WORD_TYPE right_out = rotate_left<ROTATION_AMOUNT>(right_in + keyed + left_index);

    // The left word absorbs the upper half of the new right word, then a
    // rotation of it whose amount depends on left_index.
    WORD_TYPE left_out = left_in;
    left_out += right_out >> (BIT_WIDTH / 2);
    left_out ^= rotate_left<(left_index % BIT_WIDTH) ^ ROTATION_AMOUNT>(right_out);

    state[left_index] = left_out;
    state[right_index] = right_out;

    // keyed already holds key ^ right_in; fold in the remaining three terms.
    return keyed ^ right_out ^ left_in ^ left_out;
}

我想为round_function提供目标索引,但这样做会覆盖尚未操作的字节,这会破坏目标索引处的数据。

执行单词转置步骤的最有效方法是什么? 是否有可能在没有临时存储和memcpy的情况下有效地执行shuffle_words?如果我保留原样,编译器会为我优化吗?

编辑:

对于 16 个全零字的示例输入,我得到以下输出:

5390936987981438580
7289498000187791405
11630888819098945478
4862561973623181657
11364775727483781365
1302861686580238483
10934483497681452460
376472396741801
17443576244438476890
17213444377027086447
15287741771379858051
16772715748200046576
6216997191100954620
16389751604649919423
2033403819063771136
14517213842436349075

我使用了这些#defines:

// Base rotation distance, used as the template argument to rotate_left in
// round_function (directly, and XORed with the left index).
#define ROTATION_AMOUNT 41
// Word width in bits; round_function shifts by half of this value.
#define BIT_WIDTH 64
// Number of words in the state array (size of the scratch buffer in the
// first shuffle_words).
#define DATA_SIZE 16
// Word type used for the state array and the key.
// NOTE(review): BIT_WIDTH 64 presumes unsigned long long is exactly 64 bits
// on the target platform — confirm.
typedef unsigned long long WORD_TYPE;

如果能提高效率,稍微修改一下函数我也可以接受。

1 个答案:

答案 0 :(得分:2)

Yes!

/**
 * Permutes the 16 words of `_state` in place using one spare word.
 *
 * The permutation is a single cycle through all 16 indices:
 * 0 <- 11 <- 8 <- 13 <- 10 <- 14 <- 2 <- 4 <- 12 <- 1 <- 5 <- 6 <- 9 <- 3
 * <- 15 <- 7 <- 0. Each slot therefore takes its value from the next slot
 * in the cycle, and only the first value overwritten has to be saved.
 *
 * @param _state  Array of words; indices 0..15 are read and overwritten.
 */
void shuffle_words(WORD_TYPE* _state) {

    WORD_TYPE* const w = _state;

    // Slot 0 is overwritten first; keep its value to close the cycle.
    const WORD_TYPE saved_first = w[0];

    w[0] = w[11];
    w[11] = w[8];
    w[8] = w[13];
    w[13] = w[10];
    w[10] = w[14];
    w[14] = w[2];
    w[2] = w[4];
    w[4] = w[12];
    w[12] = w[1];
    w[1] = w[5];
    w[5] = w[6];
    w[6] = w[9];
    w[9] = w[3];
    w[3] = w[15];
    w[15] = w[7];

    // Close the cycle with the value originally held in slot 0.
    w[7] = saved_first;
}