Question

我在考虑如何高效地对两个字节数组进行异或（XOR）运算。这两个字节数组被定义为 unsigned char *，我认为以 uint64_t 为单位对它们进行异或会更快。真的是这样吗？如何高效地将 unsigned char * 转换为 uint64_t *，最好是在异或循环内完成？如果数组长度 % 8 不为 0，最后不足 8 个字节的部分该如何填充？

这是我当前的代码，它对两个字节数组进行异或，但是逐字节（unsigned char）处理的：

/*
 * Byte-wise XOR of two byte arrays.
 *
 * A_Bytes_Array, B_Bytes_Array: inputs, each at least `length` bytes long.
 * length: number of bytes to XOR.
 *
 * Returns a newly malloc'ed buffer of `length` bytes where element i holds
 * A_Bytes_Array[i] ^ B_Bytes_Array[i]. The caller owns the buffer and must
 * free() it. Returns NULL if the allocation fails.
 */
unsigned char *bitwise_xor(const unsigned char *A_Bytes_Array,
                           const unsigned char *B_Bytes_Array,
                           const size_t length)
{
    /* Request at least one byte so that NULL always means "out of memory". */
    unsigned char *XOR_Bytes_Array = malloc(length ? length : 1);
    if (XOR_Bytes_Array == NULL) {
        return NULL;
    }

    /* size_t index: an int counter mixes signedness with `length` and
     * overflows once length exceeds INT_MAX. */
    for (size_t i = 0; i < length; i++) {
        XOR_Bytes_Array[i] = (unsigned char)(A_Bytes_Array[i] ^ B_Bytes_Array[i]);
    }

    return XOR_Bytes_Array;
}

好的，与此同时我尝试了下面的做法。我的字节数组相当大（RGBA 位图，4 * 1440 * 900 字节？）。

/*
 * Assemble the 8 bytes bytesArray[i] .. bytesArray[i+7] into a uint64_t,
 * with bytesArray[i] as the most significant byte (big-endian order).
 * The caller must guarantee that all 8 bytes are readable.
 */
static uint64_t next64bitsFromBytesArray(const unsigned char *bytesArray, const int i)
{
    uint64_t next64bits = 0;
    for (int k = 0; k < 8; k++) {
        next64bits = (next64bits << 8) | (uint64_t)bytesArray[i + k];
    }
    return next64bits;
}

/*
 * Write `value` to bytesArray[0..7], most significant byte first.
 * This is the exact inverse of next64bitsFromBytesArray, so a load/store
 * round trip preserves byte order on every host endianness.
 */
static void store64bitsToBytesArray(unsigned char *bytesArray, uint64_t value)
{
    for (int k = 7; k >= 0; k--) {
        bytesArray[k] = (unsigned char)(value & 0xFFu);
        value >>= 8;
    }
}

/*
 * XOR two byte arrays, processing 8 bytes at a time as uint64_t.
 *
 * A_Bytes_Array, B_Bytes_Array: inputs, each at least `length` bytes long.
 * length: number of bytes to XOR; need not be a multiple of 8.
 *
 * Returns a newly malloc'ed buffer of `length` bytes where element i holds
 * A_Bytes_Array[i] ^ B_Bytes_Array[i]. The caller must free() it.
 * Returns NULL if the allocation fails.
 */
unsigned char *bitwise_xor64(const unsigned char *A_Bytes_Array,
                             const unsigned char *B_Bytes_Array,
                             const size_t length)
{
    /* Request at least one byte so that NULL always means "out of memory". */
    unsigned char *XOR_Bytes_Array = malloc(length ? length : 1);
    if (XOR_Bytes_Array == NULL) {
        return NULL;
    }

    /* Only the prefix made of complete 8-byte groups goes through the
     * 64-bit path; touching a partial group would read and write past
     * the end of all three buffers. */
    const size_t whole = length - (length % 8);
    size_t i = 0;

    for (; i < whole; i += 8) {
        /* Pass an advanced pointer with offset 0 so the helper's int
         * index parameter never has to hold a large size_t value. */
        const uint64_t A_Bytes = next64bitsFromBytesArray(A_Bytes_Array + i, 0);
        const uint64_t B_Bytes = next64bitsFromBytesArray(B_Bytes_Array + i, 0);

        /* Store with the same byte order used for loading; a raw memcpy
         * of the word would reverse each group on little-endian hosts. */
        store64bitsToBytesArray(XOR_Bytes_Array + i, A_Bytes ^ B_Bytes);
    }

    /* Remaining 0..7 tail bytes are handled one at a time. */
    for (; i < length; i++) {
        XOR_Bytes_Array[i] = (unsigned char)(A_Bytes_Array[i] ^ B_Bytes_Array[i]);
    }

    return XOR_Bytes_Array;
}

更新：（解决此问题的第二种方法）

/*
 * XOR two byte arrays, processing sizeof(uint64_t) bytes at a time.
 *
 * A_Bytes_Array, B_Bytes_Array: inputs, each at least `length` bytes long.
 *     No particular alignment is required.
 * length: number of bytes to XOR; need not be a multiple of 8.
 *
 * Returns a newly malloc'ed buffer of `length` bytes where element i holds
 * A_Bytes_Array[i] ^ B_Bytes_Array[i]. The caller must free() it.
 * Returns NULL if the allocation fails.
 */
unsigned char *bitwise_xor64(const unsigned char *A_Bytes_Array,
                             const unsigned char *B_Bytes_Array,
                             const size_t length)
{
    /* Request at least one byte so that NULL always means "out of memory". */
    unsigned char *xorBytes = malloc(length ? length : 1);
    if (xorBytes == NULL) {
        return NULL;
    }

    /* Number of bytes covered by complete 64-bit words. */
    const size_t wordBytes = length - (length % sizeof(uint64_t));
    size_t i = 0;

    for (; i < wordBytes; i += sizeof(uint64_t)) {
        uint64_t a;
        uint64_t b;

        /* memcpy instead of casting to uint64_t *: the inputs may be
         * misaligned, and accessing unsigned char storage through a
         * uint64_t lvalue violates the aliasing rules. */
        memcpy(&a, A_Bytes_Array + i, sizeof a);
        memcpy(&b, B_Bytes_Array + i, sizeof b);

        const uint64_t aXORbBytes = a ^ b;
        memcpy(xorBytes + i, &aXORbBytes, sizeof aXORbBytes);
    }

    /* Remaining 0..7 tail bytes: a full-word access here would overrun
     * all three buffers. */
    for (; i < length; i++) {
        xorBytes[i] = (unsigned char)(A_Bytes_Array[i] ^ B_Bytes_Array[i]);
    }

    return xorBytes;
}

Answer 1

所以我做了一个实验：

#include <stdlib.h>
#include <stdint.h>

#ifndef TYPE
#define TYPE uint64_t
#endif

/*
 * XOR two buffers of l bytes each, TYPE-sized words at a time.
 *
 * va, vb: input buffers of at least l bytes. They are read through
 *         `const TYPE *`, so the caller must pass storage that is validly
 *         aligned and accessible as TYPE.
 * l:      length in bytes; need not be a multiple of sizeof(TYPE).
 *
 * Returns a newly malloc'ed buffer of l bytes holding the XOR of the two
 * inputs, or NULL if the allocation fails. The returned pointer is the
 * start of the buffer, so it can be indexed from 0 and passed to free().
 */
TYPE *
xor(const void *va, const void *vb, size_t l)
{
        const TYPE *a = va;
        const TYPE *b = vb;
        const unsigned char *ta = va;
        const unsigned char *tb = vb;
        const size_t words = l / sizeof(TYPE);
        /* Request at least one byte so NULL always means "out of memory". */
        TYPE *r = malloc(l ? l : 1);
        unsigned char *tr;
        size_t i;

        if (r == NULL)
                return NULL;

        /* Index instead of advancing r: the result pointer must still
         * refer to the start of the allocation when it is returned. */
        for (i = 0; i < words; i++)
                r[i] = a[i] ^ b[i];

        /* Bytes that do not fill a whole TYPE word are XORed one by one,
         * so no part of the result is left uninitialised. */
        tr = (unsigned char *)r;
        for (i = words * sizeof(TYPE); i < l; i++)
                tr[i] = (unsigned char)(ta[i] ^ tb[i]);

        return r;
}

我用 clang 在基本优化选项下分别编译了 uint64_t 和 uint8_t 两个版本。在这两种情况下，编译器都对循环做了非常激进的向量化。不同之处在于，uint8_t 版本带有处理 l 不是 8 的倍数这一情况的代码。因此，如果我们为 64 位版本添加代码来处理大小不是 8 的倍数的情况，最终生成的代码很可能是等效的。此外，64 位版本把循环展开了几次，并带有处理该展开的代码，所以对于足够大的数组，你在这里也许能获得几个百分点的提升。另一方面，对于足够大的数组，性能会受限于内存带宽，异或操作本身就无关紧要了。

你确定你的编译器不会自动处理这个问题吗？这属于微优化，只有在你实际进行测量时才有意义；而一旦测量过，你就不需要再问哪一个更快了——你自己就知道答案。

C中的按位异或使用64位而不是8位

1 个答案: