我有一个函数可以将二进制数据从一个区域复制到另一个区域,但前提是这些字节与特定值不同。这是一个代码示例:
/*
 * Copy bytes from src to dest position for position, skipping every byte
 * that equals `ignore`. Destination bytes at skipped positions keep their
 * previous value.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; at most `size` bytes are written
 * size   - number of positions to examine
 * ignore - byte value that must not be copied
 */
void copy_if(char* src, char* dest, size_t size, char ignore)
{
    const char *in = src;
    const char *const in_end = src + size;
    char *out = dest;

    while (in != in_end)
    {
        const char byte = *in;
        if (byte != ignore)
        {
            *out = byte;
        }
        ++in;
        ++out;
    }
}
问题是这对我目前的需求来说太慢了。有没有办法以更快的方式获得相同的结果?
更新 基于答案,我尝试了两个新的实现:
/*
 * Branch-free form of the conditional copy: every iteration loads both the
 * source and the destination byte and always stores one of them back.
 * A position whose source byte equals `ignore` gets its own old value
 * rewritten; every other position receives the source byte.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; `size` bytes are read and rewritten
 * size   - number of positions to process
 * ignore - byte value that must not be copied (compared as `char`)
 */
void copy_if_vectorized(const uint8_t* src, uint8_t* dest, size_t size, char ignore)
{
    for (size_t pos = 0; pos != size; pos++)
    {
        const char from_src = src[pos];
        const char from_dest = dest[pos];

        /* The store is unconditional; only the stored value is selected. */
        dest[pos] = (from_src != ignore) ? from_src : from_dest;
    }
}
/*
 * SSE2 conditional copy built on the byte-masked store
 * _mm_maskmoveu_si128: 16 source bytes are compared against `ignore` and
 * only the lanes that differ are written to dest. Bytes left over after the
 * last full 16-byte chunk are handled by a scalar loop.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; only positions whose source byte differs from
 *          `ignore` are written
 * size   - number of positions to process
 * ignore - byte value that must not be copied
 */
void copy_if_SSE(const uint8_t* src, uint8_t* dest, size_t size, uint8_t ignore)
{
    const __m128i ignore_x16 = _mm_set1_epi8(ignore);
    const __m128i all_ones = _mm_set1_epi8(-1);
    size_t pos = 0;

    /* Vector part: one full 16-byte chunk per iteration. */
    while (pos + 16 <= size)
    {
        const __m128i chunk = _mm_loadu_si128((const __m128i *)(src + pos));
        const __m128i is_ignored = _mm_cmpeq_epi8(chunk, ignore_x16);
        /* Invert the compare result so that set lanes mean "store this byte". */
        const __m128i keep = _mm_xor_si128(is_ignored, all_ones);

        _mm_maskmoveu_si128(chunk, keep, (char *)(dest + pos));
        pos += 16;
    }

    /* Scalar tail: fewer than 16 bytes remain. */
    while (pos < size)
    {
        if (src[pos] != ignore)
        {
            dest[pos] = src[pos];
        }
        ++pos;
    }
}
我得到了以下结果:
Naive:
Duration: 2.04844s
Vectorized:
Pass: PASS
Duration: 3.18553s
SIMD:
Pass: PASS
Duration: 0.481888s
我猜是我的编译器(最新版的MSVC)没能对这段代码进行矢量化,不过SIMD方案已经足够好了,谢谢!
更新(之二) 我设法通过一些pragma指令让我的编译器(MSVC)对循环进行了矢量化,而且它实际上比SIMD版本还要快,这是最终的代码:
/*
 * Conditional copy written so that the compiler can vectorize it: each
 * iteration loads both the source and the destination byte and always
 * stores one of them back (the old destination byte when the source byte
 * equals `ignore`, the source byte otherwise).
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; `size` bytes are read and rewritten
 * size   - number of positions to process
 * ignore - byte value that must not be copied (compared as `char`)
 *
 * NOTE(review): the loop index is an `int` compared against a `size_t`, so
 * sizes above INT_MAX are not handled; the inline comment below accepts this
 * limitation on purpose.
 */
void copy_if_vectorized(const uint8_t* src, uint8_t* dest, size_t size, char ignore)
{
// MSVC loop hints: request auto-parallelization and tell the compiler to
// ignore assumed loop-carried dependencies between iterations.
#pragma loop(hint_parallel(0))
#pragma loop(ivdep)
for (int i = 0; i < size; ++i) // Sadly no parallelization if i is unsigned, but more than 2 GB of data is very unlikely
{
char temps = src[i];
char tempd = dest[i];
// Unconditional store: only the value written depends on the comparison.
dest[i] = temps == ignore ? tempd : temps;
}
}
答案 0 :(得分:6)
我的gcc 4.8.4矢量化以下代码:
#include <stddef.h>
/*
 * Conditional copy expressed as a per-byte select: both the source and the
 * destination byte are always loaded and one of them is always stored back.
 * Positions whose source byte equals `ignore` end up unchanged; all others
 * receive the source byte.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; `size` bytes are read and rewritten
 * size   - number of positions to process
 * ignore - byte value that must not be copied
 */
void copy_if(char* src, char* dest, size_t size, char ignore)
{
    for (size_t pos = 0; pos != size; pos++)
    {
        const char incoming = src[pos];
        const char existing = dest[pos];

        /* Always store; pick the old byte when the source byte is ignored. */
        dest[pos] = (incoming != ignore) ? incoming : existing;
    }
}
请注意,对dest[i]的加载和赋值都是无条件的,因此编译器不会受到“禁止在多线程程序中凭空引入(发明)存储操作”这一限制的约束。
对于-march=core-avx2
,生成的程序集包含此向量化循环,一次处理32个字节:
# AVX2 inner loop: each iteration handles 32 bytes.
# Presumably %rdi = src, %rsi = dest and %ymm0 holds `ignore` replicated in
# every byte lane; that setup happens before this excerpt.
.L9:
vmovdqu (%rdi,%rcx), %ymm1  # ymm1 = 32 bytes loaded from (%rdi + %rcx)
addq $1, %r10  # bump the iteration counter
vmovdqu (%rsi,%rcx), %ymm2  # ymm2 = 32 bytes loaded from (%rsi + %rcx)
vpcmpeqb %ymm0, %ymm1, %ymm3  # ymm3 = per-byte mask, all ones where ymm1 == ymm0
vpblendvb %ymm3, %ymm2, %ymm1, %ymm1  # ymm1 = per byte: mask set ? ymm2 : ymm1
vmovdqu %ymm1, (%rsi,%rcx)  # store the blended 32 bytes back to (%rsi + %rcx)
addq $32, %rcx  # advance the byte offset by one vector
cmpq %r10, %r8  # compare the counter against the limit in %r8
ja .L9  # loop while %r8 > %r10 (unsigned)
对于通用x86-64,生成的程序集包含此向量化循环,一次处理16个字节:
# Baseline SSE2 inner loop: each iteration handles 16 bytes.
# Presumably %rdi = src, %rsi = dest and %xmm0 holds `ignore` replicated in
# every byte lane; that setup happens before this excerpt.
.L9:
movdqu (%rdi,%r8), %xmm3  # xmm3 = 16 bytes loaded from (%rdi + %r8)
addq $1, %r10  # bump the iteration counter
movdqa %xmm3, %xmm1  # xmm1 = copy of the loaded bytes (will become the mask)
movdqu (%rsi,%r8), %xmm2  # xmm2 = 16 bytes loaded from (%rsi + %r8)
pcmpeqb %xmm0, %xmm1  # xmm1 = per-byte mask, all ones where xmm1 == xmm0
pand %xmm1, %xmm2  # xmm2 = bytes from (%rsi) kept only where the mask is set
pandn %xmm3, %xmm1  # xmm1 = bytes from (%rdi) kept only where the mask is clear
por %xmm2, %xmm1  # xmm1 = merge of the two disjoint selections
movdqu %xmm1, (%rsi,%r8)  # store the merged 16 bytes back to (%rsi + %r8)
addq $16, %r8  # advance the byte offset by one vector
cmpq %r9, %r10  # compare the counter against the limit in %r9
jb .L9  # loop while %r10 < %r9 (unsigned)
对于armv7l-neon,clang-3.7
生成以下循环,一次处理16个字节:
@ NEON inner loop: each iteration handles 16 bytes.
@ Presumably r5 walks src, r4 walks dest and q8 holds `ignore` replicated in
@ every byte lane; that setup happens before this excerpt.
.LBB0_9: @ %vector.body
@ =>This Inner Loop Header: Depth=1
vld1.8 {d18, d19}, [r5]!  @ q9 = 16 bytes loaded from [r5], then r5 is post-incremented
subs.w lr, lr, #16  @ lr -= 16 and set flags for the loop branch
vceq.i8 q10, q9, q8  @ q10 = per-byte mask, all ones where q9 == q8
vld1.8 {d22, d23}, [r4]  @ q11 = 16 bytes loaded from [r4]
vbsl q10, q11, q9  @ q10 = bitwise select: mask set ? q11 : q9
vst1.8 {d20, d21}, [r4]!  @ store q10 to [r4], then r4 is post-incremented
bne .LBB0_9  @ loop until lr reaches zero
因此,这段代码不仅比汇编或内在函数(intrinsics)更具可读性,而且还可以移植到多种体系结构和编译器。只需重新编译,就可以轻松利用新的体系结构和指令集扩展。
答案 1 :(得分:5)
以下是使用SSE2 intrinsics来利用maskmovdqu指令的示例。在Haswell CPU上,SIMD版本的运行速度似乎是原始版本的2倍(代码用clang编译):
#import <UIKit/UIKit.h>
// Collection-view cell subclass that exposes a single label outlet.
// NOTE(review): this declaration looks unrelated to the surrounding SSE2
// discussion; the snippet originally shown here was probably lost. Confirm.
@interface CellNumber : UICollectionViewCell
// Strong, nonatomic Interface Builder outlet for the cell's UILabel.
@property (strong, nonatomic)IBOutlet UILabel *lblNo;
@end
编译和测试:
#include <stdio.h>
#include <string.h>
#include <emmintrin.h> // SSE2
#include <sys/time.h> // gettimeofday
/*
 * Scalar reference implementation of the conditional copy: bytes are copied
 * from src to dest position for position, except those equal to `ignore`,
 * whose destination byte is left untouched.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; at most `size` bytes are written
 * size   - number of positions to examine
 * ignore - byte value that must not be copied
 */
void copy_if_ref(const uint8_t* src, uint8_t* dest, size_t size, uint8_t ignore)
{
    const uint8_t *const src_end = src + size;

    while (src != src_end)
    {
        const uint8_t byte = *src;
        if (byte != ignore)
        {
            *dest = byte;
        }
        ++src;
        ++dest;
    }
}
/*
 * SSE2 conditional copy using the byte-masked store _mm_maskmoveu_si128.
 * Full 16-byte chunks are compared against `ignore` and only the differing
 * lanes are written to dest; the remaining tail is copied byte by byte.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; only positions whose source byte differs from
 *          `ignore` are written
 * size   - number of positions to process
 * ignore - byte value that must not be copied
 */
void copy_if_SSE(const uint8_t* src, uint8_t* dest, size_t size, uint8_t ignore)
{
    const __m128i ignore_lanes = _mm_set1_epi8(ignore);
    const __m128i ones = _mm_set1_epi8(-1);
    size_t offset;

    /* Vector part: runs while a full 16-byte chunk is still available. */
    for (offset = 0; offset + 16 <= size; offset += 16)
    {
        const __m128i data = _mm_loadu_si128((const __m128i *)(src + offset));
        const __m128i equal = _mm_cmpeq_epi8(data, ignore_lanes);
        /* ~equal: lanes that must be stored have all bits set. */
        const __m128i store_mask = _mm_andnot_si128(equal, ones);

        _mm_maskmoveu_si128(data, store_mask, (char *)(dest + offset));
    }

    /* Scalar tail: fewer than 16 bytes remain. */
    for (; offset < size; ++offset)
    {
        const uint8_t byte = src[offset];
        if (byte != ignore)
        {
            dest[offset] = byte;
        }
    }
}
/*
 * TIME_IT(init, copy_if, src, dest, size, ignore)
 *
 * Benchmark helper. Runs `init` followed by `copy_if(src, dest, size, ignore)`
 * 1000 times, timing only the copy_if call with gettimeofday(), and prints
 * the average cost in nanoseconds per element, labelled with the name
 * passed as `copy_if`.
 *
 * init    - statement/expression executed (untimed) before every run
 * copy_if - function (or function-like macro) under test
 * src, dest, size, ignore - arguments forwarded to `copy_if`; `size` is also
 *           the element count used for the per-element average
 *
 * Every argument is parenthesized where it is used as a value so that
 * expression arguments (e.g. `n + 1`) expand correctly, the divisor is
 * computed in double so it cannot overflow an integer type, and the locals
 * carry a time_it_ prefix so they do not capture identifiers that appear in
 * the caller's arguments. Arguments are evaluated on every iteration.
 */
#define TIME_IT(init, copy_if, src, dest, size, ignore) \
do { \
    const int time_it_loops = 1000; \
    struct timeval time_it_t0, time_it_t1; \
    double time_it_ms = 0.0; \
    \
    for (int time_it_k = 0; time_it_k < time_it_loops; ++time_it_k) \
    { \
        init; \
        gettimeofday(&time_it_t0, NULL); \
        copy_if((src), (dest), (size), (ignore)); \
        gettimeofday(&time_it_t1, NULL); \
        time_it_ms += ((double)(time_it_t1.tv_sec - time_it_t0.tv_sec) + (double)(time_it_t1.tv_usec - time_it_t0.tv_usec) * 1.0e-6) * 1.0e3; \
    } \
    printf("%s: %.3g ns / element\n", #copy_if, time_it_ms * 1.0e6 / ((double)time_it_loops * (double)(size))); \
} while (0)
/*
 * Test and benchmark driver for the maskmove-based copy_if_SSE.
 *
 * Fills a source buffer and an initial destination image with rand() data,
 * checks that copy_if_SSE produces the same destination as copy_if_ref for
 * ignore == 0x42 (printing PASS/FAIL), then times both implementations.
 *
 * Returns 0 after the run completes, 1 if a buffer could not be allocated.
 */
int main(void)
{
    const size_t N = 10000000;
    int status = 1;
    uint8_t *src = malloc(N);
    uint8_t *dest_ref = malloc(N);
    uint8_t *dest_init = malloc(N);
    uint8_t *dest_test = malloc(N);

    /* Bail out before touching any buffer if an allocation failed. */
    if (src == NULL || dest_ref == NULL || dest_init == NULL || dest_test == NULL)
    {
        fprintf(stderr, "copy_if: out of memory\n");
        goto cleanup;
    }

    for (size_t i = 0; i < N; ++i)
    {
        src[i] = (uint8_t)rand();
        dest_init[i] = (uint8_t)rand();
    }

    /* Correctness check: both implementations start from the same image. */
    memcpy(dest_ref, dest_init, N);
    copy_if_ref(src, dest_ref, N, 0x42);
    memcpy(dest_test, dest_init, N);
    copy_if_SSE(src, dest_test, N, 0x42);
    printf("copy_if_SSE: %s\n", memcmp(dest_ref, dest_test, N) == 0 ? "PASS" : "FAIL");

    /* Timing: each run resets the buffer that the timed function writes. */
    TIME_IT(memcpy(dest_ref, dest_init, N), copy_if_ref, src, dest_ref, N, 0x42);
    TIME_IT(memcpy(dest_test, dest_init, N), copy_if_SSE, src, dest_test, N, 0x42);
    status = 0;

cleanup:
    free(dest_test);
    free(dest_init);
    free(dest_ref);
    free(src);
    return status;
}
(注意:这个答案的早期版本在计时代码中多乘了一个16的因子,所以早期的数字比实际应有的值高16倍。)
**更新**
受 @EOF 的解决方案和编译器生成的代码的启发,我尝试了另一种使用SSE4的方法,并获得了更好的结果:
$ gcc -Wall -msse2 -O3 copy_if.c && ./a.out
copy_if_SSE: PASS
copy_if_ref: 0.416 ns / element
copy_if_SSE: 0.239 ns / element
编译和测试:
#include <stdio.h>
#include <string.h>
#include <smmintrin.h> // SSE4
#include <sys/time.h> // gettimeofday
/*
 * Scalar reference implementation used as the baseline for the tests:
 * copies src to dest position for position, leaving the destination byte
 * untouched wherever the source byte equals `ignore`.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; at most `size` bytes are written
 * size   - number of positions to examine
 * ignore - byte value that must not be copied
 */
void copy_if_ref(const uint8_t* src, uint8_t* dest, size_t size, uint8_t ignore)
{
    for (size_t pos = 0; pos != size; ++pos)
    {
        const uint8_t byte = src[pos];
        if (byte == ignore)
        {
            continue;   /* skipped position: dest[pos] keeps its value */
        }
        dest[pos] = byte;
    }
}
/*
 * Branch-free conditional copy (the compiler-vectorizable form): each
 * iteration loads both the source and the destination byte and always
 * stores one of them back. Positions whose source byte equals `ignore` are
 * rewritten with their own old value; all others receive the source byte.
 *
 * The temporaries are uint8_t, matching the buffers and `ignore`. With plain
 * `char` temporaries (signed on many platforms) a source byte >= 0x80 would
 * become negative and never compare equal to an `ignore` value >= 0x80, so
 * such bytes would be copied instead of skipped.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; `size` bytes are read and rewritten
 * size   - number of positions to process
 * ignore - byte value that must not be copied
 */
void copy_if_EOF(const uint8_t* src, uint8_t* dest, size_t size, uint8_t ignore)
{
    for (size_t i = 0; i < size; ++i)
    {
        const uint8_t temps = src[i];
        const uint8_t tempd = dest[i];

        /* Unconditional store: only the stored value depends on the test. */
        dest[i] = (temps == ignore) ? tempd : temps;
    }
}
/*
 * SSE4.1 conditional copy using explicit load / blend / store.
 * For each full 16-byte chunk, the source and the current destination bytes
 * are both loaded, blended so that lanes equal to `ignore` keep the
 * destination byte, and the result is stored back unconditionally. Bytes
 * left over after the last full chunk are handled by a scalar loop.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; read and rewritten in the vector part, written
 *          conditionally in the scalar tail
 * size   - number of positions to process
 * ignore - byte value that must not be copied
 */
void copy_if_SSE(const uint8_t* src, uint8_t* dest, size_t size, uint8_t ignore)
{
// Broadcast `ignore` into all 16 byte lanes for the vector compare.
const __m128i vignore = _mm_set1_epi8(ignore);
size_t i;
// Vector part: runs while a full 16-byte chunk is still available.
for (i = 0; i + 16 <= size; i += 16)
{
// Unaligned loads of 16 source bytes and the 16 current destination bytes.
__m128i vsrc = _mm_loadu_si128((__m128i *)&src[i]);
__m128i vdest = _mm_loadu_si128((__m128i *)&dest[i]);
// 0xFF in every lane whose source byte equals `ignore`, 0x00 elsewhere.
__m128i vmask = _mm_cmpeq_epi8(vsrc, vignore);
// Masked lanes take the old destination byte, the others the source byte.
vdest = _mm_blendv_epi8(vsrc, vdest, vmask);
// Unconditional 16-byte store of the blended result.
_mm_storeu_si128 ((__m128i *)&dest[i], vdest);
}
// Scalar tail: fewer than 16 bytes remain.
for ( ; i < size; ++i)
{
if (src[i] != ignore)
dest[i] = src[i];
}
}
/*
 * TIME_IT(init, copy_if, src, dest, size, ignore)
 *
 * Benchmark helper. Runs `init` followed by `copy_if(src, dest, size, ignore)`
 * 1000 times, timing only the copy_if call with gettimeofday(), and prints
 * the average cost in nanoseconds per element, labelled with the name
 * passed as `copy_if`.
 *
 * init    - statement/expression executed (untimed) before every run
 * copy_if - function (or function-like macro) under test
 * src, dest, size, ignore - arguments forwarded to `copy_if`; `size` is also
 *           the element count used for the per-element average
 *
 * Every argument is parenthesized where it is used as a value so that
 * expression arguments (e.g. `n + 1`) expand correctly, the divisor is
 * computed in double so it cannot overflow an integer type, and the locals
 * carry a time_it_ prefix so they do not capture identifiers that appear in
 * the caller's arguments. Arguments are evaluated on every iteration.
 */
#define TIME_IT(init, copy_if, src, dest, size, ignore) \
do { \
    const int time_it_loops = 1000; \
    struct timeval time_it_t0, time_it_t1; \
    double time_it_ms = 0.0; \
    \
    for (int time_it_k = 0; time_it_k < time_it_loops; ++time_it_k) \
    { \
        init; \
        gettimeofday(&time_it_t0, NULL); \
        copy_if((src), (dest), (size), (ignore)); \
        gettimeofday(&time_it_t1, NULL); \
        time_it_ms += ((double)(time_it_t1.tv_sec - time_it_t0.tv_sec) + (double)(time_it_t1.tv_usec - time_it_t0.tv_usec) * 1.0e-6) * 1.0e3; \
    } \
    printf("%s: %.3g ns / element\n", #copy_if, time_it_ms * 1.0e6 / ((double)time_it_loops * (double)(size))); \
} while (0)
/*
 * Test and benchmark driver for copy_if_EOF and the blend-based copy_if_SSE.
 *
 * Fills a source buffer and an initial destination image with rand() data,
 * checks that each implementation produces the same destination as
 * copy_if_ref for ignore == 0x42 (printing PASS/FAIL), then times all three.
 *
 * Returns 0 after the run completes, 1 if a buffer could not be allocated.
 */
int main(void)
{
    const size_t N = 10000000;
    int status = 1;
    uint8_t *src = malloc(N);
    uint8_t *dest_ref = malloc(N);
    uint8_t *dest_init = malloc(N);
    uint8_t *dest_test = malloc(N);

    /* Bail out before touching any buffer if an allocation failed. */
    if (src == NULL || dest_ref == NULL || dest_init == NULL || dest_test == NULL)
    {
        fprintf(stderr, "copy_if: out of memory\n");
        goto cleanup;
    }

    for (size_t i = 0; i < N; ++i)
    {
        src[i] = (uint8_t)rand();
        dest_init[i] = (uint8_t)rand();
    }

    /* Reference result that both candidates are compared against. */
    memcpy(dest_ref, dest_init, N);
    copy_if_ref(src, dest_ref, N, 0x42);

    memcpy(dest_test, dest_init, N);
    copy_if_EOF(src, dest_test, N, 0x42);
    printf("copy_if_EOF: %s\n", memcmp(dest_ref, dest_test, N) == 0 ? "PASS" : "FAIL");

    memcpy(dest_test, dest_init, N);
    copy_if_SSE(src, dest_test, N, 0x42);
    printf("copy_if_SSE: %s\n", memcmp(dest_ref, dest_test, N) == 0 ? "PASS" : "FAIL");

    /* Timing: each run resets the buffer that the timed function writes. */
    TIME_IT(memcpy(dest_ref, dest_init, N), copy_if_ref, src, dest_ref, N, 0x42);
    TIME_IT(memcpy(dest_test, dest_init, N), copy_if_EOF, src, dest_test, N, 0x42);
    TIME_IT(memcpy(dest_test, dest_init, N), copy_if_SSE, src, dest_test, N, 0x42);
    status = 0;

cleanup:
    free(dest_test);
    free(dest_init);
    free(dest_ref);
    free(src);
    return status;
}
$ gcc -Wall -msse4 -O3 copy_if_2.c && ./a.out
copy_if_EOF: PASS
copy_if_SSE: PASS
copy_if_ref: 0.419 ns / element
copy_if_EOF: 0.114 ns / element
copy_if_SSE: 0.114 ns / element
结论:虽然 _mm_maskmoveu_si128 从功能角度来看似乎是解决此问题的一个很好的方案,但它的效率似乎不如使用显式加载、屏蔽和存储的做法(约 0.239 对 0.114 ns/元素)。此外,在这种情况下,编译器生成的代码(参见 @EOF 的答案)似乎与显式编写的SIMD代码一样快。
答案 2 :(得分:0)
以下是一种改进写法,不过编译器很可能自己就能做出同样的优化。
/*
 * Copy bytes from src to dest position for position, skipping every byte
 * equal to `ignore`; destination bytes at skipped positions are left as
 * they were.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output bytes; at most `size` bytes are written
 * size   - number of positions to examine
 * ignore - byte value that must not be copied
 */
void copy_if(char* src, char* dest, size_t size, char ignore)
{
    for (size_t idx = 0; idx < size; ++idx)
    {
        const char candidate = src[idx];
        if (candidate == ignore)
        {
            continue;   /* skipped position: dest[idx] keeps its value */
        }
        dest[idx] = candidate;
    }
}
答案 3 :(得分:0)
如果忽略的频率不高,则下面的memcpy代码可能会有所帮助。
/*
 * Copy every byte of src that differs from `ignore` into dest, one memcpy
 * per run of consecutive non-ignored bytes.
 *
 * The output is packed: dest advances only by the number of bytes actually
 * copied, so ignored bytes leave no gap in dest. The return value is the
 * number of bytes written.
 *
 * The run scan checks the position against `size` before reading a byte, so
 * nothing beyond src[size - 1] is read, including when the input does not
 * end with an `ignore` byte.
 *
 * src    - input bytes; `size` bytes are read
 * dest   - output buffer; must have room for up to `size` bytes and must
 *          not overlap src (memcpy is used)
 * size   - number of input bytes to examine
 * ignore - byte value that must not be copied
 *
 * Returns the number of bytes copied to dest.
 */
size_t copy_if(char* src, char* dest, size_t size, char ignore)
{
    size_t copied = 0;
    size_t pos = 0;

    while (pos < size)
    {
        /* Measure the run of non-ignored bytes starting at pos. */
        size_t run = 0;
        while (pos + run < size && src[pos + run] != ignore)
        {
            run++;
        }

        if (run > 0)
        {
            memcpy(dest + copied, src + pos, run);
            copied += run;
        }

        /* Step over the run and the ignore byte (or the end) after it. */
        pos += run + 1;
    }
    return copied;
}