在许多场合(例如生物信息学),对字节大小的整数进行计算就足够了。为了获得最佳性能,许多处理器架构都提供SIMD指令集(例如MMX、SSE、AVX),它们将寄存器划分为字节、半字和字大小的分量,然后分别对相应的分量执行算术、逻辑和移位操作。
然而,某些架构不提供这样的SIMD指令,需要对其进行仿真,这通常需要大量的位操作技巧(bit twiddling)。目前,我正在研究SIMD比较,特别是有符号的字节大小整数的并行比较。我有一个使用可移植C代码的解决方案,我认为它相当高效(参见下面的函数vsetles4)。它基于Peter Montgomery于2000年在一篇新闻组帖子中提出的观察:(A+B)/2 = (A AND B) + (A XOR B)/2,其中间计算不会溢出。
这个特定的仿真代码(函数vsetles4)是否可以进一步加速?粗略地说,任何基本操作数更少的解决方案都符合要求。我正在寻找可移植的ISO-C99解决方案,不使用特定于机器的内在函数(intrinsics)。大多数架构都支持ANDN(a & ~b),因此可以假设它在效率上相当于单个操作。
#include <stdint.h>
/*
vsetles4 treats its inputs as arrays of bytes, each of which comprises
a signed integer in [-128,127]. Compute in byte-wise fashion, between
corresponding bytes of 'a' and 'b', the boolean predicate "less than
or equal" as a value in [0,1] into the corresponding byte of the result.
*/
/*
 * Reference implementation.
 *
 * Evaluates the signed "less than or equal" predicate one byte lane at a
 * time. Byte lane i of the result is 1 when the signed byte in lane i of
 * 'a' is <= the signed byte in lane i of 'b', and 0 otherwise.
 */
uint32_t vsetles4_ref (uint32_t a, uint32_t b)
{
    uint32_t result = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        int shift = 8 * lane;
        /* Isolate the lane, then reinterpret it as a signed byte. */
        int8_t lhs = (int8_t)(uint8_t)((a >> shift) & 0xff);
        int8_t rhs = (int8_t)(uint8_t)((b >> shift) & 0xff);
        uint32_t flag = (uint32_t)((int32_t)lhs <= (int32_t)rhs);

        result |= flag << shift;
    }
    return result;
}
/*
 * Optimized implementation.
 *
 * Derivation, per signed byte:
 *   a <= b  <=>  a - b <= 0  <=>  a + ~b + 1 <= 0
 *           <=>  a + ~b < 0  <=>  (a + ~b)/2 < 0
 * So the predicate is the sign bit of avg(a, ~b), computed without
 * overflow and rounding towards -INF as
 *   (a & ~b) + arithmetic_right_shift(a ^ ~b, 1).
 * The sign bit of each byte is then moved down to bit 0 of that byte.
 */
uint32_t vsetles4 (uint32_t a, uint32_t b)
{
    const uint32_t not_b = ~b;
    const uint32_t both  = a & not_b;               /* a & ~b */
    const uint32_t diff  = a ^ not_b;               /* a ^ ~b */
    /* Logical part of the per-byte arithmetic shift; the mask keeps bits
       from sliding into the neighbouring byte. */
    const uint32_t half  = (diff & 0xfefefefe) >> 1;
    /* XOR with 'diff' completes the arithmetic shift and the addition in
       the MSB of each byte; keep only those MSBs. */
    const uint32_t sign  = ((both + half) ^ diff) & 0x80808080;

    return sign >> 7;                               /* predicate in [0,1] per byte */
}
答案 0 :(得分:0)
为了[可能]为你节省一些工作并回答user2357112的问题,我为此创建了一个[粗略的]基准测试。我还加入了一次处理一个字节的版本作为基准参考:
// NOTE(review): this fluent call chain is not C and does not appear to be
// related to the surrounding benchmark discussion -- presumably misplaced
// during extraction; confirm against the original page.
// As written it passes parameters "APPID" and "q", calls get() on
// EndPoint.GET_ENDPOINT, expects status code 200 and logs everything.
given().
param("APPID","xxxxxx").
param("q","London").
get(EndPoint.GET_ENDPOINT).
then().
statusCode(200).
log().everything();
以下是基准测试程序:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
/* -R<n> command-line option: seed passed to srand() (main defaults it to 1) */
long opt_R;
/* -N<n> command-line option: number of 32-bit words per array
   (main defaults it to 100000000) */
long opt_N;
/* Benchmark buffers allocated in main(): aptr and bptr are the two inputs
   handed to the timed function, cptr receives its output. */
void *aptr;
void *bptr;
void *cptr;
/*
vsetles4 treats its inputs as arrays of bytes, each of which comprises
a signed integer in [-128,127]. Compute in byte-wise fashion, between
corresponding bytes of 'a' and 'b', the boolean predicate "less than
or equal" as a value in [0,1] into the corresponding byte of the result.
*/
/*
 * Baseline implementation: one signed byte comparison per iteration.
 *
 * va, vb - input arrays, each holding 'count' 32-bit words (4 * count bytes)
 * count  - number of 32-bit words to process
 * vc     - output array of 4 * count bytes; each byte receives 1 when the
 *          corresponding signed byte of va is <= that of vb, else 0
 *
 * The inputs are read through 'signed char' rather than plain 'char':
 * the signedness of plain 'char' is implementation-defined, and on
 * targets where it is unsigned the comparison would silently become an
 * unsigned one and disagree with the other implementations.
 */
void
vsetles4_base(const void *va, const void *vb, long count, void *vc)
{
    const signed char *lhs = va;
    const signed char *rhs = vb;
    unsigned char *out = vc;
    long nbytes = count * 4;
    long idx;

    for (idx = 0; idx < nbytes; ++idx) {
        out[idx] = (unsigned char)(lhs[idx] <= rhs[idx]);
    }
}
/*
 * Reference implementation (inline helper).
 *
 * Walks the four byte lanes of 'a' and 'b', compares each pair as signed
 * 8-bit integers and sets bit 0 of the matching result byte when the
 * lane of 'a' is less than or equal to the lane of 'b'.
 */
static inline uint32_t
_vsetles4_ref(uint32_t a, uint32_t b)
{
    uint32_t packed = 0;
    unsigned shift;

    for (shift = 0; shift < 32; shift += 8) {
        /* Extract the lane and reinterpret it as a signed byte. */
        int8_t sa = (int8_t)(uint8_t)((a >> shift) & 0xff);
        int8_t sb = (int8_t)(uint8_t)((b >> shift) & 0xff);

        if ((int32_t)sa <= (int32_t)sb)
            packed |= (uint32_t)1 << shift;
    }
    return packed;
}
/* Externally visible entry point: forwards to the inline reference helper
   and returns its per-byte [0,1] predicate word unchanged. */
uint32_t
vsetles4_ref(uint32_t a, uint32_t b)
{
    const uint32_t lanes = _vsetles4_ref(a, b);

    return lanes;
}
/*
 * Optimized implementation (inline helper), four signed bytes per call.
 *
 * Per byte: a <= b  <=>  a - b <= 0  <=>  a + ~b + 1 <= 0
 *                   <=>  a + ~b < 0  <=>  (a + ~b)/2 < 0.
 * avg(a, ~b) is computed without overflow, rounding towards -INF, as
 *   (a & ~b) + arithmetic_right_shift(a ^ ~b, 1)
 * which leaves the predicate in the MSB of each byte; the final shift
 * moves it to bit 0 of the byte.
 */
static inline uint32_t
_vsetles4(uint32_t a, uint32_t b)
{
    const uint32_t inv_b   = ~b;
    const uint32_t common  = a & inv_b;             /* a & ~b */
    const uint32_t differ  = a ^ inv_b;             /* a ^ ~b */
    /* Logical portion of the arithmetic right shift; masking first keeps
       bits from crossing byte boundaries. */
    const uint32_t shifted = (differ & 0xfefefefe) >> 1;
    /* The XOR completes the arithmetic shift and the addition. */
    const uint32_t summed  = (common + shifted) ^ differ;

    /* Keep each byte's MSB and move it down to bit 0. */
    return (summed & 0x80808080) >> 7;
}
/* Externally visible entry point: forwards to the inline optimized helper
   and returns its per-byte [0,1] predicate word unchanged. */
uint32_t
vsetles4(uint32_t a, uint32_t b)
{
    const uint32_t lanes = _vsetles4(a, b);

    return lanes;
}
/*
 * Optimized implementation, widened to eight signed bytes per call.
 *
 * Per byte: a <= b  <=>  a - b <= 0  <=>  a + ~b + 1 <= 0
 *                   <=>  a + ~b < 0  <=>  (a + ~b)/2 < 0.
 * avg(a, ~b) is computed without overflow, rounding towards -INF, as
 *   (a & ~b) + arithmetic_right_shift(a ^ ~b, 1)
 * giving the predicate in the MSB of each byte, which is then moved to
 * bit 0 of that byte.
 */
static inline uint64_t
_vsetles8(uint64_t a, uint64_t b)
{
    uint64_t m, s, t, nb;

    nb = ~b;                                   // ~b
    s = a & nb;                                // a & ~b
    t = a ^ nb;                                // a ^ ~b
    m = t & UINT64_C(0xfefefefefefefefe);      // don't cross byte boundaries during shift
    m = m >> 1;                                // logical portion of arithmetic right shift
    s = s + m;                                 // start (a & ~b) + arithmetic_right_shift (a ^ ~b, 1)
    s = s ^ t;                                 // complete arithmetic right shift and addition
    s = s & UINT64_C(0x8080808080808080);      // MSB of each byte now contains predicate
    t = s >> 7;                                // result is byte-wise predicate in [0,1]
    return t;
}

/*
 * Externally visible entry point for the eight-byte variant.
 *
 * Returns the full 64-bit predicate word. The return type is uint64_t so
 * the predicates of the upper four byte lanes are not truncated away.
 */
uint64_t
vsetles8(uint64_t a, uint64_t b)
{
    return _vsetles8(a, b);
}
/*
 * Apply the reference implementation to whole arrays.
 *
 * va, vb - input arrays of 'count' 32-bit words
 * count  - number of 32-bit words
 * vc     - output array of 'count' 32-bit words
 */
void
aryref(const void *va,const void *vb,long count,void *vc)
{
    const uint32_t *lhs = va;
    const uint32_t *rhs = vb;
    uint32_t *out = vc;
    long i = 0;

    while (i < count) {
        out[i] = _vsetles4_ref(lhs[i], rhs[i]);
        ++i;
    }
}
/*
 * Apply the optimized 4-byte implementation to whole arrays.
 *
 * va, vb - input arrays of 'count' 32-bit words
 * count  - number of 32-bit words
 * vc     - output array of 'count' 32-bit words
 */
void
arybest4(const void *va,const void *vb,long count,void *vc)
{
    const uint32_t *lhs = va;
    const uint32_t *rhs = vb;
    uint32_t *out = vc;
    long i = 0;

    while (i < count) {
        out[i] = _vsetles4(lhs[i], rhs[i]);
        ++i;
    }
}
/*
 * Apply the optimized 8-byte implementation to whole arrays.
 *
 * va, vb - input arrays of 'count' 32-bit words
 * count  - number of 32-bit words (not 64-bit words)
 * vc     - output array of 'count' 32-bit words
 *
 * Words are processed two at a time as one 64-bit value. When 'count' is
 * odd, the final 32-bit word is handled byte by byte so that it is not
 * silently skipped.
 *
 * NOTE(review): the buffers are accessed as uint64_t, so this assumes
 * they are suitably aligned for that type -- true for the calloc()ed
 * buffers main() passes in; confirm for any other caller.
 */
void
arybest8(const void *va,const void *vb,long count,void *vc)
{
    const uint64_t *lhs = va;
    const uint64_t *rhs = vb;
    uint64_t *out = vc;
    long pairs = count >> 1;
    long idx;

    for (idx = 0; idx < pairs; ++idx)
        out[idx] = _vsetles8(lhs[idx], rhs[idx]);

    if (count > 0 && (count & 1)) {
        /* Odd trailing word: compare its four bytes as signed values. */
        const signed char *tail_a = (const signed char *)va + 4 * (count - 1);
        const signed char *tail_b = (const signed char *)vb + 4 * (count - 1);
        unsigned char *tail_c = (unsigned char *)vc + 4 * (count - 1);
        int k;

        for (k = 0; k < 4; ++k)
            tail_c[k] = (unsigned char)(tail_a[k] <= tail_b[k]);
    }
}
/*
 * Return the current CLOCK_REALTIME reading as fractional seconds.
 *
 * The nanosecond part is scaled to seconds and added to the whole-second
 * part. The return value of clock_gettime() is not checked.
 */
double
tvgetf(void)
{
    struct timespec now;

    clock_gettime(CLOCK_REALTIME,&now);

    return (double)now.tv_nsec / 1e9 + (double)now.tv_sec;
}
/*
 * Time a single call of 'fnc' over the global buffers and print the result.
 *
 * fnc - function to measure; called as fnc(aptr, bptr, opt_N, cptr)
 * sym - label printed after the elapsed time
 */
void
timeit(void (*fnc)(const void *,const void *,long,void *),const char *sym)
{
    const double started = tvgetf();

    fnc(aptr,bptr,opt_N,cptr);

    const double elapsed = tvgetf() - started;

    printf("timeit: %.9f %s\n",elapsed,sym);
}
// fill -- store one rand() value into each of the opt_N 32-bit words at vptr
// (rand() returns a non-negative int, so the sign bit of each word's top
// byte is never set by this fill)
void
fill(void *vptr)
{
    uint32_t *cursor = vptr;
    long remaining;

    for (remaining = opt_N; remaining > 0; --remaining)
        *cursor++ = rand();
}
// main -- main program
//
// Options (each given as a single argument, value attached to the letter):
//   -R<n>  seed for srand() (default 1)
//   -N<n>  number of 32-bit words per array (default 100000000)
// Option parsing stops at the first argument that does not start with '-'.
//
// Allocates the three benchmark buffers, fills the two inputs with rand()
// values and times each implementation once ("base" is timed first and
// last). Returns 0 on success, 1 on an invalid -N or allocation failure.
int
main(int argc,char **argv)
{
    char *cp;
    int rc = 1;

    --argc;
    ++argv;

    for (; argc > 0; --argc, ++argv) {
        cp = *argv;
        if (*cp != '-')
            break;

        switch (cp[1]) {
        case 'R':
            opt_R = strtol(cp + 2,&cp,10);
            break;

        case 'N':
            opt_N = strtol(cp + 2,&cp,10);
            break;

        default:
            break;
        }
    }

    if (opt_R == 0)
        opt_R = 1;
    srand((unsigned)opt_R);
    printf("R=%ld\n",opt_R);

    if (opt_N == 0)
        opt_N = 100000000;
    if (opt_N < 0) {
        // A negative count would turn into a huge size_t in calloc().
        fprintf(stderr,"invalid -N value: %ld\n",opt_N);
        return 1;
    }
    printf("N=%ld\n",opt_N);

    aptr = calloc((size_t)opt_N,sizeof(uint32_t));
    bptr = calloc((size_t)opt_N,sizeof(uint32_t));
    cptr = calloc((size_t)opt_N,sizeof(uint32_t));

    // The buffers are large by default; bail out instead of dereferencing
    // NULL inside fill() or the timed functions.
    if (aptr == NULL || bptr == NULL || cptr == NULL) {
        fprintf(stderr,"out of memory allocating %ld words per array\n",opt_N);
        goto done;
    }

    fill(aptr);
    fill(bptr);

    timeit(vsetles4_base,"base");
    timeit(aryref,"aryref");
    timeit(arybest4,"arybest4");
    timeit(arybest8,"arybest8");
    timeit(vsetles4_base,"base");

    rc = 0;

done:
    free(cptr);
    free(bptr);
    free(aptr);
    cptr = NULL;
    bptr = NULL;
    aptr = NULL;

    return rc;
}
请注意,你的参考实现并不比一次处理一个字节快多少[在我看来,几乎不值得为此增加复杂度]。
您的优化算法在不使用SIMD的情况下确实提供了最佳性能,我将其扩展为使用uint64_t,这[自然]再次使速度加倍。以下是运行的输出:
R=1
N=100000000
timeit: 0.550527096 base
timeit: 0.483014107 aryref
timeit: 0.236460924 arybest4
timeit: 0.147254944 arybest8
timeit: 0.440311432 base
对SIMD版本进行基准测试可能也很有趣,只是为了证明它们确实是最快的。