我的代码中有热点,我正在执行pow()
占用执行时间的10-20%。
我对pow(x,y)
的输入是非常具体的,所以我想知道是否有办法滚动两个pow()
近似值(每个指数一个)具有更高的性能:
float
向量。如果可以利用平台细节,请立即使用!最大误差率约为0.01%是理想的,尽管我对全精度(float
)算法感兴趣。
我已经在使用快速pow()
approximation,但它没有考虑这些限制。有可能做得更好吗?
答案 0 :(得分:32)
答案 1 :(得分:23)
本着 IEEE 754 位技巧(hack)的思路,这是另一种更快、更少“魔法”的解决方案。它在大约十几个时钟周期内实现了0.08%的误差范围(对于p = 2.4,在Intel Merom CPU上)。
浮点数最初是作为对数的近似值发明的,因此您可以使用整数值作为log2
的近似值。通过将整数转换指令应用于浮点值,可以在某种程度上实现这一点,以获得另一个浮点值。
要完成 pow 计算,可以乘以常数因子,再用 convert-to-integer 指令把对数转换回去。在 SSE 上,相关指令为 cvtdq2ps 和 cvtps2dq。
x^p
= exp2( p * log2( x ) )
= exp2( p * ( log2( x ) + 127 - 127 ) - 127 + 127 )
= cvtps2dq( p * ( log2( x ) + 127 - 127 + 127 / p ) )
= cvtps2dq( p * ( log2( x ) + 127 - log2( exp2( 127 - 127 / p ) ) ) )
= cvtps2dq( p * ( log2( x * exp2( 127 / p - 127 ) ) + 127 ) )
= cvtps2dq( p * ( cvtdq2ps( x * exp2( 127 / p - 127 ) ) ) )
exp2( 127 / p - 127 )
是常数因素。这个函数相当专业:它不适用于小的分数指数,因为常数因子随着指数的倒数呈指数增长并且会溢出。它不适用于负指数。大指数导致高误差,因为尾数位通过乘法与指数位混合。
但是,这只是4个快速指令。预乘,从“整数”(到对数)转换,幂乘法,转换为“整数”(从对数)。对于SSE的实施,转换非常快。我们还可以在第一次乘法中压缩一个额外的常数系数。
/// Coarse SSE approximation of (coeffnum/coeffden) * x^(expnum/expden) for four packed floats.
///
/// The float bit pattern, read as an integer, behaves like a scaled and biased log2 of the
/// value. The routine therefore (1) pre-scales the input so the exponent bias and the
/// requested coefficient are folded into one multiply, (2) converts the bit pattern to a
/// float "logarithm" with cvtdq2ps, (3) multiplies that logarithm by the power, and
/// (4) converts back with cvtps2dq so the bits are again those of an ordinary float.
///
/// @tparam expnum, expden      numerator / denominator of the power
/// @tparam coeffnum, coeffden  numerator / denominator of the constant coefficient
/// @param  arg                 four packed single-precision inputs
/// @return four packed approximations, one per lane
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
__m128 fastpow( __m128 arg ) {
    // Constant pre-correction: removes the exponent bias (127) as seen through the power,
    // and folds in the coefficient raised to 1/power so it comes out as a plain factor.
    double const bias_factor = exp2( 127. * expden / expnum - 127. );
    double const coeff_factor = pow( 1. * coeffnum / coeffden, 1. * expden / expnum );
    __m128 const prescale = _mm_set1_ps( bias_factor * coeff_factor );
    __m128 const power = _mm_set1_ps( 1. * expnum / expden );

    __m128 const scaled = _mm_mul_ps( arg, prescale );

    // Treat the float bits as integers and convert them to float: this is the "logarithm".
    __m128 log_like;
    asm ( "cvtdq2ps %1, %0" : "=x" (log_like) : "x" (scaled) );

    // Raising to a power is a multiplication in the logarithmic domain.
    __m128 const powered = _mm_mul_ps( log_like, power );

    // Convert back to "integer"; the resulting bit pattern is the exponentiated float.
    __m128 result;
    asm ( "cvtps2dq %1, %0" : "=x" (result) : "x" (powered) );
    return result;
}
指数= 2.4的一些试验表明,这种情况持续高估约5%。 (例程总是保证高估。)你可以简单地乘以0.95,但是更多的指令会给我们提供大约4个十进制数字的准确度,这对于图形应该足够了。
关键是要将过高估计与低估相匹配,并取平均值。
rsqrtps
。 (这非常准确,但确实牺牲了使用零的能力。)mulps
。rsqrtps
。mulps
。mulps
。mulps
。这是高估。mulps
。这是低估的。addps
,一个mulps
。指令计数:十四,包括两次延迟= 5的转换和两次吞吐量= 4的倒数平方根估计。
为了正确地取平均值,我们希望根据预期误差对估算值进行加权。低估将误差提高到0.6和0.4的幂,所以我们预计它是错误的1.5倍。加权不添加任何说明;它可以在前因子中完成。调用系数a:a ^ 0.5 = 1.5 a ^ -0.75,a = 1.38316186。
最终错误大约为.015%,或比初始fastpow
结果好2个数量级。对于具有volatile
源和目标变量的繁忙循环,运行时大约是十几个周期......虽然它与迭代重叠,但实际使用情况也会看到指令级并行性。考虑到SIMD,这是每3个周期一个标量结果的吞吐量!
// Prints `label` followed by the four lanes of `v` (lane 0 first), separated by commas.
// This replaces the "%,vg" vector conversion, which standard printf does not define;
// passing an __m128 through printf's varargs is not portable.
static void print_ps( char const *label, __m128 v ) {
    float lane[4];
    _mm_storeu_ps( lane, v );
    std::printf( "%s%g,%g,%g,%g\n", label, lane[0], lane[1], lane[2], lane[3] );
}

// Demo: refines the coarse fastpow estimate into x^2.4 by averaging an overestimate
// (x^2 * x^0.4) with an underestimate (x^3 / x^0.6), printing each intermediate vector.
int main() {
    // _mm_set_ps lists lanes from highest to lowest, so lane 0 holds 1234.567.
    __m128 const x0 = _mm_set_ps( 0.01, 1, 5, 1234.567 );
    print_ps( "Input: ", x0 );

    // Approx 5% accuracy from one call. Always an overestimate.
    __m128 x1 = fastpow< 24, 10, 1, 1 >( x0 );
    print_ps( "Direct x^2.4: ", x1 );

    // Lower exponents provide lower initial error, but too low causes overflow.
    // The coefficient 1.38316186 weights the two estimates that are averaged below.
    __m128 xf = fastpow< 8, 10, int( 1.38316186 * 1e9 ), int( 1e9 ) >( x0 );
    print_ps( "1.38 x^0.8: ", xf );

    // Imprecise 4-cycle sqrt is still far better than fastpow, good enough.
    __m128 xfm4 = _mm_rsqrt_ps( xf );       // ~ x^-0.4
    __m128 xf4 = _mm_mul_ps( xf, xfm4 );    // ~ x^0.4

    // Precisely calculate x^2 and x^3
    __m128 x2 = _mm_mul_ps( x0, x0 );
    __m128 x3 = _mm_mul_ps( x2, x0 );

    // Overestimate of x^2 * x^0.4
    x2 = _mm_mul_ps( x2, xf4 );

    // Get x^-0.2 from x^0.4. Combine with x^-0.4 into x^-0.6 and x^2.4.
    __m128 xfm2 = _mm_rsqrt_ps( xf4 );
    x3 = _mm_mul_ps( x3, xfm4 );
    x3 = _mm_mul_ps( x3, xfm2 );

    print_ps( "x^2 * x^0.4: ", x2 );
    print_ps( "x^3 / x^0.6: ", x3 );

    // Average the two estimates; the divisor normalizes the sum of their weights.
    x2 = _mm_mul_ps( _mm_add_ps( x2, x3 ), _mm_set1_ps( 1/ 1.960131704207789 ) );

    // Final accuracy about 0.015%, 200x better than x^0.8 calculation.
    print_ps( "average = ", x2 );
}
嗯...对不起,我不能早点发布。把它扩展到 x^(1/2.4) 就留作练习 ;v)。
我实现了一个小型测试工具,以及与上述内容相对应的两个 x^(5/12) 案例。
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>

#include <emmintrin.h>
#include <xmmintrin.h>
using namespace std;
/// Coarse SSE approximation of (coeffnum/coeffden) * x^(expnum/expden) for four packed floats.
///
/// The float bit pattern, read as an integer, behaves like a scaled and biased log2 of the
/// value. The routine therefore (1) pre-scales the input so the exponent bias and the
/// requested coefficient are folded into one multiply, (2) converts the bit pattern to a
/// float "logarithm" with cvtdq2ps, (3) multiplies that logarithm by the power, and
/// (4) converts back with cvtps2dq so the bits are again those of an ordinary float.
///
/// @tparam expnum, expden      numerator / denominator of the power
/// @tparam coeffnum, coeffden  numerator / denominator of the constant coefficient
/// @param  arg                 four packed single-precision inputs
/// @return four packed approximations, one per lane
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
__m128 fastpow( __m128 arg ) {
    // Constant pre-correction: removes the exponent bias (127) as seen through the power,
    // and folds in the coefficient raised to 1/power so it comes out as a plain factor.
    double const bias_factor = exp2( 127. * expden / expnum - 127. );
    double const coeff_factor = pow( 1. * coeffnum / coeffden, 1. * expden / expnum );
    __m128 const prescale = _mm_set1_ps( bias_factor * coeff_factor );
    __m128 const power = _mm_set1_ps( 1. * expnum / expden );

    __m128 const scaled = _mm_mul_ps( arg, prescale );

    // Treat the float bits as integers and convert them to float: this is the "logarithm".
    __m128 log_like;
    asm ( "cvtdq2ps %1, %0" : "=x" (log_like) : "x" (scaled) );

    // Raising to a power is a multiplication in the logarithmic domain.
    __m128 const powered = _mm_mul_ps( log_like, power );

    // Convert back to "integer"; the resulting bit pattern is the exponentiated float.
    __m128 result;
    asm ( "cvtps2dq %1, %0" : "=x" (result) : "x" (powered) );
    return result;
}
/// Approximates arg^(12/5) per lane by averaging two estimates built from one fastpow call.
///
/// A weighted x^0.8 seed yields x^0.4, x^-0.4 and x^-0.2 through reciprocal square roots;
/// these are combined with exact x^2 and x^3 into x^2 * x^0.4 and x^3 * x^-0.6, whose sum
/// is then normalized.
__m128 pow125_4( __m128 arg ) {
    // Lower exponents provide lower initial error, but too low causes overflow.
    __m128 const seed = fastpow< 4, 5, int( 1.38316186 * 1e9 ), int( 1e9 ) >( arg );

    // Imprecise 4-cycle sqrt is still far better than fastpow, good enough.
    __m128 const inv_root = _mm_rsqrt_ps( seed );        // ~ x^-0.4
    __m128 const root = _mm_mul_ps( seed, inv_root );    // ~ x^0.4
    __m128 const inv_fourth = _mm_rsqrt_ps( root );      // ~ x^-0.2

    // Exact integer powers.
    __m128 const squared = _mm_mul_ps( arg, arg );
    __m128 const cubed = _mm_mul_ps( squared, arg );

    // Overestimate: x^2 * x^0.4.
    __m128 const over = _mm_mul_ps( squared, root );
    // Counterpart: x^3 * x^-0.4 * x^-0.2 (multiplied in that order).
    __m128 const under = _mm_mul_ps( _mm_mul_ps( cubed, inv_root ), inv_fourth );

    // Normalize the sum of the two weighted estimates.
    return _mm_mul_ps( _mm_add_ps( over, under ), _mm_set1_ps( 1/ 1.960131704207789 * 0.9999 ) );
}
/// Approximates arg^(5/12) per lane as the square root of a fastpow x^(5/6) estimate.
__m128 pow512_2( __m128 arg ) {
    // 5/12 is too small for fastpow directly, so estimate x^(10/12) and take its square root.
    __m128 const five_sixths = fastpow< 5, 6, int( 0.992245 * 1e9 ), int( 1e9 ) >( arg );
    // sqrt(v) computed as rsqrt(v) * v.
    __m128 const inv_root = _mm_rsqrt_ps( five_sixths );
    return _mm_mul_ps( inv_root, five_sixths );
}
/// Approximates arg^(5/12) per lane as the 4th root of an averaged x^(5/3) estimate.
///
/// 5/12 is too small for fastpow, so x^(20/12) = x^(5/3) is built two ways from a single
/// x^(2/3) seed (5/3 = 1 + 2/3 = 2 - 1/3), the two are summed and normalized, and two
/// successive square roots bring the result down to x^(5/12).
__m128 pow512_4( __m128 arg ) {
    // Seed a * x^(2/3) with weighting coefficient a = 2^(-2/3), chosen so a^(-1/2) = 2a.
    __m128 const seed = fastpow< 2, 3, int( 0.629960524947437 * 1e9 ), int( 1e9 ) >( arg );

    // x * x^(2/3)  and  x^2 * x^(-1/3): two estimates of x^(5/3).
    __m128 const over = _mm_mul_ps( arg, seed );
    __m128 const under = _mm_mul_ps( _mm_mul_ps( arg, arg ), _mm_rsqrt_ps( seed ) );

    // The two weights sum to 3a; divide that out (with a small empirical correction).
    __m128 const norm = _mm_set1_ps( 1/( 3 * 0.629960524947437 ) * 0.999852 );
    __m128 estimate = _mm_mul_ps( norm, _mm_add_ps( over, under ) );

    // Two square roots (each as v * rsqrt(v)) give the 4th root.
    for ( int pass = 0; pass != 2; ++ pass )
        estimate = _mm_mul_ps( estimate, _mm_rsqrt_ps( estimate ) );
    return estimate;
}
/// Advances every lane of `arg` by adding 4 to its 32-bit pattern, i.e. four representable
/// floats further from zero for finite values. Used by the test harness to walk through
/// float values in steps.
///
/// Uses the cast intrinsics instead of C-style vector casts, which are a compiler extension;
/// the casts only reinterpret bits and generate no instructions.
__m128 mm_succ_ps( __m128 arg ) {
    __m128i const bits = _mm_castps_si128( arg );
    __m128i const stepped = _mm_add_epi32( bits, _mm_set1_epi32( 4 ) );
    return _mm_castsi128_ps( stepped );
}
/// Measures the relative error of approximation `f` against ::powf(x, p).
///
/// Starting at FLT_MIN / FLT_EPSILON, skips inputs for which `f` returns a non-finite or
/// zero result, prints where the usable domain begins, then walks upward with mm_succ_ps
/// in buckets of 2^25 samples, printing per bucket the maximum absolute relative error,
/// the mean signed error, the mean absolute error and the input reached. Stops at the
/// first bucket cut short by a non-finite result. Only lane 0 is examined.
///
/// @param p  exponent passed to the reference ::powf
/// @param f  approximation under test
void test_pow( double p, __m128 (*f)( __m128 ) ) {
    __m128 arg = _mm_set1_ps( FLT_MIN / FLT_EPSILON );

    // Skip the low end where the approximation is not yet usable.
    while ( ! std::isfinite( _mm_cvtss_f32( f( arg ) ) ) )
        arg = mm_succ_ps( arg );
    while ( _mm_cvtss_f32( f( arg ) ) == 0 )
        arg = mm_succ_ps( arg );
    std::printf( "Domain from %g\n", _mm_cvtss_f32( arg ) );

    int const bucket_size = 1 << 25;
    int n;
    do {
        float max_error = 0;
        double total_error = 0, cum_error = 0;

        for ( n = 0; n != bucket_size; ++ n ) {
            float const approx = _mm_cvtss_f32( f( arg ) );
            if ( ! std::isfinite( approx ) ) break;    // left the usable domain

            float const exact = ::powf( _mm_cvtss_f32( arg ), p );
            float const signed_err = ( approx - exact ) / exact;
            float const abs_err = std::abs( signed_err );

            cum_error += signed_err;
            total_error += abs_err;
            max_error = std::max( max_error, abs_err );

            arg = mm_succ_ps( arg );
        }

        std::printf( "error max = %8g\t" "avg = %8g\t" "|avg| = %8g\t" "to %8g\n",
                     max_error, cum_error / n, total_error / n, _mm_cvtss_f32( arg ) );
    } while ( n == bucket_size );
}
int main() {
std::printf( "4 insn x^12/5:\n" );
test_pow( 12./5, & fastpow< 12, 5, 1059, 1000 > );
std::printf( "14 insn x^12/5:\n" );
test_pow( 12./5, & pow125_4 );
std::printf( "6 insn x^5/12:\n" );
test_pow( 5./12, & pow512_2 );
std::printf( "14 insn x^5/12:\n" );
test_pow( 5./12, & pow512_4 );
}
输出:
4 insn x^12/5:
Domain from 1.36909e-23
error max = inf avg = inf |avg| = inf to 8.97249e-19
error max = 2267.14 avg = 139.175 |avg| = 139.193 to 5.88021e-14
error max = 0.123606 avg = -0.000102963 |avg| = 0.0371122 to 3.85365e-09
error max = 0.123607 avg = -0.000108978 |avg| = 0.0368548 to 0.000252553
error max = 0.12361 avg = 7.28909e-05 |avg| = 0.037507 to 16.5513
error max = 0.123612 avg = -0.000258619 |avg| = 0.0365618 to 1.08471e+06
error max = 0.123611 avg = 8.70966e-05 |avg| = 0.0374369 to 7.10874e+10
error max = 0.12361 avg = -0.000103047 |avg| = 0.0371122 to 4.65878e+15
error max = 0.123609 avg = nan |avg| = nan to 1.16469e+16
14 insn x^12/5:
Domain from 1.42795e-19
error max = inf avg = nan |avg| = nan to 9.35823e-15
error max = 0.000936462 avg = 2.0202e-05 |avg| = 0.000133764 to 6.13301e-10
error max = 0.000792752 avg = 1.45717e-05 |avg| = 0.000129936 to 4.01933e-05
error max = 0.000791785 avg = 7.0132e-06 |avg| = 0.000129923 to 2.63411
error max = 0.000787589 avg = 1.20745e-05 |avg| = 0.000129347 to 172629
error max = 0.000786553 avg = 1.62351e-05 |avg| = 0.000132397 to 1.13134e+10
error max = 0.000785586 avg = 8.25205e-06 |avg| = 0.00013037 to 6.98147e+12
6 insn x^5/12:
Domain from 9.86076e-32
error max = 0.0284339 avg = 0.000441158 |avg| = 0.00967327 to 6.46235e-27
error max = 0.0284342 avg = -5.79938e-06 |avg| = 0.00897913 to 4.23516e-22
error max = 0.0284341 avg = -0.000140706 |avg| = 0.00897084 to 2.77556e-17
error max = 0.028434 avg = 0.000440504 |avg| = 0.00967325 to 1.81899e-12
error max = 0.0284339 avg = -6.11153e-06 |avg| = 0.00897915 to 1.19209e-07
error max = 0.0284298 avg = -0.000140597 |avg| = 0.00897084 to 0.0078125
error max = 0.0284371 avg = 0.000439748 |avg| = 0.00967319 to 512
error max = 0.028437 avg = -7.74294e-06 |avg| = 0.00897924 to 3.35544e+07
error max = 0.0284369 avg = -0.000142036 |avg| = 0.00897089 to 2.19902e+12
error max = 0.0284368 avg = 0.000439183 |avg| = 0.0096732 to 1.44115e+17
error max = 0.0284367 avg = -7.41244e-06 |avg| = 0.00897923 to 9.44473e+21
error max = 0.0284366 avg = -0.000141706 |avg| = 0.00897088 to 6.1897e+26
error max = 0.485129 avg = -0.0401671 |avg| = 0.048422 to 4.05648e+31
error max = 0.994932 avg = -0.891494 |avg| = 0.891494 to 2.65846e+36
error max = 0.999329 avg = nan |avg| = nan to -0
14 insn x^5/12:
Domain from 2.64698e-23
error max = 0.13556 avg = 0.00125936 |avg| = 0.00354677 to 1.73472e-18
error max = 0.000564988 avg = 2.51458e-06 |avg| = 0.000113709 to 1.13687e-13
error max = 0.000565065 avg = -1.49258e-06 |avg| = 0.000112553 to 7.45058e-09
error max = 0.000565143 avg = 1.5293e-06 |avg| = 0.000112864 to 0.000488281
error max = 0.000565298 avg = 2.76457e-06 |avg| = 0.000113713 to 32
error max = 0.000565453 avg = -1.61276e-06 |avg| = 0.000112561 to 2.09715e+06
error max = 0.000565531 avg = 1.42628e-06 |avg| = 0.000112866 to 1.37439e+11
error max = 0.000565686 avg = 2.71505e-06 |avg| = 0.000113715 to 9.0072e+15
error max = 0.000565763 avg = -1.56586e-06 |avg| = 0.000112415 to 1.84467e+19
我怀疑更准确的5/12的准确性受到rsqrt
操作的限制。
答案 2 :(得分:20)
Ian Stephenson写了this code,他声称自己的表现优于pow()
。他describes the idea如下:
pow 基本上是用对数实现的:
pow(a,b)=x^(logx(a)*b)
。所以我们需要快速的对数和快速的指数运算——x 取什么并不重要,所以我们使用 2。诀窍在于浮点数本身已经是对数形式的格式:a=M*2^E
对两边取对数得到:
log2(a)=log2(M)+E
或更简单:
log2(a)~=E
换句话说,如果我们取一个数的浮点表示并提取其指数,就得到了一个可以作为其对数的良好起点的值。事实证明,当我们通过操纵位模式来做这件事时,尾数最终会对误差给出一个不错的近似,效果很好。
这对于简单的光照计算应该足够了,但如果你需要更好的结果,可以提取尾数,并用它来计算一个相当精确的二次修正系数。
答案 3 :(得分:16)
首先,使用 float 在现今的大多数机器上并不会带来多少好处。事实上,double 可能更快。你的幂 1.0 / 2.4 是 5/12,即 1/3 *(1 + 1/4)。即使这需要调用 cbrt(一次)和 sqrt(两次!),它的速度仍然是使用 pow() 的两倍。(优化:-O3,编译器:i686-apple-darwin10-g++-4.2.1)。
#include <math.h> // cmath does not provide cbrt; C99 does.
// Computes x^(5/12) as x^(1/3) * x^(1/12): one cube root, then two nested square roots
// of that cube root supply the x^(1/12) factor.
double xpow512 (double x) {
    const double third = cbrt(x);                 // x^(1/3)
    const double twelfth = sqrt(sqrt(third));     // (x^(1/3))^(1/4) = x^(1/12)
    return third * twelfth;
}
答案 4 :(得分:15)
这可能无法解答您的问题。
2.4f
和1/2.4f
让我非常怀疑,因为这些正是用于在sRGB和线性RGB色彩空间之间进行转换的幂。因此,您可能实际上正是想优化这一转换。我不知道,这就是为什么这可能无法回答你的问题。
如果是这种情况,请尝试使用查找表。类似的东西:
__attribute__((aligned(64))
static const unsigned short SRGB_TO_LINEAR[256] = { ... };
__attribute__((aligned(64))
static const unsigned short LINEAR_TO_SRGB[256] = { ... };
void apply_lut(const unsigned short lut[256], unsigned char *src, ...
如果您使用的是16位数据,请根据需要进行更改。无论如何我都会将表格设为16位,因此在使用8位数据时,如果需要,可以将结果抖动。如果你的数据是浮点数,这显然不会很好 - 但是将sRGB数据存储在浮点状态并没有用,所以你最好先转换为16位/ 8位数据然后进行从线性到sRGB的转换。
(sRGB没有意义,因为浮点是HDR应该是线性的,sRGB只是方便存储在磁盘上或在屏幕上显示,但不便于操作。)
答案 5 :(得分:3)
Binomial series确实考虑了一个常数指数,但只有当你可以将所有输入归一化到范围[1,2]时,你才能使用它。 (注意它计算(1 + x)^ a)。您必须进行一些分析,以确定所需准确度所需的术语数量。
答案 6 :(得分:2)
我将回答你真正想要问的问题,即如何进行快速的 sRGB <-> 线性 RGB 转换。为了精确有效地做到这一点,我们可以使用多项式近似。使用sollya生成以下多项式近似值,最差情况下相对误差为0.0144%。
// Returns `p` multiplied by the x member of `globalmouseposition` (declared elsewhere).
// NOTE(review): this snippet does not match the surrounding text, which announces
// polynomial approximations -- confirm it belongs here.
float abc(float p){
    return p * globalmouseposition.x;
}
用于生成多项式的sollya输入:
// NOTE(review): the statements below look like several separate snippets pasted together
// (`app` is used before the line that declares it), and they do not match the surrounding
// text about generating polynomials -- confirm they belong here.

// Snippet 1: build a mutable array holding `notification` and install it as the
// application's scheduled local notifications.
NSMutableArray *notifications = [[NSMutableArray alloc] init];
[notifications addObject:notification];
app.scheduledLocalNotifications = notifications;
// Snippet 2: fetch the shared application and its scheduled notifications, then cancel
// the first one whose userInfo "uid" value equals `uidtodelete`.
UIApplication *app = [UIApplication sharedApplication];
NSArray *eventArray = [app scheduledLocalNotifications];
for (int i=0; i<[eventArray count]; i++)
{
UILocalNotification* oneEvent = [eventArray objectAtIndex:i];
NSDictionary *userInfoCurrent = oneEvent.userInfo;
// Format the "uid" value as a string so it can be compared with isEqualToString:.
NSString *uid=[NSString stringWithFormat:@"%@",[userInfoCurrent valueForKey:@"uid"]];
if ([uid isEqualToString:uidtodelete])
{
// Cancel the matching local notification and stop at the first match.
[app cancelLocalNotification:oneEvent];
break;
}
}
// Snippet 3: cancel every scheduled notification whose alertBody equals `savedTitle`
// (no break, so all matches are cancelled), logging each one.
NSArray *arrayOfLocalNotifications = [[UIApplication sharedApplication] scheduledLocalNotifications] ;
for (UILocalNotification *localNotification in arrayOfLocalNotifications) {
if ([localNotification.alertBody isEqualToString:savedTitle]) {
NSLog(@"the notification this is canceld is %@", localNotification.alertBody);
[[UIApplication sharedApplication] cancelLocalNotification:localNotification] ; // remove the notification from the system
}
}
答案 7 :(得分:1)
对于2.4的指数,您可以为所有2.4次幂的值创建一个查找表;如果表格不够准确,再用线性插值(lerp)或更高阶函数来填充中间(in-between)值(基本上就是一个巨大的对数表)。
或者,将值平方*值设置为2 / 5s,它可以从函数的前半部分获取初始平方值,然后从第5根开始。对于第5个根,你可以牛顿它或做一些其他快速逼近器,但老实说,一旦你达到这一点,你可能最好只用你自己的适当的缩写系列函数做exp和log函数。
答案 8 :(得分:1)
以下是您可以配合任何快速计算方法使用的一个想法。它是否能让事情变得更快,取决于数据的到达方式。您可以利用以下事实:如果您知道 x 和 pow(x, n),则可以使用幂的变化率,通过一次乘法和一次加法(或多或少),为较小的 delta 计算 pow(x + delta, n) 的合理近似值。如果您提供给幂函数的连续值足够接近,这将把精确计算的全部成本分摊到多次函数调用上。请注意,您无需额外的幂计算即可获得导数。您可以将其扩展为使用二阶导数,这样就可以使用二次近似,这将增大您可以使用的 delta,并且仍然可以获得相同的精度。
答案 9 :(得分:0)
传统上,powf(x, p) = x^p 通过把 x 重写为 x = 2^(log2(x)),得到 powf(x,p) = 2^(p*log2(x)) 来求解,从而将问题转换为 exp2() 和 log2() 两个近似。这样做的优点是可以处理更大的幂 p,但缺点是对于恒定的幂 p 和指定的输入范围 0 ≤ x ≤ 1,它不是最佳解决方案。
当幂 p > 1 时(例如 p = 12/5 = 2.4),答案是在 0 ≤ x ≤ 1
范围内的一个微不足道的极小极大多项式,float pow12_5(float x){
float mp;
// Minimax Horner-form polynomials for x^(12/5) on 0 <= x <= 1. (An earlier comment said
// x^(5/12), but the active coefficients evaluate to about 0.189 at x = 0.5, which is
// 0.5^2.4.) Choose the accuracy required, then implement with fma() [fused multiply-add].
// The hex literals have no suffix, so the expression is evaluated in double and narrowed
// to float on assignment. The number trailing each alternative is presumably its maximum
// absolute error -- TODO confirm.
// mp = 0x4.a84a38p-12 + x * (-0xd.e5648p-8 + x * (0xa.d82fep-4 + x * 0x6.062668p-4)); // 1.13705697e-3
mp = 0x1.117542p-12 + x * (-0x5.91e6ap-8 + x * (0x8.0f50ep-4 + x * (0xa.aa231p-4 + x * (-0x2.62787p-4)))); // 2.6079002e-4
// mp = 0x5.a522ap-16 + x * (-0x2.d997fcp-8 + x * (0x6.8f6d1p-4 + x * (0xf.21285p-4 + x * (-0x7.b5b248p-4 + x * 0x2.32b668p-4)))); // 8.61377e-5
// mp = 0x2.4f5538p-16 + x * (-0x1.abcdecp-8 + x * (0x5.97464p-4 + x * (0x1.399edap0 + x * (-0x1.0d363ap0 + x * (0xa.a54a3p-4 + x * (-0x2.e8a77cp-4)))))); // 3.524655e-5
return(mp);
}
p = 12/5 = 2.4 就是这种情况,如上面的代码所示。
然而,当 p < 1 时,0 ≤ x ≤ 1 上的极小极大近似没有适当地收敛到所需的精度。一个选项[不是真的]是把问题重写为 y=x^p=x^(p+m)/x^m,其中 m=1,2,3 是一个正整数,使新的幂近似满足 p > 1,但这引入了本质上较慢的除法。
然而,另一种选择是将输入 x 分解为其浮点指数和尾数形式:
x = mx* 2^(ex) where 1 ≤ mx < 2
y = x^(5/12) = mx^(5/12) * 2^((5/12)*ex), let ey = floor(5*ex/12), k = (5*ex) % 12
= mx^(5/12) * 2^(k/12) * 2^(ey)
在 1 ≤ mx < 2 范围内,mx^(5/12)
超过float powk_12LUT[] = {0x1.0p0, 0x1.0f38fap0, 0x1.1f59acp0, 0x1.306fep0, 0x1.428a3p0, 0x1.55b81p0, 0x1.6a09e6p0, 0x1.7f910ep0, 0x1.965feap0, 0x1.ae89fap0, 0x1.c823ep0, 0x1.e3437ep0};
/// Approximates x^(5/12) for a positive float by splitting it into exponent and mantissa.
///
/// With x = mx * 2^ex (1 <= mx < 2), the result is
///   mx^(5/12) * 2^(k/12) * 2^ey,  where ey = floor(5*ex/12) and k/12 is the fractional part.
/// mx^(5/12) comes from a polynomial on [1,2), 2^(k/12) from the 12-entry powk_12LUT, and
/// 2^ey is assembled directly as a float bit pattern.
///
/// @param x  input; zero and subnormal inputs return 0. The sign bit is not masked out of
///           the exponent field, so negative inputs are not handled.
/// @return   approximation of x^(5/12)
float pow5_12(float x){
    // Union punning between float and its bit pattern, as in the original code.
    union{float f; uint32_t u;} v, e2;
    v.f = x;

    // Biased exponent field. A field of 0 encodes zero and subnormals; the previous guard
    // (unbiased exponent < -127) could never be true, so an input of 0 produced a small
    // non-zero result instead of 0.
    const uint32_t biased = v.u >> 23;
    if(biased == 0) return(0.0f);
    const int xe = static_cast<int>(biased) - 127;

    // Split (5/12)*ex into integer part ei and fractional part; the fraction selects 2^(k/12).
    const float e = xe * (5.0f/12.0f);
    const float ei = floorf(e);
    const float poff = powk_12LUT[static_cast<int>(12.0f * (e - ei))];

    e2.u = (static_cast<int>(ei) + 127) << 23; // Bit pattern of 2^ei
    v.u = (v.u & ~(0xFFuL << 23)) | (0x7FuL << 23); // Force the exponent to 0 so v.f = mx in [1,2)

    // Approximate mx^(5/12) on [1,2) with a minimax polynomial of the chosen degree.
    // The number trailing each alternative is presumably its maximum error -- TODO confirm.
    float m;
    // m = 0x8.87592p-4 + v.f * (0x8.8f056p-4 + v.f * (-0x1.134044p-4)); // 7.6125e-4
    // m = 0x7.582138p-4 + v.f * (0xb.1666bp-4 + v.f * (-0x2.d21954p-4 + v.f * 0x6.3ea0cp-8)); // 8.4522726e-5
    m = 0x6.9465cp-4 + v.f * (0xd.43015p-4 + v.f * (-0x5.17b2a8p-4 + v.f * (0x1.6cb1f8p-4 + v.f * (-0x2.c5b76p-8)))); // 1.04091259e-5
    // m = 0x6.08242p-4 + v.f * (0xf.352bdp-4 + v.f * (-0x7.d0c1bp-4 + v.f * (0x3.4d153p-4 + v.f * (-0xc.f7a42p-8 + v.f * 0x1.5d840cp-8)))); // 1.367401e-6

    return(m * poff * e2.f);
}
的极小极大近似现在比以前快得多,没有除法,但需要RequestMapping(value = "/reports/performance/indicator/{indicatorType}", method = RequestMethod.POST)
public String generatePerformanceReportsIndicator(ModelMap map,HttpServletResponse response, @PathVariable("indicatorType") Long indicatorType,
@RequestParam(value = "siteIds", required = false) List<Long> siteIds,
@RequestParam(value = "timeframeIds", required = false) List<String> timeframeIds,
@RequestParam(value = "showTarget", required = false) String showTarget,Locale locale) {
的12点LUT。代码如下:
new FirstController().generatePerformanceReportsIndicator(....);