有人能找到任何方法来提高下一个Bilinear调整大小算法的速度吗? 我需要提高速度,因为这是至关重要的,保持良好的图像质量。预计将用于具有低速CPU的移动设备。 该算法主要用于大规模调整大小。任何其他更快的双线性算法也将被理解。感谢
void resize(int* input, int* output, int sourceWidth, int sourceHeight, int targetWidth, int targetHeight)
{
int a, b, c, d, x, y, index;
float x_ratio = ((float)(sourceWidth - 1)) / targetWidth;
float y_ratio = ((float)(sourceHeight - 1)) / targetHeight;
float x_diff, y_diff, blue, red, green ;
int offset = 0 ;
for (int i = 0; i < targetHeight; i++)
{
for (int j = 0; j < targetWidth; j++)
{
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
a = input[index] ;
b = input[index + 1] ;
c = input[index + sourceWidth] ;
d = input[index + sourceWidth + 1] ;
// blue element
blue = (a&0xff)*(1-x_diff)*(1-y_diff) + (b&0xff)*(x_diff)*(1-y_diff) +
(c&0xff)*(y_diff)*(1-x_diff) + (d&0xff)*(x_diff*y_diff);
// green element
green = ((a>>8)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>8)&0xff)*(x_diff)*(1-y_diff) +
((c>>8)&0xff)*(y_diff)*(1-x_diff) + ((d>>8)&0xff)*(x_diff*y_diff);
// red element
red = ((a>>16)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>16)&0xff)*(x_diff)*(1-y_diff) +
((c>>16)&0xff)*(y_diff)*(1-x_diff) + ((d>>16)&0xff)*(x_diff*y_diff);
output [offset++] =
0x000000ff | // alpha
((((int)red) << 24)&0xff0000) |
((((int)green) << 16)&0xff00) |
((((int)blue) << 8)&0xff00);
}
}
}
答案 0 :(得分:3)
脱离我的头顶:
当然,还要进行大量的分析和测量。
答案 1 :(得分:3)
缓存您算法中的计算。
避免重复计算(例如(1-y_diff)
或(x_ratio * j)
)
浏览算法的所有行,并尝试识别重复的模式。将这些提取到局部变量。并且可能提取到函数,如果它们足够短以便内联,以使事物更具可读性。
使用查找表
很可能,如果你可以节省一些内存,你可以为你的RGB值实现一个“存储”,并根据产生它们的输入简单地“获取”它们。也许你不需要存储所有这些,但你可以试验,看看是否有人经常回来。或者,你可以“捏造”你的颜色,从而最终得到更少的值来存储更多的查找输入。
如果您知道输入的边界,则可以计算完整的域空间并找出缓存的意义。例如,如果您无法缓存整个R
,G
,B
值,则可能至少可以预先计算转换((b>>16)
等等。 ..)在您的情况下最可能是确定性的。)
如果您可以避免double
和float
个变量,请使用int
。在大多数体系结构中,由于内存模型,int
将测试更快的计算类型。您只需移动单位即可获得不错的精确度(例如,使用1026
作为int
,而不是1.026
作为double
或float
。这个技巧很可能对你来说已经足够了。
答案 2 :(得分:0)
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
肯定可以使用一些优化:您之前几个周期使用x_ration * j-1
,所以您真正需要的是x+=x_ratio
答案 3 :(得分:0)
这是我的版本,偷了一些想法。我的C-fu非常弱,所以有些行是伪代码,但你可以修复它们。
void resize(int* input, int* output,
int sourceWidth, int sourceHeight,
int targetWidth, int targetHeight
) {
// Let's create some lookup tables!
// you can move them into 2-dimensional arrays to
// group together values used at the same time to help processor cache
int sx[0..targetWidth ]; // target->source X lookup
int sy[0..targetHeight]; // target->source Y lookup
int mx[0..targetWidth ]; // left pixel's multiplier
int my[0..targetHeight]; // bottom pixel's multiplier
// we don't have to calc indexes every time, find out when
bool reloadPixels[0..targetWidth ];
bool shiftPixels[0..targetWidth ];
int shiftReloadPixels[0..targetWidth ]; // can be combined if necessary
int v; // temporary value
for (int j = 0; j < targetWidth; j++){
// (8bit + targetBits + sourceBits) should be < max int
v = 256 * j * (sourceWidth-1) / (targetWidth-1);
sx[j] = v / 256;
mx[j] = v % 256;
reloadPixels[j] = j ? ( sx[j-1] != sx[j] ? 1 : 0)
: 1; // always load first pixel
// if no reload -> then no shift too
shiftPixels[j] = j ? ( sx[j-1]+1 = sx[j] ? 2 : 0)
: 0; // nothing to shift at first pixel
shiftReloadPixels[j] = reloadPixels[i] | shiftPixels[j];
}
for (int i = 0; i < targetHeight; i++){
v = 256 * i * (sourceHeight-1) / (targetHeight-1);
sy[i] = v / 256;
my[i] = v % 256;
}
int shiftReload;
int srcIndex;
int srcRowIndex;
int offset = 0;
int lm, rm, tm, bm; // left / right / top / bottom multipliers
int a, b, c, d;
for (int i = 0; i < targetHeight; i++){
srcRowIndex = sy[ i ] * sourceWidth;
tm = my[i];
bm = 255 - tm;
for (int j = 0; j < targetWidth; j++){
// too much ifs can be too slow, measure.
// always true for first pixel in a row
if( shiftReload = shiftReloadPixels[ j ] ){
srcIndex = srcRowIndex + sx[j];
if( shiftReload & 2 ){
a = b;
c = d;
}else{
a = input[ srcIndex ];
c = input[ srcIndex + sourceWidth ];
}
b = input[ srcIndex + 1 ];
d = input[ srcIndex + 1 + sourceWidth ];
}
lm = mx[j];
rm = 255 - lm;
// WTF?
// Input AA RR GG BB
// Output RR GG BB AA
if( j ){
leftOutput = rightOutput ^ 0xFFFFFF00;
}else{
leftOutput =
// blue element
((( ( (a&0xFF)*tm
+ (c&0xFF)*bm )*lm
) & 0xFF0000 ) >> 8)
// green element
| ((( ( ((a>>8)&0xFF)*tm
+ ((c>>8)&0xFF)*bm )*lm
) & 0xFF0000 )) // no need to shift
// red element
| ((( ( ((a>>16)&0xFF)*tm
+ ((c>>16)&0xFF)*bm )*lm
) & 0xFF0000 ) << 8 )
;
}
rightOutput =
// blue element
((( ( (b&0xFF)*tm
+ (d&0xFF)*bm )*lm
) & 0xFF0000 ) >> 8)
// green element
| ((( ( ((b>>8)&0xFF)*tm
+ ((d>>8)&0xFF)*bm )*lm
) & 0xFF0000 )) // no need to shift
// red element
| ((( ( ((b>>16)&0xFF)*tm
+ ((d>>16)&0xFF)*bm )*lm
) & 0xFF0000 ) << 8 )
;
output[offset++] =
// alpha
0x000000ff
| leftOutput
| rightOutput
;
}
}
}
答案 4 :(得分:0)
我的随机猜测(使用分析器而不是让人猜!):
编译器必须生成在输入和输出重叠时生效,这意味着它必须生成大量冗余存储和负载。将restrict
添加到输入和输出参数以删除该安全功能。
您也可以尝试使用a=b;
和c=d;
,而不是再次加载它们。