我是优化方面的新手,接到的任务是尽可能地优化一个处理图像的函数。该函数读取一张图像,对其进行模糊处理并保存模糊后的图像,然后再对图像进行锐化,并保存锐化后的图像。
这是我的代码:
// One RGB pixel: three one-byte channels stored in red, green, blue order.
typedef struct {
    unsigned char red, green, blue;
} pixel;
// I delete the other struct because we can do the same operations with use of only addresses
// Offset of element (row, col) in a flat array holding `width` elements per row.
// Every argument is parenthesised, so arbitrary expressions may be passed.
#define calculateIndex(row, col, width) ((row) * (width) + (col))
// I combine all the functions in one because it is time consuming
/**
 * Blur the image with a 3x3 averaging kernel and save it, then sharpen the
 * blurred image with the kernel
 *   [-1, -1, -1]
 *   [-1,  9, -1]
 *   [-1, -1, -1]
 * and save that result as well. Both passes overwrite image->data in place.
 *
 * Only pixels that have all eight neighbours are recomputed; the outermost
 * rows and columns keep whatever value they had before each pass.
 *
 * image            - image whose data buffer is filtered
 * srcImgpName      - forwarded unchanged to writeBMP
 * blurRsltImgName  - handed to writeBMP after the blur pass
 * sharpRsltImgName - handed to writeBMP after the sharpen pass
 *
 * NOTE(review): `n` is not declared in this block; the code treats
 * image->data as n*n tightly packed 3-byte pixels - confirm with the caller.
 * If the scratch buffer cannot be allocated the function returns early and
 * neither output is written.
 */
void myfunction(Image *image, char* srcImgpName, char* blurRsltImgName, char* sharpRsltImgName) {
    int i, j, ii, jj, sum_red, sum_green, sum_blue;
    int index;
    pixel current_pixel, p;
    // dst aliases the image buffer; every result is stored through it
    pixel* dst = (pixel*)image->data;
    int squareN = n*n;
    // three bytes per pixel: 2*squareN + squareN
    int sizeToAllocate = ((squareN)<<1)+(squareN);
    // src is a snapshot of the image, so a kernel never reads a pixel that
    // the same pass has already overwritten
    pixel* src = malloc(sizeToAllocate);
    if (src == NULL) {
        return;
    }
    memcpy(src, dst, sizeToAllocate);

    ///////////////////////////// first step : smooth /////////////////////////////
    // The blur kernel is all ones, so the nine window pixels are simply summed.
    // index follows the flat offset of pixel (i, j), starting at (1, 1).
    index = calculateIndex(1, 1, n);
    for (i = 1; i < n - 1; ++i) {
        for (j = 1; j < n - 1; ++j) {
            sum_red = 0;
            sum_green = 0;
            sum_blue = 0;
            for (ii = i - 1; ii <= i + 1; ++ii) {
                for (jj = j - 1; jj <= j + 1; ++jj) {
                    p = src[calculateIndex(ii, jj, n)];
                    sum_red += p.red;
                    sum_green += p.green;
                    sum_blue += p.blue;
                }
            }
            // (x * 0xE38F) >> 19 equals x / 9 for every sum in 0..9*255
            current_pixel.red = (unsigned char)((sum_red * 0xE38F) >> 19);
            current_pixel.green = (unsigned char)((sum_green * 0xE38F) >> 19);
            current_pixel.blue = (unsigned char)((sum_blue * 0xE38F) >> 19);
            dst[index++] = current_pixel;
        }
        // step over the last pixel of this row and the first pixel of the
        // next one, so index stays aligned with (i, j)
        index += 2;
    }
    // presumably writes the current contents of image to blurRsltImgName
    writeBMP(image, srcImgpName, blurRsltImgName);

    // refresh the snapshot: the sharpen pass works on the blurred image
    memcpy(src, dst, sizeToAllocate);

    ///////////////////////////// second step : sharp /////////////////////////////
    for (i = 1; i < n - 1; ++i) {
        for (j = 1; j < n - 1; ++j) {
            // Start from 10x the centre pixel; subtracting all nine window
            // pixels below leaves 9*centre minus the eight neighbours.
            p = src[calculateIndex(i, j, n)];
            sum_red = 10 * p.red;
            sum_green = 10 * p.green;
            sum_blue = 10 * p.blue;
            for (ii = i - 1; ii <= i + 1; ++ii) {
                for (jj = j - 1; jj <= j + 1; ++jj) {
                    p = src[calculateIndex(ii, jj, n)];
                    sum_red -= p.red;
                    sum_green -= p.green;
                    sum_blue -= p.blue;
                }
            }
            // clamp every channel to [0, 255] before narrowing to a byte
            if (sum_red < 0) {
                sum_red = 0;
            } else if (sum_red > 255) {
                sum_red = 255;
            }
            current_pixel.red = (unsigned char)sum_red;
            if (sum_green < 0) {
                sum_green = 0;
            } else if (sum_green > 255) {
                sum_green = 255;
            }
            current_pixel.green = (unsigned char)sum_green;
            if (sum_blue < 0) {
                sum_blue = 0;
            } else if (sum_blue > 255) {
                sum_blue = 255;
            }
            current_pixel.blue = (unsigned char)sum_blue;
            dst[calculateIndex(i, j, n)] = current_pixel;
        }
    }
    // the snapshot is no longer needed
    free(src);
    // presumably writes the current contents of image to sharpRsltImgName
    writeBMP(image, srcImgpName, sharpRsltImgName);
}
我想问一下这些if语句:有没有更好的写法可以取代它们?更一般地说,有没有人能指出这里在优化上的错误,或者给出其他建议?
非常感谢!
更新代码:
// One RGB pixel: three one-byte channels stored in red, green, blue order.
typedef struct {
    unsigned char red, green, blue;
} pixel;
// I delete the other struct because we can do the same operations with use of only addresses
// Offset of element (row, col) in a flat array holding `width` elements per row.
// Every argument is parenthesised, so arbitrary expressions may be passed.
#define calculateIndex(row, col, width) ((row) * (width) + (col))
// I combine all the functions in one because it is time consuming
/**
 * Blur the image with a 3x3 averaging kernel and save it, then sharpen the
 * blurred image with the kernel
 *   [-1, -1, -1]
 *   [-1,  9, -1]
 *   [-1, -1, -1]
 * and save that result as well. Both passes overwrite image->data in place.
 *
 * Only pixels that have all eight neighbours are recomputed; the outermost
 * rows and columns keep whatever value they had before each pass.
 *
 * image            - image whose data buffer is filtered
 * srcImgpName      - forwarded unchanged to writeBMP
 * blurRsltImgName  - handed to writeBMP after the blur pass
 * sharpRsltImgName - handed to writeBMP after the sharpen pass
 *
 * NOTE(review): `n` is not declared in this block; the code treats
 * image->data as n*n tightly packed 3-byte pixels - confirm with the caller.
 * If the scratch buffer cannot be allocated the function returns early and
 * neither output is written.
 */
void myfunction(Image *image, char* srcImgpName, char* blurRsltImgName, char* sharpRsltImgName) {
    int i, j, ii, jj, sum_red, sum_green, sum_blue;
    int index;
    pixel current_pixel, p;
    // dst aliases the image buffer; every result is stored through it
    pixel* dst = (pixel*)image->data;
    int squareN = n*n;
    // three bytes per pixel: 2*squareN + squareN
    int sizeToAllocate = ((squareN)<<1)+(squareN);
    // src is a snapshot of the image, so a kernel never reads a pixel that
    // the same pass has already overwritten
    pixel* src = malloc(sizeToAllocate);
    if (src == NULL) {
        return;
    }
    memcpy(src, dst, sizeToAllocate);

    ///////////////////////////// first step : smooth /////////////////////////////
    // The blur kernel is all ones, so the nine window pixels are simply summed.
    // index follows the flat offset of pixel (i, j), starting at (1, 1).
    index = calculateIndex(1, 1, n);
    for (i = 1; i < n - 1; ++i) {
        for (j = 1; j < n - 1; ++j) {
            sum_red = 0;
            sum_green = 0;
            sum_blue = 0;
            for (ii = i - 1; ii <= i + 1; ++ii) {
                for (jj = j - 1; jj <= j + 1; ++jj) {
                    p = src[calculateIndex(ii, jj, n)];
                    sum_red += p.red;
                    sum_green += p.green;
                    sum_blue += p.blue;
                }
            }
            // (x * 0xE38F) >> 19 equals x / 9 for every sum in 0..9*255
            current_pixel.red = (unsigned char)((sum_red * 0xE38F) >> 19);
            current_pixel.green = (unsigned char)((sum_green * 0xE38F) >> 19);
            current_pixel.blue = (unsigned char)((sum_blue * 0xE38F) >> 19);
            dst[index++] = current_pixel;
        }
        // step over the last pixel of this row and the first pixel of the next
        index += 2;
    }
    // presumably writes the current contents of image to blurRsltImgName
    writeBMP(image, srcImgpName, blurRsltImgName);

    // refresh the snapshot: the sharpen pass works on the blurred image
    memcpy(src, dst, sizeToAllocate);

    ///////////////////////////// second step : sharp /////////////////////////////
    // index again follows the flat offset of pixel (i, j): it advances once
    // per pixel and skips the two border pixels at every row change.
    index = calculateIndex(1, 1, n);
    for (i = 1; i < n - 1; ++i) {
        for (j = 1; j < n - 1; ++j) {
            // Start from 10x the centre pixel; subtracting all nine window
            // pixels below leaves 9*centre minus the eight neighbours.
            p = src[index];
            sum_red = 10 * p.red;
            sum_green = 10 * p.green;
            sum_blue = 10 * p.blue;
            for (ii = i - 1; ii <= i + 1; ++ii) {
                for (jj = j - 1; jj <= j + 1; ++jj) {
                    p = src[calculateIndex(ii, jj, n)];
                    sum_red -= p.red;
                    sum_green -= p.green;
                    sum_blue -= p.blue;
                }
            }
            // clamp every channel to [0, 255] before narrowing to a byte
            if (sum_red < 0) {
                sum_red = 0;
            } else if (sum_red > 255) {
                sum_red = 255;
            }
            current_pixel.red = (unsigned char)sum_red;
            if (sum_green < 0) {
                sum_green = 0;
            } else if (sum_green > 255) {
                sum_green = 255;
            }
            current_pixel.green = (unsigned char)sum_green;
            if (sum_blue < 0) {
                sum_blue = 0;
            } else if (sum_blue > 255) {
                sum_blue = 255;
            }
            current_pixel.blue = (unsigned char)sum_blue;
            dst[calculateIndex(i, j, n)] = current_pixel;
            // move on to the next pixel of this row
            ++index;
        }
        // step over the last pixel of this row and the first pixel of the next
        index += 2;
    }
    // the snapshot is no longer needed
    free(src);
    // presumably writes the current contents of image to sharpRsltImgName
    writeBMP(image, srcImgpName, sharpRsltImgName);
}
----------------------------------------------- -------------------------------更新代码:
// One RGB pixel: three one-byte channels stored in red, green, blue order.
typedef struct {
    unsigned char red, green, blue;
} pixel;
// I delete the other struct because we can do the same operations with use of only addresses
// Offset of element (row, col) in a flat array holding `width` elements per row.
// Every argument is parenthesised, so arbitrary expressions may be passed.
#define calculateIndex(row, col, width) ((row) * (width) + (col))
// I combine all the functions in one because it is time consuming
/**
 * Blur the image with a 3x3 averaging kernel and save it, then sharpen the
 * blurred image with the kernel
 *   [-1, -1, -1]
 *   [-1,  9, -1]
 *   [-1, -1, -1]
 * and save that result as well. Both passes overwrite image->data in place.
 *
 * Only pixels that have all eight neighbours are recomputed; the outermost
 * rows and columns keep whatever value they had before each pass.
 *
 * image            - image whose data buffer is filtered
 * srcImgpName      - forwarded unchanged to writeBMP
 * blurRsltImgName  - handed to writeBMP after the blur pass
 * sharpRsltImgName - handed to writeBMP after the sharpen pass
 *
 * NOTE(review): `n` is not declared in this block; the code treats
 * image->data as n*n tightly packed 3-byte pixels - confirm with the caller.
 * If the scratch buffer cannot be allocated the function returns early and
 * neither output is written.
 */
void myfunction(Image *image, char* srcImgpName, char* blurRsltImgName, char* sharpRsltImgName) {
    int i, j, ii, jj, sum_red, sum_green, sum_blue;
    int index;
    pixel current_pixel, p;
    // dst aliases the image buffer; every result is stored through it
    pixel* dst = (pixel*)image->data;
    int squareN = n*n;
    // three bytes per pixel: 2*squareN + squareN
    int sizeToAllocate = ((squareN)<<1)+(squareN);
    // src is a snapshot of the image, so a kernel never reads a pixel that
    // the same pass has already overwritten
    pixel* src = malloc(sizeToAllocate);
    if (src == NULL) {
        return;
    }
    memcpy(src, dst, sizeToAllocate);

    ///////////////////////////// first step : smooth /////////////////////////////
    // The blur kernel is all ones, so the nine window pixels are simply summed.
    // index follows the flat offset of pixel (i, j); n + 1 is pixel (1, 1).
    index = n + 1;
    for (i = 1; i < n - 1; ++i) {
        for (j = 1; j < n - 1; ++j) {
            sum_red = 0;
            sum_green = 0;
            sum_blue = 0;
            for (ii = i - 1; ii <= i + 1; ++ii) {
                for (jj = j - 1; jj <= j + 1; ++jj) {
                    p = src[calculateIndex(ii, jj, n)];
                    sum_red += p.red;
                    sum_green += p.green;
                    sum_blue += p.blue;
                }
            }
            // (x * 0xE38F) >> 19 equals x / 9 for every sum in 0..9*255
            current_pixel.red = (unsigned char)((sum_red * 0xE38F) >> 19);
            current_pixel.green = (unsigned char)((sum_green * 0xE38F) >> 19);
            current_pixel.blue = (unsigned char)((sum_blue * 0xE38F) >> 19);
            dst[index++] = current_pixel;
        }
        // step over the last pixel of this row and the first pixel of the next
        index += 2;
    }
    // presumably writes the current contents of image to blurRsltImgName
    writeBMP(image, srcImgpName, blurRsltImgName);

    // refresh the snapshot: the sharpen pass works on the blurred image
    memcpy(src, dst, sizeToAllocate);

    ///////////////////////////// second step : sharp /////////////////////////////
    // index again follows the flat offset of pixel (i, j): it advances once
    // per pixel and skips the two border pixels at every row change.
    index = calculateIndex(1, 1, n);
    for (i = 1; i < n - 1; ++i) {
        for (j = 1; j < n - 1; ++j) {
            // Start from 10x the centre pixel; subtracting all nine window
            // pixels below leaves 9*centre minus the eight neighbours.
            p = src[index];
            sum_red = 10 * p.red;
            sum_green = 10 * p.green;
            sum_blue = 10 * p.blue;
            for (ii = i - 1; ii <= i + 1; ++ii) {
                for (jj = j - 1; jj <= j + 1; ++jj) {
                    p = src[calculateIndex(ii, jj, n)];
                    sum_red -= p.red;
                    sum_green -= p.green;
                    sum_blue -= p.blue;
                }
            }
            // clamp every channel to [0, 255] before narrowing to a byte
            if (sum_red < 0) {
                sum_red = 0;
            } else if (sum_red > 255) {
                sum_red = 255;
            }
            current_pixel.red = (unsigned char)sum_red;
            if (sum_green < 0) {
                sum_green = 0;
            } else if (sum_green > 255) {
                sum_green = 255;
            }
            current_pixel.green = (unsigned char)sum_green;
            if (sum_blue < 0) {
                sum_blue = 0;
            } else if (sum_blue > 255) {
                sum_blue = 255;
            }
            current_pixel.blue = (unsigned char)sum_blue;
            // store at (i, j) and advance to the next pixel of this row
            dst[index++] = current_pixel;
        }
        // step over the last pixel of this row and the first pixel of the next
        index += 2;
    }
    // the snapshot is no longer needed
    free(src);
    // presumably writes the current contents of image to sharpRsltImgName
    writeBMP(image, srcImgpName, sharpRsltImgName);
}
答案 0 :(得分:1)
一些一般优化指南:
如果您在x86上运行,请编译为64位二进制文件。x86实际上是一个寄存器匮乏的CPU。在32位模式下,您几乎只有5或6个32位通用寄存器可用,而且只有在使用GCC的-fomit-frame-pointer
等优化选项进行编译时,才能用上“全部”6个。在64位模式下,您将拥有13或14个64位通用寄存器。
获得一个好的编译器并使用尽可能高的一般优化级别。
性能分析!性能分析!性能分析!一定要对您的代码做实际的性能分析,才能真正了解性能瓶颈在哪里。任何关于瓶颈位置的猜测都很可能是错误的。
找到瓶颈后,请检查编译器生成的实际指令,重点查看瓶颈区域,弄清楚发生了什么。也许瓶颈是由于寄存器压力过大,导致编译器必须执行大量寄存器溢出与重新载入(register spilling and filling)。如果您能把性能分析做到指令级别,会非常有帮助。
使用分析和检查生成的指令中的见解来改进代码并编译参数。例如,如果您看到大量的寄存器溢出和填充,则需要降低寄存器压力,可能需要手动合并循环或使用编译器选项禁用预取。
尝试使用不同的页面大小选项。如果单行像素是页面大小的重要部分,则到达其他行更有可能进入另一页并导致TLB miss。使用更大的内存页面可能会大大减少这种情况。
您的代码的一些具体想法:
仅使用一个外层循环。您需要尝试找到处理“额外”边缘像素的最快方法。最快的方法可能是不做任何特殊处理,像对待“普通”像素一样直接处理它们,之后再忽略它们的值。
手动展开两个内部循环 - 您只需要9个像素。
请勿使用calculateIndex()
- 使用当前像素的地址,只需从当前像素地址中减去或添加适当的值即可找到其他像素。例如,内部循环中左上角像素的地址类似于currentPixelAddress - n - 1
。
那些会将你的四深嵌套循环转换为一个循环,只需很少的索引计算。
答案 1 :(得分:0)
一些想法 - 未经测试。
你用if(ii==i && jj==j)
来测试锐化循环中的中心像素,这个测试对每个像素要执行9次。我认为删除这个if
、对窗口内的每个像素执行完全相同的操作会更快,然后在循环外通过加上10倍的中心像素值进行校正。
// Do the central pixel first: seed each sum with 10x the centre value.
// The loops below subtract all nine window pixels (centre included), which
// leaves 9*centre minus the eight neighbours without any per-pixel test.
p=src[calculateIndex(i,j,n)];
sum_red = 10*p.red;
sum_green = 10*p.green;
sum_blue = 10*p.blue;
for(ii =i-1; ii <= i + 1; ++ii) {
for(jj = j-1; jj <= j + 1; ++jj) {
p = src[calculateIndex(ii, jj, n)];
// subtract every pixel of the 3x3 window, the centre one as well
sum_red -= p.red;
sum_green -= p.green;
sum_blue -= p.blue;
}
}
你做dst[calculateIndex(i, j, n)] = current_pixel;
的地方,你可以在开始循环之前计算一次索引,然后只需在循环内的每次写入时递增指针 - 假设你的数组是连续的和未填充的。
index=calculateIndex(1,1,n)
for (i = 1 ; i < n - 1; ++i) {
for (j = 1 ; j < n - 1 ; ++j) {
...
dst[index++] = current_pixel;
}
index+=2; // skip over last pixel of this line and first pixel of next line
}
当3x3(共9个像素)的窗口在图像上滑动时,您可以“记住”上一个位置最左侧那一列3个像素的和。这样就不必为每个像素做9次加法,而是对离开窗口的最左侧一列做1次减法,再对从右侧进入窗口的新一列做3次加法,即4次计算而不是9次。