我正在处理以下图像卷积代码：
/*
 * Matrix of floats held as an array of row pointers.
 * Elements are addressed as array[r][c] with 0 <= r < rows and
 * 0 <= c < cols.
 */
struct fmatrix {
    int rows;       /* number of rows */
    int cols;       /* number of columns */
    float **array;  /* array[r] points to the values of row r */
};
typedef struct fmatrix fmatrix;
/*
 * Raster image with 8-bit samples.
 * The code in this file addresses sample (row, col, channel) at
 * data[row * w * c + col * c + channel], i.e. rows are stored one after
 * another and the channels of a pixel are adjacent.
 */
struct image {
    unsigned char *data;  /* sample buffer */
    int w;                /* width in pixels */
    int h;                /* height in pixels */
    int c;                /* samples (channels) per pixel */
};
typedef struct image image;
/*
 * Convolution kernel: a matrix of weights plus an integer divisor that
 * the accumulated weighted sum is divided by.
 */
struct kernel {
    fmatrix *psf;  /* weight matrix (the name suggests "point spread function") */
    int divisor;   /* the weighted sum is divided by this value */
};
typedef struct kernel kernel;
/*
 * Convolve a single sample of src with the kernel and store it in dst.
 *
 * src   - image to read from.
 * dst   - image to write to; only dst->data[pixel] is modified.
 * psf   - kernel (weights and divisor).
 * pixel - flat index of the sample to compute, laid out as
 *         row * w * c + col * c + channel.
 *
 * Every kernel tap is multiplied with the sample of the same channel at
 * the matching offset around `pixel`; taps that fall outside the image
 * use the value 255 instead.  Each product is truncated to int before it
 * is added to the sum.  The sum is divided by psf->divisor (a divisor of
 * 0 is treated as 1 to avoid a division by zero) and clamped to [0, 255].
 */
void convolve_sq(image* src, image* dst, kernel* psf, int pixel){
    const fmatrix *weights = psf->psf;
    const int rows = weights->rows;
    const int cols = weights->cols;
    const int row_stride = src->w * src->c;   /* samples per image row */
    const int y0 = pixel / row_stride;        /* row of the target sample */
    const int x0 = (pixel / src->c) % src->w; /* column of the target sample */
    const int divisor = (psf->divisor != 0) ? psf->divisor : 1;
    int sum = 0;

    /*
     * Accumulate directly instead of staging the products in a temporary
     * rows*cols buffer: a flat buffer has to be indexed with stride `cols`,
     * and using `rows` as the stride overruns or under-fills it for
     * non-square kernels.  Summing in place also avoids a variable-length
     * array on the stack.
     */
    for (int n = 0; n < rows; ++n) {
        /* Everything that depends only on the kernel row is hoisted here. */
        const int dy = n - rows / 2;
        const int row_inside = (y0 + dy) >= 0 && (y0 + dy) < src->h;
        const float *wrow = weights->array[n];

        for (int m = 0; m < cols; ++m) {
            const int dx = m - cols / 2;
            float term;

            if (row_inside && (x0 + dx) >= 0 && (x0 + dx) < src->w) {
                const int cur = pixel + dy * row_stride + dx * src->c;
                term = src->data[cur] * wrow[m]; // misses on read
            } else {
                /* Outside the image: behave as if the sample were 255. */
                term = 255 * wrow[m];
            }
            /* Per-term truncation is kept so results match for square kernels. */
            sum += (int) term;
        }
    }

    sum /= divisor;
    if (sum < 0) {
        sum = 0;
    }
    if (sum > 255) {
        sum = 255;
    }
    dst->data[pixel] = (unsigned char) sum; // misses on write
}
/*
 * Convolve every sample of src with the kernel, writing the results to dst.
 *
 * Samples are visited row by row, pixel by pixel, channel by channel, so
 * the flat index handed to convolve_sq increases by one on every call.
 */
void convolve_image(image* src, image* dst, kernel* psf){
    for (int row = 0; row < src->h; ++row) {
        /* Flat index of the first sample of this row. */
        const int row_base = row * src->w * src->c;

        for (int col = 0; col < src->w; ++col) {
            /* Flat index of channel 0 of this pixel. */
            const int px_base = row_base + col * src->c;

            for (int ch = 0; ch < src->c; ++ch) {
                convolve_sq(src, dst, psf, px_base + ch);
            }
        }
    }
}
使用 cachegrind 分析后，我定位到两处存在大量缓存未命中的代码行，并已在上面的代码中用注释标出（“misses on read”和“misses on write”）。标记为“misses on read”（读取未命中）的行有 107,205 次 D1mr 和 97,201 次 DLmr；标记为“misses on write”（写入未命中）的行，D1mw 和 DLmw 各有 107,201 次。这两行分别直接读取和写入图像数据。
如何改进这段代码以减少缓存未命中，从而提高其运行效率？