我正在编写一个自定义阈值函数。我有两种实现,但它们的性能对比结果出乎意料。第一种实现如下:
// Shift-based implementation: counts, without branching, how many of eight
// samples are >= a threshold derived from the sample at ptr.
// ptr is read to form the threshold, ptr0..ptr7 are the eight samples that
// get compared, and ptrR receives the result.
// NOTE(review): presumably ptr0..ptr7 address the eight neighbours of *ptr;
// the pointer setup is elided in this snippet, so confirm.
const uchar *ptr;
const uchar *ptr0;
const uchar *ptr1;
const uchar *ptr2;
const uchar *ptr3;
const uchar *ptr4;
const uchar *ptr5;
const uchar *ptr6;
const uchar *ptr7;
uchar *ptrR;
// NOTE(review): both loops start at inset but run up to the full
// height/width; confirm the elided pointer setup keeps every read in bounds.
for (int y = inset; y < height; y++) {
// setup ptrs (per-row pointer initialisation is elided in this snippet)
for (int x = inset; x < width; x++) {
// Threshold = value at ptr plus a fixed margin of 8.
threshVal = *ptr + 8;
// Each (*ptrN - threshVal) >> 31 is intended to give -1 when
// *ptrN < threshVal and 0 otherwise. This assumes threshVal is a 32-bit
// signed int (its declaration is not shown; confirm) and that >> on a
// negative value is an arithmetic shift, which C leaves
// implementation-defined. Under those assumptions the eight terms plus 8
// equal the number of samples >= threshVal (0..8), and * 20 scales that
// count to 0..160.
*ptrR = (((*ptr0 - threshVal) >> 31) +
((*ptr1 - threshVal) >> 31) +
((*ptr2 - threshVal) >> 31) +
((*ptr3 - threshVal) >> 31) +
((*ptr4 - threshVal) >> 31) +
((*ptr5 - threshVal) >> 31) +
((*ptr6 - threshVal) >> 31) +
((*ptr7 - threshVal) >> 31) + 8) * 20;
// increase ptrs (pointer advance is elided in this snippet)
}
}
我重写了这个方案,但性能反而更差(思路如下):
// Lookup-table implementation: a 600-entry step table whose first 300
// entries are 0 and whose last 300 entries are 20.
static uchar threshValues[600];
// Points at the step boundary (index 300) of threshValues.
static uchar *threshValuesOffset;
// call this once: one-time table initialisation, before the loops below
memset(threshValues, 0, sizeof(uchar)*300);
memset(threshValues+300, 20, sizeof(uchar)*300);
threshValuesOffset = threshValues + 300;
for (int y = inset; y < height; y++) {
// setup ptrs (per-row pointer initialisation is elided in this snippet)
for (int x = inset; x < width; x++) {
// Threshold = value at ptr plus a fixed margin of 8.
threshVal = *ptr + 8;
// Shift the table base so that checkPtr[v] == threshValues[300 + v - threshVal],
// i.e. 20 when v >= threshVal and 0 otherwise.
// NOTE(review): assuming uchar is an 8-bit unsigned type and threshVal
// keeps the full int result (8..263), the index stays within 37..547,
// inside the 600-entry table. Neither declaration is shown here; confirm.
uchar *checkPtr = threshValuesOffset - threshVal;
// Sum of the eight lookups = 20 * (number of samples >= threshVal),
// at most 160.
*ptrR = checkPtr[*ptr0] +
checkPtr[*ptr1] +
checkPtr[*ptr2] +
checkPtr[*ptr3] +
checkPtr[*ptr4] +
checkPtr[*ptr5] +
checkPtr[*ptr6] +
checkPtr[*ptr7];
// increase ptrs (pointer advance is elided in this snippet)
}
}
两种实现的结果相同,但第二种实现的性能差了 25%。怎样才能让这个问题获得最佳性能?