最近我在将双线性插值应用到一些基于CPU的图形代码中时,遇到了同样的问题。
首先,我使用以下配置运行了您的代码:
操作系统:虚拟机中的Xubuntu 20
编译器:gcc 9.3.0
OpenCV版本:4.2.0
CPU:i3-6100u(2.3 GHz)
源位图大小:512x512
目标位图大小:2048x2048
我发现您的代码需要92毫秒,而OpenCV只需要4.2毫秒。因此,与2012年您提出这个问题时相比,差异现在更大了。我猜测OpenCV自那时以来进行了更多的优化。
(此时,我切换到在Windows中使用Visual Studio 2013,构建为x64目标)。
将代码转换为使用定点算术可将时间缩短至30毫秒。定点算术很有帮助,因为它将数据保留为整数。输入和输出数据都是整数。将它们转换为浮点数再转回去是代价高昂的。如果我坚持使用GCC 9.3,我预计加速会更快,因为我通常发现它生成的代码比VS 2013更快。无论如何,以下是代码:
typedef union ;
} DfColour;
typedef struct _DfBitmap DfBitmap;
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale)
}
}
切换到更好的算法将时间缩短到了19.5毫秒。正如Andrey Kamaev的答案所说,更好的算法通过将垂直和水平调整大小分为两个单独的过程来工作。目标位图用作第一次传递的输出的临时存储空间。第二遍通过中的X遍历是向后的,以避免覆盖它即将需要的数据。这是代码:
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale)
}
for (int y = 0; y < dstH; y++)
}
}
使用简单的可移植SIMD方案将时间缩短至16.5毫秒。该SIMD方案不使用类似SSE / AVX的专有指令集扩展。而是使用一个技巧,允许在32位整数中存储和操作红色和蓝色通道。它不像AVX实现那样快,但它具有简单性的好处。以下是代码:
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
int dstH = scale * src->height;
int dstW = scale * src->width;
for (int y = 0; y < dstH; y++) {
int srcYAndWeight = (y * heightRatio) >> 8;
int srcY = srcYAndWeight >> 8;
DfColour *dstPixel = &dst->pixels[y * dst->width];
DfColour *srcRow = &src->pixels[srcY * src->width];
unsigned weightY2 = srcYAndWeight & 0xFF;
unsigned weightY = 256 - weightY2;
for (int x = 0; x < src->width; x++, dstPixel++) {
unsigned rb = 0, g = 0;
DfColour *srcPixel = &srcRow[x];
rb += (srcPixel->c & 0xff00ff) * weightY;
g += srcPixel->g * weightY;
srcPixel += src->width;
rb += (srcPixel->c & 0xff00ff) * weightY2;
g += srcPixel->g * weightY2;
dstPixel->c = rb >> 8;
dstPixel->g = g >> 8;
}
}
for (int y = 0; y < dstH; y++) {
DfColour *dstRow = &dst->pixels[y * dst->width];
for (int x = dstW - 1; x; x--) {
int srcXAndWeight = (x * widthRatio) >> 8;
int srcX = srcXAndWeight >> 8;
unsigned rb = 0, g = 0;
unsigned weightX2 = srcXAndWeight & 0xFF;
unsigned weightX = 256 - weightX2;
DfColour *srcPixel = &dstRow[srcX];
rb += (srcPixel->c & 0xff00ff) * weightX;
g += srcPixel->g * weightX;
srcPixel++;
rb += (srcPixel->c & 0xff00ff) * weightX2;
g += srcPixel->g * weightX2;
DfColour *dstPixel = &dstRow[x];
dstPixel->c = rb >> 8;
dstPixel->g = g >> 8;
}
}
}
可以将X轴通道保持分离,但是合并Y轴通道。这样可以提高缓存一致性并使代码更简单。重新组合两个通道可以将时间缩短至14.6毫秒。以下是代码:
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
int dstH = scale * src->height;
int dstW = scale * src->width;
for (int y = 0; y < dstH; y++) {
int srcYAndWeight = (y * heightRatio) >> 8;
int srcY = srcYAndWeight >> 8;
DfColour *dstRow = &dst->pixels[y * dst->width];
DfColour *srcRow = &src->pixels[srcY * src->width];
unsigned weightY2 = srcYAndWeight & 0xFF;
unsigned weightY = 256 - weightY2;
for (int x = 0; x < src->width; x++) {
unsigned rb = 0, g = 0;
DfColour *srcPixel = &srcRow[x];
rb += (srcPixel->c & 0xff00ff) * weightY;
g += srcPixel->g * weightY;
srcPixel += src->width;
rb += (srcPixel->c & 0xff00ff) * weightY2;
g += srcPixel->g * weightY2;
dstRow[x].c = rb >> 8;
dstRow[x].g = g >> 8;
}
for (int x = dstW - 1; x; x--) {
unsigned rb = 0, g = 0;
int srcXAndWeight = (x * widthRatio) >> 8;
int srcX = srcXAndWeight >> 8;
unsigned weightX2 = srcXAndWeight & 0xFF;
unsigned weightX = 256 - weightX2;
DfColour *srcPixel = &dstRow[srcX];
rb += (srcPixel->c & 0xff00ff) * weightX;
g += srcPixel->g * weightX;
srcPixel++;
rb += (srcPixel->c & 0xff00ff) * weightX2;
g += srcPixel->g * weightX2;
dstRow[x].c = rb >> 8;
dstRow[x].g = g >> 8;
}
}
}
第二个内部循环可以使用查找表进行简化,因为每个扫描线的srcX、weightX和weightX2的值都是相同的。 在第二个内部循环中使用查找表可以将运行时间缩短至12.9毫秒。 以下是代码:
struct SrcXandWeights {
uint8_t weightX, weightX2;
uint16_t srcX;
};
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
int dstH = scale * src->height;
int dstW = scale * src->width;
static SrcXandWeights *lut = NULL;
static int lutSize = 0;
if (lutSize < dstW) {
delete [] lut;
lut = new SrcXandWeights [dstW];
lutSize = dstW;
}
for (int x = 0; x < dstW; x++) {
int srcXAndWeight = (x * widthRatio) >> 8;
lut[x].srcX = srcXAndWeight >> 8;
lut[x].weightX2 = srcXAndWeight & 0xFF;
lut[x].weightX = 255 - lut[x].weightX2;
}
for (int y = 0; y < dstH; y++) {
int srcYAndWeight = (y * heightRatio) >> 8;
int srcY = (srcYAndWeight) >> 8;
DfColour *dstRow = &dst->pixels[y * dst->width];
DfColour *srcRow = &src->pixels[srcY * src->width];
unsigned weightY2 = srcYAndWeight & 0xFF;
unsigned weightY = 256 - weightY2;
for (int x = 0; x < src->width; x++) {
DfColour *srcPixel = &srcRow[x];
unsigned rb = (srcPixel->c & 0xff00ff) * weightY;
unsigned g = srcPixel->g * weightY;
srcPixel += src->width;
rb += (srcPixel->c & 0xff00ff) * weightY2;
g += srcPixel->g * weightY2;
dstRow[x].c = rb >> 8;
dstRow[x].g = g >> 8;
}
for (int x = dstW - 1; x; x--) {
SrcXandWeights *sw = lut + x;
DfColour *srcPixel = &dstRow[sw->srcX];
unsigned rb = (srcPixel->c & 0xff00ff) * sw->weightX;
unsigned g = srcPixel->g * sw->weightX;
srcPixel++;
rb += (srcPixel->c & 0xff00ff) * sw->weightX2;
g += srcPixel->g * sw->weightX2;
dstRow[x].c = rb >> 8;
dstRow[x].g = g >> 8;
}
}
}
目前代码仍然是单线程的。我的CPU有两个物理核心和总共4个线程。OpenCV在我的机器上使用了2个线程。我预计将代码转换为使用2个线程将减少时间到约7毫秒。
我不知道还需要什么其他技巧才能降低到4毫秒,尽管转换为真正的AVX SIMD实现可能是必要的。
cv::resize
正在利用处理器指令集扩展,例如SSE,允许它并行运行多个乘法,例如。要自己执行此操作,您需要智能编译器优化或手动编写x86汇编语言。 编辑: 根据消息来源,GCC使用-O3
标志启用矢量化。。(但是,-O3
可能会导致非常奇怪的错误,因此不建议在一般情况下使用。) - user308323