From b247124839db0a25e6200b258e71eb5d7acc033c Mon Sep 17 00:00:00 2001 From: Olivier Goffart Date: Mon, 30 Aug 2010 16:25:33 +0200 Subject: qdrawhelper: backport the optimisations in fetchTransformBilinear from master to 4.7 This backport the following commits: e55b6a3 qdrawhelper: remove code duplication 0d7e683 qdrawhelper: optimize fetchTransformedBilinear 29ef46e Fix compilation with RVCT 6601458 qdrawhelper: Use SSE2 in fetchTransformedBilinear (when scalling up) 398ef0ca Fix nasty copy-paste bug in fetchTransformedBilinear() d585ece qdrawhelper: fix assert in fetchTransformedBilinear Reviewed-by: Benjamin Poulain --- src/gui/painting/qdrawhelper.cpp | 395 +++++++++++++++++++++++++-------------- 1 file changed, 257 insertions(+), 138 deletions(-) diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index be4275c..89754fa 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -656,6 +656,46 @@ const uint * QT_FASTCALL fetchTransformed(uint *buffer, const Operator *, const return buffer; } +/** \internal + interpolate 4 argb pixels with the distx and disty factor. + distx and disty bust be between 0 and 16 + */ +static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, int distx, int disty, int idistx, int idisty) +{ + uint tlrb = ((tl & 0x00ff00ff) * idistx * idisty); + uint tlag = (((tl & 0xff00ff00) >> 8) * idistx * idisty); + uint trrb = ((tr & 0x00ff00ff) * distx * idisty); + uint trag = (((tr & 0xff00ff00) >> 8) * distx * idisty); + uint blrb = ((bl & 0x00ff00ff) * idistx * disty); + uint blag = (((bl & 0xff00ff00) >> 8) * idistx * disty); + uint brrb = ((br & 0x00ff00ff) * distx * disty); + uint brag = (((br & 0xff00ff00) >> 8) * distx * disty); + return (((tlrb + trrb + blrb + brrb) >> 8) & 0x00ff00ff) | ((tlag + trag + blag + brag) & 0xff00ff00); +} + + +template +Q_STATIC_TEMPLATE_FUNCTION inline void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2) +{ + if (blendType == BlendTransformedBilinearTiled) { + v1 %= max; + if (v1 < 0) v1 += max; + v2 = v1 + 1; + v2 %= max; + } else { + if (v1 < l1) { + v2 = v1 = l1; + } else if (v1 >= l2 - 1) { + v2 = v1 = l2 - 1; + } else { + v2 = v1 + 1; + } + } + + Q_ASSERT(v1 >= 0 && v1 < max); + Q_ASSERT(v2 >= 0 && v2 < max); +} + template /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */ Q_STATIC_TEMPLATE_FUNCTION const uint * QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator *, const QSpanData *data, @@ -696,64 +736,230 @@ const uint * QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator * fx -= half_point; fy -= half_point; - while (b < end) { - int x1 = (fx >> 16); - int x2; + + if (fdy == 0) { //simple scale, no rotation int y1 = (fy >> 16); int y2; + fetchTransformedBilinear_pixelBounds(image_height, image_y1, image_y2, y1, y2); + const uchar *s1 = data->texture.scanLine(y1); + const uchar *s2 = data->texture.scanLine(y2); - if (blendType == BlendTransformedBilinearTiled) { - x1 %= image_width; - if (x1 < 0) x1 += image_width; - x2 = x1 + 1; - x2 %= image_width; - - y1 %= image_height; - if (y1 < 0) y1 += image_height; - y2 = y1 + 1; - y2 %= image_height; - } else { - if (x1 < image_x1) { - x2 = x1 = image_x1; - } else if (x1 >= image_x2 - 1) { - x2 = x1 = image_x2 - 1; + if (fdx <= fixed_scale && fdx > 0) { // scale up on X + int disty = (fy & 0x0000ffff) >> 8; + int idisty = 256 - disty; + int x = fx >> 16; + + // The idea is first to do the interpolation between the row s1 and the row s2 + // into an intermediate buffer, then we interpolate between two pixel of this buffer. + + // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB + // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG + quint32 intermediate_buffer[2][buffer_size + 2]; + // count is the size used in the intermediate_buffer. + int count = qCeil(length * data->m11) + 2; //+1 for the last pixel to interpolate with, and +1 for rounding errors. + Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case + int f = 0; + int lim = count; + if (blendType == BlendTransformedBilinearTiled) { + x %= image_width; + if (x < 0) x += image_width; } else { - x2 = x1 + 1; + lim = qMin(count, image_x2-x); + if (x < image_x1) { + Q_ASSERT(x < image_x2); + uint t = fetch(s1, image_x1, data->texture.colorTable); + uint b = fetch(s2, image_x1, data->texture.colorTable); + quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff; + quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff; + do { + intermediate_buffer[0][f] = rb; + intermediate_buffer[1][f] = ag; + f++; + x++; + } while (x < image_x1 && f < lim); + } } - if (y1 < image_y1) { - y2 = y1 = image_y1; - } else if (y1 >= image_y2 - 1) { - y2 = y1 = image_y2 - 1; - } else { - y2 = y1 + 1; + +#if defined(QT_ALWAYS_HAVE_SSE2) + if (blendType != BlendTransformedBilinearTiled && + (format == QImage::Format_ARGB32_Premultiplied || format == QImage::Format_RGB32)) { + + const __m128i disty_ = _mm_set1_epi16(disty); + const __m128i idisty_ = _mm_set1_epi16(idisty); + const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); + + lim -= 3; + for (; f < lim; x += 4, f += 4) { + // Load 4 pixels from s1, and split the alpha-green and red-blue component + __m128i top = _mm_loadu_si128((__m128i*)((const uint *)(s1)+x)); + __m128i topAG = _mm_srli_epi16(top, 8); + __m128i topRB = _mm_and_si128(top, colorMask); + // Multiplies each colour component by idisty + topAG = _mm_mullo_epi16 (topAG, idisty_); + topRB = _mm_mullo_epi16 (topRB, idisty_); + + // Same for the s2 vector + __m128i bottom = _mm_loadu_si128((__m128i*)((const uint *)(s2)+x)); + __m128i bottomAG = _mm_srli_epi16(bottom, 8); + __m128i bottomRB = _mm_and_si128(bottom, colorMask); + bottomAG = _mm_mullo_epi16 (bottomAG, disty_); + bottomRB = _mm_mullo_epi16 (bottomRB, disty_); + + // Add the values, and shift to only keep 8 significant bits per colors + __m128i rAG =_mm_add_epi16(topAG, bottomAG); + rAG = _mm_srli_epi16(rAG, 8); + _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG); + __m128i rRB =_mm_add_epi16(topRB, bottomRB); + rRB = _mm_srli_epi16(rRB, 8); + _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB); + } + } +#endif + for (; f < count; f++) { // Same as above but without sse2 + if (blendType == BlendTransformedBilinearTiled) { + if (x >= image_width) x -= image_width; + } else { + x = qMin(x, image_x2 - 1); + } + + uint t = fetch(s1, x, data->texture.colorTable); + uint b = fetch(s2, x, data->texture.colorTable); + + intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff; + intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff; + x++; + } + // Now interpolate the values from the intermediate_buffer to get the final result. + fx &= fixed_scale - 1; + Q_ASSERT((fx >> 16) == 0); + while (b < end) { + register int x1 = (fx >> 16); + register int x2 = x1 + 1; + Q_ASSERT(x1 >= 0); + Q_ASSERT(x2 < count); + + register int distx = (fx & 0x0000ffff) >> 8; + register int idistx = 256 - distx; + int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff; + int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00; + *b = rb | ag; + b++; + fx += fdx; + } + } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || fabs(data->m22) < (1./8.)) { // scale up more than 8x + int y1 = (fy >> 16); + int y2; + fetchTransformedBilinear_pixelBounds(image_height, image_y1, image_y2, y1, y2); + const uchar *s1 = data->texture.scanLine(y1); + const uchar *s2 = data->texture.scanLine(y2); + int disty = (fy & 0x0000ffff) >> 8; + int idisty = 256 - disty; + while (b < end) { + int x1 = (fx >> 16); + int x2; + fetchTransformedBilinear_pixelBounds(image_width, image_x1, image_x2, x1, x2); + uint tl = fetch(s1, x1, data->texture.colorTable); + uint tr = fetch(s1, x2, data->texture.colorTable); + uint bl = fetch(s2, x1, data->texture.colorTable); + uint br = fetch(s2, x2, data->texture.colorTable); + + int distx = (fx & 0x0000ffff) >> 8; + int idistx = 256 - distx; + + uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); + uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); + *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); + + fx += fdx; + ++b; + } + } else { //scale down + int y1 = (fy >> 16); + int y2; + fetchTransformedBilinear_pixelBounds(image_height, image_y1, image_y2, y1, y2); + const uchar *s1 = data->texture.scanLine(y1); + const uchar *s2 = data->texture.scanLine(y2); + int disty = (fy & 0x0000ffff) >> 12; + int idisty = 16 - disty; + while (b < end) { + int x1 = (fx >> 16); + int x2; + fetchTransformedBilinear_pixelBounds(image_width, image_x1, image_x2, x1, x2); + uint tl = fetch(s1, x1, data->texture.colorTable); + uint tr = fetch(s1, x2, data->texture.colorTable); + uint bl = fetch(s2, x1, data->texture.colorTable); + uint br = fetch(s2, x2, data->texture.colorTable); + int distx = (fx & 0x0000ffff) >> 12; + int idistx = 16 - distx; + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty, idistx, idisty); + fx += fdx; + ++b; } } + } else { //rotation + if (fabs(data->m11) > 8 || fabs(data->m22) > 8) { + //if we are zooming more than 8 times, we use 8bit precision for the position. + while (b < end) { + int x1 = (fx >> 16); + int x2; + int y1 = (fy >> 16); + int y2; - Q_ASSERT(x1 >= 0 && x1 < image_width); - Q_ASSERT(x2 >= 0 && x2 < image_width); - Q_ASSERT(y1 >= 0 && y1 < image_height); - Q_ASSERT(y2 >= 0 && y2 < image_height); + fetchTransformedBilinear_pixelBounds(image_width, image_x1, image_x2, x1, x2); + fetchTransformedBilinear_pixelBounds(image_height, image_y1, image_y2, y1, y2); - const uchar *s1 = data->texture.scanLine(y1); - const uchar *s2 = data->texture.scanLine(y2); + const uchar *s1 = data->texture.scanLine(y1); + const uchar *s2 = data->texture.scanLine(y2); - uint tl = fetch(s1, x1, data->texture.colorTable); - uint tr = fetch(s1, x2, data->texture.colorTable); - uint bl = fetch(s2, x1, data->texture.colorTable); - uint br = fetch(s2, x2, data->texture.colorTable); + uint tl = fetch(s1, x1, data->texture.colorTable); + uint tr = fetch(s1, x2, data->texture.colorTable); + uint bl = fetch(s2, x1, data->texture.colorTable); + uint br = fetch(s2, x2, data->texture.colorTable); - int distx = (fx & 0x0000ffff) >> 8; - int disty = (fy & 0x0000ffff) >> 8; - int idistx = 256 - distx; - int idisty = 256 - disty; + int distx = (fx & 0x0000ffff) >> 8; + int disty = (fy & 0x0000ffff) >> 8; + int idistx = 256 - distx; + int idisty = 256 - disty; - uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); - uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); - *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); + uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); + uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); + *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); - fx += fdx; - fy += fdy; - ++b; + fx += fdx; + fy += fdy; + ++b; + } + } else { + //we are zooming less than 8x, use 4bit precision + while (b < end) { + int x1 = (fx >> 16); + int x2; + int y1 = (fy >> 16); + int y2; + + fetchTransformedBilinear_pixelBounds(image_width, image_x1, image_x2, x1, x2); + fetchTransformedBilinear_pixelBounds(image_height, image_y1, image_y2, y1, y2); + + const uchar *s1 = data->texture.scanLine(y1); + const uchar *s2 = data->texture.scanLine(y2); + + uint tl = fetch(s1, x1, data->texture.colorTable); + uint tr = fetch(s1, x2, data->texture.colorTable); + uint bl = fetch(s2, x1, data->texture.colorTable); + uint br = fetch(s2, x2, data->texture.colorTable); + + int distx = (fx & 0x0000ffff) >> 12; + int disty = (fy & 0x0000ffff) >> 12; + int idistx = 16 - distx; + int idisty = 16 - disty; + + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty, idistx, idisty); + + fx += fdx; + fy += fdy; + ++b; + } + } } } else { const qreal fdx = data->m11; @@ -779,37 +985,8 @@ const uint * QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator * int idistx = 256 - distx; int idisty = 256 - disty; - if (blendType == BlendTransformedBilinearTiled) { - x1 %= image_width; - if (x1 < 0) x1 += image_width; - x2 = x1 + 1; - x2 %= image_width; - - y1 %= image_height; - if (y1 < 0) y1 += image_height; - y2 = y1 + 1; - y2 %= image_height; - } else { - if (x1 < 0) { - x2 = x1 = 0; - } else if (x1 >= image_width - 1) { - x2 = x1 = image_width - 1; - } else { - x2 = x1 + 1; - } - if (y1 < 0) { - y2 = y1 = 0; - } else if (y1 >= image_height - 1) { - y2 = y1 = image_height - 1; - } else { - y2 = y1 + 1; - } - } - - Q_ASSERT(x1 >= 0 && x1 < image_width); - Q_ASSERT(x2 >= 0 && x2 < image_width); - Q_ASSERT(y1 >= 0 && y1 < image_height); - Q_ASSERT(y2 >= 0 && y2 < image_height); + fetchTransformedBilinear_pixelBounds(image_width, image_x1, image_x2, x1, x2); + fetchTransformedBilinear_pixelBounds(image_height, image_y1, image_y2, y1, y2); const uchar *s1 = data->texture.scanLine(y1); const uchar *s2 = data->texture.scanLine(y2); @@ -5212,37 +5389,8 @@ Q_STATIC_TEMPLATE_FUNCTION void blend_transformed_bilinear_argb(int count, const int y1 = (y >> 16); int y2; - if (blendType == BlendTransformedBilinearTiled) { - x1 %= image_width; - if (x1 < 0) x1 += image_width; - x2 = x1 + 1; - x2 %= image_width; - - y1 %= image_height; - if (y1 < 0) y1 += image_height; - y2 = y1 + 1; - y2 %= image_height; - - Q_ASSERT(x1 >= 0 && x1 < image_width); - Q_ASSERT(x2 >= 0 && x2 < image_width); - Q_ASSERT(y1 >= 0 && y1 < image_height); - Q_ASSERT(y2 >= 0 && y2 < image_height); - } else { - if (x1 < image_x1) { - x2 = x1 = image_x1; - } else if (x1 >= image_x2 - 1) { - x2 = x1 = image_x2 - 1; - } else { - x2 = x1 + 1; - } - if (y1 < image_y1) { - y2 = y1 = image_y1; - } else if (y1 >= image_y2 - 1) { - y2 = y1 = image_y2 - 1; - } else { - y2 = y1 + 1; - } - } + fetchTransformedBilinear_pixelBounds(image_width, image_x1, image_x2, x1, x2); + fetchTransformedBilinear_pixelBounds(image_height, image_y1, image_y2, y1, y2); int y1_offset = y1 * scanline_offset; int y2_offset = y2 * scanline_offset; @@ -5322,37 +5470,8 @@ Q_STATIC_TEMPLATE_FUNCTION void blend_transformed_bilinear_argb(int count, const int idistx = 256 - distx; int idisty = 256 - disty; - if (blendType == BlendTransformedBilinearTiled) { - x1 %= image_width; - if (x1 < 0) x1 += image_width; - x2 = x1 + 1; - x2 %= image_width; - - y1 %= image_height; - if (y1 < 0) y1 += image_height; - y2 = y1 + 1; - y2 %= image_height; - - Q_ASSERT(x1 >= 0 && x1 < image_width); - Q_ASSERT(x2 >= 0 && x2 < image_width); - Q_ASSERT(y1 >= 0 && y1 < image_height); - Q_ASSERT(y2 >= 0 && y2 < image_height); - } else { - if (x1 < image_x1) { - x2 = x1 = image_x1; - } else if (x1 >= image_x2 - 1) { - x2 = x1 = image_x2 - 1; - } else { - x2 = x1 + 1; - } - if (y1 < image_y1) { - y2 = y1 = image_y1; - } else if (y1 >= image_y2 - 1) { - y2 = y1 = image_y2 - 1; - } else { - y2 = y1 + 1; - } - } + fetchTransformedBilinear_pixelBounds(image_width, image_x1, image_x2, x1, x2); + fetchTransformedBilinear_pixelBounds(image_height, image_y1, image_y2, y1, y2); int y1_offset = y1 * scanline_offset; int y2_offset = y2 * scanline_offset; -- cgit v0.12