From 660145873a05b159b2f96dab8f0eaa75499146ff Mon Sep 17 00:00:00 2001
From: Olivier Goffart <olivier.goffart@nokia.com>
Date: Mon, 7 Jun 2010 16:26:52 +0200
Subject: qdrawhelper: Use SSE2 in fetchTransformedBilinear (when scalling up)

It is possible to use SSE2 to speedup the smooth scaling of a pixmap

Benchmark results:
Drawing the home screen of Qt Creator:
 Qt 4.7: 45fps
 After last set of patches: 75fps
 After this patch: 95fps

Reviewed-by: Samuel
---
 src/gui/painting/qdrawhelper.cpp | 85 +++++++++++++++++++++++++++++++++-------
 1 file changed, 71 insertions(+), 14 deletions(-)

diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index a765102..51d841e 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -748,30 +748,87 @@ const uint * QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator *
                 int disty = (fy & 0x0000ffff) >> 8;
                 int idisty = 256 - disty;
                 int count = length * data->m11 + 2;
-                int x_ = fx >> 16;
+                int x = fx >> 16;
 
-                quint32 intermediate_buffer[buffer_size + 2][2];
+                // The idea is first to do the interpolation between the row s1 and the row s2
+                // into an intermediate buffer, then we interpolate between two pixel of this buffer.
+
+                // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB
+                // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG
+                quint32 intermediate_buffer[2][buffer_size + 2];
                 Q_ASSERT(length * data->m11 <= buffer_size);
+                int f = 0;
+                int lim = count;
                 if (blendType == BlendTransformedBilinearTiled) {
-                    x_ %= image_width;
-                    if (x_ < 0) x_ += image_width;
+                    x %= image_width;
+                    if (x < 0) x += image_width;
+                } else {
+                    lim = qMin(count, image_x2-x);
+                    if (x < image_x1) {
+                        Q_ASSERT(x < image_x2);
+                        uint t = fetch(s1, image_x1, data->texture.colorTable);
+                        uint b = fetch(s2, image_x1, data->texture.colorTable);
+                        quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+                        quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+                        do {
+                            intermediate_buffer[0][f] = rb;
+                            intermediate_buffer[1][f] = ag;
+                            f++;
+                            x++;
+                        } while (x < image_x1 && f < lim);
+                    }
                 }
-                for (int f = 0; f < count; f++) {
-                    int x;
+
+#if defined(QT_ALWAYS_HAVE_SSE2)
+                if (blendType != BlendTransformedBilinearTiled &&
+                        (format == QImage::Format_ARGB32_Premultiplied || format == QImage::Format_RGB32)) {
+
+                    const __m128i disty_ = _mm_set1_epi16(disty);
+                    const __m128i idisty_ = _mm_set1_epi16(idisty);
+                    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+
+                    lim -= 3;
+                    for (; f < lim; x += 4, f += 4) {
+                        // Load 4 pixels from s1, and split the alpha-green and red-blue component
+                        __m128i top = _mm_loadu_si128((__m128i*)((const uint *)(s1)+x));
+                        __m128i topAG = _mm_srli_epi16(top, 8);
+                        __m128i topRB = _mm_and_si128(top, colorMask);
+                        // Multiplies each colour component by idisty
+                        topAG = _mm_mullo_epi16 (topAG, idisty_);
+                        topRB = _mm_mullo_epi16 (topRB, idisty_);
+
+                        // Same for the s2 vector
+                        __m128i bottom = _mm_loadu_si128((__m128i*)((const uint *)(s2)+x));
+                        __m128i bottomAG = _mm_srli_epi16(bottom, 8);
+                        __m128i bottomRB = _mm_and_si128(bottom, colorMask);
+                        bottomAG = _mm_mullo_epi16 (bottomAG, disty_);
+                        bottomRB = _mm_mullo_epi16 (bottomRB, disty_);
+
+                        // Add the values, and shift to only keep 8 significant bits per colors
+                        __m128i rAG =_mm_add_epi16(topAG, bottomAG);
+                        rAG = _mm_srli_epi16(rAG, 8);
+                        _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG);
+                        __m128i rRB =_mm_add_epi16(topRB, bottomRB);
+                        rRB = _mm_srli_epi16(rRB, 8);
+                        _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
+                    }
+                }
+#endif
+                for (; f < count; f++) { // Same as above but without sse2
                     if (blendType == BlendTransformedBilinearTiled) {
-                        if (x_ >= image_width) x_ -= image_width;
-                        x = x_;
+                        if (x >= image_width) x -= image_width;
                     } else {
-                        x = qBound(image_x1, x_, image_x2 - 1);
+                        x = qMin(x, image_x2 - 1);
                     }
 
                     uint t = fetch(s1, x, data->texture.colorTable);
                     uint b = fetch(s2, x, data->texture.colorTable);
 
-                    intermediate_buffer[f][0] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                    intermediate_buffer[f][1] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                    x_++;
+                    intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+                    intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+                    x++;
                 }
+                // Now interpolate the values from the intermediate_buffer to get the final result.
                 fx &= fixed_scale - 1;
                 Q_ASSERT((fx >> 16) == 0);
                 while (b < end) {
@@ -782,8 +839,8 @@ const uint * QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator *
 
                     register int distx = (fx & 0x0000ffff) >> 8;
                     register int idistx = 256 - distx;
-                    int rb = ((intermediate_buffer[x1][0] * idistx + intermediate_buffer[x2][0] * distx) >> 8) & 0xff00ff;
-                    int ag = (intermediate_buffer[x1][1] * idistx + intermediate_buffer[x2][1] * distx) & 0xff00ff00;
+                    int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff;
+                    int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00;
                     *b = rb | ag;
                     b++;
                     fx += fdx;
-- 
cgit v0.12