From d00d1bd5423d0bbfea7c85a0411d2b22d97fbe0f Mon Sep 17 00:00:00 2001
From: Benjamin Poulain <benjamin.poulain@nokia.com>
Date: Wed, 21 Jul 2010 17:14:28 +0200
Subject: Use aligned load for the blending of RGB32 over RGB32

Aligned load are faster than unaligned load. This patch add a prologue
to the blending function in order to align the destination on 16 bytes
before using SSE2.

Reviewed-by: Kent Hansen
---
 src/gui/painting/qdrawhelper_sse2.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/src/gui/painting/qdrawhelper_sse2.cpp b/src/gui/painting/qdrawhelper_sse2.cpp
index 346e177..279f685 100644
--- a/src/gui/painting/qdrawhelper_sse2.cpp
+++ b/src/gui/painting/qdrawhelper_sse2.cpp
@@ -110,13 +110,23 @@ void qt_blend_rgb32_on_rgb32_sse2(uchar *destPixels, int dbpl,
             const __m128i oneMinusConstAlpha =  _mm_set1_epi16(one_minus_const_alpha);
             for (int y = 0; y < h; ++y) {
                 int x = 0;
+
+                // First, align dest to 16 bytes:
+                const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;
+                const int prologLength = qMin(w, offsetToAlignOn16Bytes);
+                for (; x < prologLength; ++x) {
+                    quint32 s = src[x];
+                    s = BYTE_MUL(s, const_alpha);
+                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
+                }
+
                 for (; x < w-3; x += 4) {
                     __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
                     if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) {
-                        const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
+                        const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                         __m128i result;
                         INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
-                        _mm_storeu_si128((__m128i *)&dst[x], result);
+                        _mm_store_si128((__m128i *)&dst[x], result);
                     }
                 }
                 for (; x<w; ++x) {
-- 
cgit v0.12