From 9427b4c8f3b557524cda3f72cf81f68940cb7246 Mon Sep 17 00:00:00 2001
From: Benjamin Poulain <benjamin.poulain@nokia.com>
Date: Fri, 16 Jul 2010 20:32:38 +0200
Subject: Used aligned load and store when possible for the blending of ARGB32

Unaligned loads and stores can be costly. This patch mitigates the problem
by aligning the destination before using SSE2. The destination is
aligned because it is used by both loads and stores, while the source is
only used by loads.

On Atom, the blending test is 7% faster for ARGB32.

Re-pushing that patch, thanks to awesome policies...

Reviewed-by: Andreas Kling
---
 src/gui/painting/qdrawingprimitive_sse2_p.h | 33 ++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)
diff --git a/src/gui/painting/qdrawingprimitive_sse2_p.h b/src/gui/painting/qdrawingprimitive_sse2_p.h
index 3c96946..b1f8306 100644
--- a/src/gui/painting/qdrawingprimitive_sse2_p.h
+++ b/src/gui/painting/qdrawingprimitive_sse2_p.h
@@ -141,12 +141,24 @@ QT_BEGIN_NAMESPACE
 // with shortcuts if fully opaque or fully transparent.
 #define BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask) { \
     int x = 0; \
+\
+    /* First, get dst aligned. */ \
+    const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;\
+    const int prologLength = qMin(length, offsetToAlignOn16Bytes);\
+    for (; x < prologLength; ++x) { \
+        uint s = src[x]; \
+        if (s >= 0xff000000) \
+            dst[x] = s; \
+        else if (s != 0) \
+            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); \
+    } \
+\
     for (; x < length-3; x += 4) { \
         const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); \
         const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
         if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
             /* all opaque */ \
-            _mm_storeu_si128((__m128i *)&dst[x], srcVector); \
+            _mm_store_si128((__m128i *)&dst[x], srcVector); \
         } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
             /* not fully transparent */ \
             /* extract the alpha channel on 2 x 16 bits */ \
@@ -157,13 +169,13 @@ QT_BEGIN_NAMESPACE
             alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \
             alphaChannel = _mm_sub_epi16(one, alphaChannel); \
  \
-            const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); \
+            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
             __m128i destMultipliedByOneMinusAlpha; \
             BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
  \
             /* result = s + d * (1-alpha) */\
             const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
-            _mm_storeu_si128((__m128i *)&dst[x], result); \
+            _mm_store_si128((__m128i *)&dst[x], result); \
         } \
     } \
     for (; x < length; ++x) { \
@@ -189,6 +201,17 @@ QT_BEGIN_NAMESPACE
 #define BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector) \
 { \
     int x = 0; \
+\
+    const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;\
+    const int prologLength = qMin(length, offsetToAlignOn16Bytes);\
+    for (; x < prologLength; ++x) { \
+        uint s = src[x]; \
+        if (s >= 0xff000000) \
+            dst[x] = s; \
+        else if (s != 0) \
+            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); \
+    } \
+\
     for (; x < length-3; x += 4) { \
         __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); \
         if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) { \
@@ -198,12 +221,12 @@ QT_BEGIN_NAMESPACE
             alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \
             alphaChannel = _mm_sub_epi16(one, alphaChannel); \
  \
-            const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); \
+            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
             __m128i destMultipliedByOneMinusAlpha; \
             BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
  \
             const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
-            _mm_storeu_si128((__m128i *)&dst[x], result); \
+            _mm_store_si128((__m128i *)&dst[x], result); \
         } \
     } \
     for (; x < length; ++x) { \
-- 
cgit v0.12


From 23ea4340a622cbfed81eb7afb2e09ec64b0ebef8 Mon Sep 17 00:00:00 2001
From: Andreas Kling <andreas.kling@nokia.com>
Date: Sun, 18 Jul 2010 07:30:35 +0200
Subject: Corrected BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2

The unaligned prologue was processed without using the const alpha.
Regressed with 9427b4c8f3b5.
---
 src/gui/painting/qdrawingprimitive_sse2_p.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gui/painting/qdrawingprimitive_sse2_p.h b/src/gui/painting/qdrawingprimitive_sse2_p.h
index b1f8306..18355c2 100644
--- a/src/gui/painting/qdrawingprimitive_sse2_p.h
+++ b/src/gui/painting/qdrawingprimitive_sse2_p.h
@@ -205,11 +205,11 @@ QT_BEGIN_NAMESPACE
     const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;\
     const int prologLength = qMin(length, offsetToAlignOn16Bytes);\
     for (; x < prologLength; ++x) { \
-        uint s = src[x]; \
-        if (s >= 0xff000000) \
-            dst[x] = s; \
-        else if (s != 0) \
+        quint32 s = src[x]; \
+        if (s != 0) { \
+            s = BYTE_MUL(s, const_alpha); \
             dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); \
+        } \
     } \
 \
     for (; x < length-3; x += 4) { \
-- 
cgit v0.12