Included ARM NEON optimizations from pixman in Qt.

On the N900, 16-bit text blending is 30-50% faster, and ARGB32PM on RGB16 image blending now runs in 1/10th of the time it used to.

We now make ARGB32PM the default pixmap format for alpha pixmaps instead of ARGB8565PM, which is unaligned and bad for performance. The relevant numbers:

Mostly opaque pixels:
ARGB24 on ARGB24 using QPainter..................: 336,813033
ARGB32 on ARGB32 using QPainter.................: 18,419387
RGB16 on ARGB24 using QPainter..................: 167,301014
RGB16 on ARGB32 using QPainter..................: 17,279372
ARGB24 on RGB16 using QPainter..................: 35,100147
ARGB32PM on RGB16 using QPainter................: 15,924256

No opaque pixels:
ARGB24 on ARGB24 using QPainter..................: 412,190765
ARGB32 on ARGB32 using QPainter.................: 16,818389
RGB16 on ARGB24 using QPainter..................: 170,957878
RGB16 on ARGB32 using QPainter..................: 16,742984
ARGB24 on RGB16 using QPainter..................: 93,600482
ARGB32PM on RGB16 using QPainter................: 15,999310

So switching to ARGB32PM should give a boost in all areas.

Task-number: QTBUG-6684
Reviewed-by: Gunnar Sletta
author: Samuel Rødal <sroedal@trolltech.com> 2010-03-08 12:38:08 (GMT)
committer: Samuel Rødal <sroedal@trolltech.com> 2010-03-26 09:48:59 (GMT)
commit: 348d22c37611066dc7efc9aac820d77bcf3bbbab (patch)
tree: 2def91d28881b2ec809bf1eb4a64a71b08090530 /src/gui/painting/qdrawhelper_neon.cpp
parent: 8d5ae9bca2cbe8c5a7f764b8ba325f79c0bbfe62 (diff)
download: Qt-348d22c37611066dc7efc9aac820d77bcf3bbbab.zip
Qt-348d22c37611066dc7efc9aac820d77bcf3bbbab.tar.gz
Qt-348d22c37611066dc7efc9aac820d77bcf3bbbab.tar.bz2
1 files changed, 100 insertions, 44 deletions
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index 77c5202..ef1b85c 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -44,6 +44,7 @@
 #ifdef QT_HAVE_NEON
 
 #include <private/qdrawhelper_neon_p.h>
+#include <private/qpaintengine_raster_p.h>
 #include <arm_neon.h>
 
 QT_BEGIN_NAMESPACE
@@ -87,6 +88,79 @@ static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, u
     return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
 }
 
+extern "C" void  // external C-linkage routine (pixman NEON); 32-bit source, 16-bit destination
+pixman_composite_over_8888_0565_asm_neon (int32_t   w,
+                                          int32_t   h,
+                                          uint16_t *dst,
+                                          int32_t   dst_stride,  // callers in this file pass bytes-per-line / 2
+                                          uint32_t *src,
+                                          int32_t   src_stride);  // callers in this file pass bytes-per-line / 4
+
+extern "C" void  // external C-linkage routine (pixman NEON); 32-bit source and destination
+pixman_composite_over_8888_8888_asm_neon (int32_t   w,
+                                          int32_t   h,
+                                          uint32_t *dst,
+                                          int32_t   dst_stride,  // callers in this file pass bytes-per-line / 4
+                                          uint32_t *src,
+                                          int32_t   src_stride);  // callers in this file pass bytes-per-line / 4
+
+extern "C" void  // external C-linkage routine (pixman NEON); 16-bit source, 32-bit destination
+pixman_composite_src_0565_8888_asm_neon (int32_t   w,
+                                         int32_t   h,
+                                         uint32_t *dst,
+                                         int32_t   dst_stride,  // callers in this file pass bytes-per-line / 4
+                                         uint16_t *src,
+                                         int32_t   src_stride);  // callers in this file pass bytes-per-line / 2
+
+// Implemented in qblendfunctions.cpp; called from this file when const_alpha != 256.
+void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
+                                          const uchar *srcPixels, int sbpl,
+                                          int w, int h,
+                                          int const_alpha);
+
+void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
+                                   const uchar *srcPixels, int sbpl,
+                                   int w, int h,
+                                   int const_alpha)
+{   // Draws a w x h rectangle of 16-bit source pixels onto a 32-bit destination.
+    dbpl /= 4;  // destination stride: bytes per line -> 32-bit pixels per line (assumes dbpl is in bytes)
+    sbpl /= 2;  // source stride: bytes per line -> 16-bit pixels per line (assumes sbpl is in bytes)
+
+    quint32 *dst = (quint32 *) destPixels;
+    quint16 *src = (quint16 *) srcPixels;
+
+    if (const_alpha != 256) {  // any value other than 256 takes the scalar per-pixel path below
+        quint8 a = (255 * const_alpha) >> 8;  // source weight on a 0..255 scale
+        quint8 ia = 255 - a;  // complementary weight for the existing destination pixel
+
+        while (h--) {  // one iteration per scanline
+            for (int x=0; x<w; ++x)
+                dst[x] = INTERPOLATE_PIXEL_255(qt_colorConvert(src[x], dst[x]), a, dst[x], ia);  // converted source weighted by a, old destination by ia
+            dst += dbpl;  // advance both pointers by one scanline (strides are in pixels here)
+            src += sbpl;
+        }
+        return;
+    }
+
+    pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);  // const_alpha == 256: whole rectangle handled by the NEON assembly routine
+}
+
+void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
+                                   const uchar *srcPixels, int sbpl,
+                                   int w, int h,
+                                   int const_alpha)
+{   // Draws a w x h rectangle of 32-bit source pixels onto a 16-bit destination.
+    if (const_alpha != 256) {  // any value other than 256 is delegated to the non-NEON helper
+        qt_blend_argb32_on_rgb16_const_alpha(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
+        return;
+    }
+
+    quint16 *dst = (quint16 *) destPixels;
+    quint32 *src = (quint32 *) srcPixels;
+
+    pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);  // strides divided by the pixel size (2 and 4 bytes)
+}
+
 void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
                                     const uchar *srcPixels, int sbpl,
                                     int w, int h,
@@ -97,50 +171,7 @@ void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
     uint16x8_t half = vdupq_n_u16(0x80);
     uint16x8_t full = vdupq_n_u16(0xff);
     if (const_alpha == 256) {
-        for (int y = 0; y < h; ++y) {
-            int x = 0;
-            for (; x < w-3; x += 4) {
-                uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
-                if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) {
-                    // all opaque
-                    vst1q_u32((uint32_t *)&dst[x], src32);
-                } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
-                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
-
-                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
-                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
-
-                    const uint8x8_t src8_low = vget_low_u8(src8);
-                    const uint8x8_t dst8_low = vget_low_u8(dst8);
-
-                    const uint8x8_t src8_high = vget_high_u8(src8);
-                    const uint8x8_t dst8_high = vget_high_u8(dst8);
-
-                    const uint16x8_t src16_low = vmovl_u8(src8_low);
-                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);
-
-                    const uint16x8_t src16_high = vmovl_u8(src8_high);
-                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);
-
-                    const uint16x8_t result16_low = qvsource_over_u16(src16_low, dst16_low, half, full);
-                    const uint16x8_t result16_high = qvsource_over_u16(src16_high, dst16_high, half, full);
-
-                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
-                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
-
-                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
-                }
-            }
-            for (; x<w; ++x) {
-                uint s = src[x];
-                if (s >= 0xff000000)
-                    dst[x] = s;
-                else if (s != 0)
-                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
-            }
-            dst = (quint32 *)(((uchar *) dst) + dbpl);
-            src = (const quint32 *)(((const uchar *) src) + sbpl);
-        }
+        pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
     } else if (const_alpha != 0) {
         const_alpha = (const_alpha * 255) >> 8;
         uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
@@ -254,6 +285,31 @@ void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
     }
 }
 
+extern "C" void  // external C-linkage routine (pixman NEON); single 32-bit colour, 8-bit mask, 16-bit destination
+pixman_composite_over_n_8_0565_asm_neon (int32_t    w,
+                                         int32_t    h,
+                                         uint16_t  *dst,
+                                         int32_t    dst_stride,  // the caller in this file passes bytes-per-line / sizeof(quint16)
+                                         uint32_t   src,  // the caller in this file passes its quint32 colour here
+                                         int32_t    unused,  // the caller in this file passes 0
+                                         uint8_t   *mask,
+                                         int32_t    mask_stride);
+
+
+void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
+                                  int x, int y, quint32 color,
+                                  const uchar *bitmap,
+                                  int mapWidth, int mapHeight, int mapStride,
+                                  const QClipData *)  // clip argument is unnamed and never consulted in this function
+{   // Applies an 8-bit map (bitmap) with a single colour onto a 16-bit raster buffer at (x, y).
+    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;  // first destination pixel: row y, column x
+    const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);  // stride in 16-bit pixels rather than bytes
+
+    uchar *mask = const_cast<uchar *>(bitmap);  // the assembly prototype takes a non-const uint8_t *
+
+    pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride);  // mapStride is forwarded unchanged as the mask stride
+}
+
 QT_END_NAMESPACE
 
 #endif // QT_HAVE_NEON
author	Samuel Rødal <sroedal@trolltech.com>	2010-03-08 12:38:08 (GMT)
committer	Samuel Rødal <sroedal@trolltech.com>	2010-03-26 09:48:59 (GMT)
commit	348d22c37611066dc7efc9aac820d77bcf3bbbab (patch)
tree	2def91d28881b2ec809bf1eb4a64a71b08090530 /src/gui/painting/qdrawhelper_neon.cpp
parent	8d5ae9bca2cbe8c5a7f764b8ba325f79c0bbfe62 (diff)
download	Qt-348d22c37611066dc7efc9aac820d77bcf3bbbab.zip Qt-348d22c37611066dc7efc9aac820d77bcf3bbbab.tar.gz Qt-348d22c37611066dc7efc9aac820d77bcf3bbbab.tar.bz2