Optimized SourceOver and 16-bit dest fetches and dest stores using NEON.

This makes, for example, linear gradient blending on top of RGB16 156% faster (from 20.4 fps to 52.3 fps in my benchmark).

Task-number: QTBUG-6684
Reviewed-by: Gunnar Sletta
author: Samuel Rødal <sroedal@trolltech.com> 2010-03-25 11:14:40 (GMT)
committer: Samuel Rødal <sroedal@trolltech.com> 2010-03-26 09:49:09 (GMT)
commit: fa44a37174f51f3d2786fc6e60d8fa5561a4df6c (patch)
tree: 014108ee7598fb3a246bad894cb0a47da36115b9 /src/gui/painting/qdrawhelper_neon.cpp
parent: 0ad22e6cd1cb353e2e1244c1eb7257cb3af9def4 (diff)
download: Qt-fa44a37174f51f3d2786fc6e60d8fa5561a4df6c.zip
Qt-fa44a37174f51f3d2786fc6e60d8fa5561a4df6c.tar.gz
Qt-fa44a37174f51f3d2786fc6e60d8fa5561a4df6c.tar.bz2
1 files changed, 114 insertions, 11 deletions
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index ca1d85f..946e100 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -114,6 +114,21 @@ pixman_composite_src_0565_8888_asm_neon (int32_t   w,
                                          uint16_t *src,
                                          int32_t   src_stride);
 
+extern "C" void
+pixman_composite_over_n_8_0565_asm_neon (int32_t    w,
+                                         int32_t    h,
+                                         uint16_t  *dst,
+                                         int32_t    dst_stride,
+                                         uint32_t   src,
+                                         int32_t    unused,
+                                         uint8_t   *mask,
+                                         int32_t    mask_stride);
+
+extern "C" void
+pixman_composite_scanline_over_asm_neon (int32_t         w,
+                                         const uint32_t *dst,
+                                         const uint32_t *src);
+
 // qblendfunctions.cpp
 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
                                           const uchar *srcPixels, int sbpl,
@@ -163,6 +178,15 @@ void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
     pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
 }
 
+void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    if (const_alpha == 255) {
+        pixman_composite_scanline_over_asm_neon(length, dest, src);
+    } else {
+        qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
+    }
+}
+
 void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
                                     const uchar *srcPixels, int sbpl,
                                     int w, int h,
@@ -287,17 +311,6 @@ void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
     }
 }
 
-extern "C" void
-pixman_composite_over_n_8_0565_asm_neon (int32_t    w,
-                                         int32_t    h,
-                                         uint16_t  *dst,
-                                         int32_t    dst_stride,
-                                         uint32_t   src,
-                                         int32_t    unused,
-                                         uint8_t   *mask,
-                                         int32_t    mask_stride);
-
-
 void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
                                   int x, int y, quint32 color,
                                   const uchar *bitmap,
@@ -449,6 +462,96 @@ void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
         Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
 }
 
+static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
+{
+    asm volatile (
+        "vld1.16     { d0, d1 }, [%[SRC]]\n\t"
+
+        /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
+           and put data into d4 - red, d3 - green, d2 - blue */
+        "vshrn.u16   d4,  q0,  #8\n\t"
+        "vshrn.u16   d3,  q0,  #3\n\t"
+        "vsli.u16    q0,  q0,  #5\n\t"
+        "vsri.u8     d4,  d4,  #5\n\t"
+        "vsri.u8     d3,  d3,  #6\n\t"
+        "vshrn.u16   d2,  q0,  #2\n\t"
+
+        /* fill d5 - alpha with 0xff */
+        "mov         r2, #255\n\t"
+        "vdup.8      d5, r2\n\t"
+
+        "vst4.8      { d2, d3, d4, d5 }, [%[DST]]"
+        : : [DST]"r" (dst), [SRC]"r" (src)
+        : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
+    );
+}
+
+uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
+{
+    const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;
+
+    int i = 0;
+    for (; i < length - 7; i += 8)
+        convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
+
+    if (i < length) {
+        quint16 srcBuffer[8];
+        quint32 dstBuffer[8];
+
+        int tail = length - i;
+        for (int j = 0; j < tail; ++j)
+            srcBuffer[j] = data[i + j];
+
+        convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);
+
+        for (int j = 0; j < tail; ++j)
+            buffer[i + j] = dstBuffer[j];
+    }
+
+    return buffer;
+}
+
+static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
+{
+    asm volatile (
+        "vld4.8      { d0, d1, d2, d3 }, [%[SRC]]\n\t"
+
+        /* convert to r5g6b5 and store it into {d28, d29} */
+        "vshll.u8    q14, d2, #8\n\t"
+        "vshll.u8    q8,  d1, #8\n\t"
+        "vshll.u8    q9,  d0, #8\n\t"
+        "vsri.u16    q14, q8, #5\n\t"
+        "vsri.u16    q14, q9, #11\n\t"
+
+        "vst1.16     { d28, d29 }, [%[DST]]"
+        : : [DST]"r" (dst), [SRC]"r" (src)
+        : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
+    );
+}
+
+void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
+{
+    quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;
+
+    int i = 0;
+    for (; i < length - 7; i += 8)
+        convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
+
+    if (i < length) {
+        quint32 srcBuffer[8];
+        quint16 dstBuffer[8];
+
+        int tail = length - i;
+        for (int j = 0; j < tail; ++j)
+            srcBuffer[j] = buffer[i + j];
+
+        convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);
+
+        for (int j = 0; j < tail; ++j)
+            data[i + j] = dstBuffer[j];
+    }
+}
+
 QT_END_NAMESPACE
 
 #endif // QT_HAVE_NEON
author	Samuel Rødal <sroedal@trolltech.com>	2010-03-25 11:14:40 (GMT)
committer	Samuel Rødal <sroedal@trolltech.com>	2010-03-26 09:49:09 (GMT)
commit	fa44a37174f51f3d2786fc6e60d8fa5561a4df6c (patch)
tree	014108ee7598fb3a246bad894cb0a47da36115b9 /src/gui/painting/qdrawhelper_neon.cpp
parent	0ad22e6cd1cb353e2e1244c1eb7257cb3af9def4 (diff)
download	Qt-fa44a37174f51f3d2786fc6e60d8fa5561a4df6c.zip Qt-fa44a37174f51f3d2786fc6e60d8fa5561a4df6c.tar.gz Qt-fa44a37174f51f3d2786fc6e60d8fa5561a4df6c.tar.bz2