Merge branch '4.7' of git@scm.dev.nokia.troll.no:qt/oslo-staging-2 into 4.7

author: Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@nokia.com> 2010-03-26 10:47:01 (GMT)
committer: Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@nokia.com> 2010-03-26 10:47:01 (GMT)
commit: e7eb7bdf63791ed03257f2f23b1f515e4d89e054 (patch)
tree: 1d580cea9ffbf342a029c73bd2cecc106811ff22
parent: 47472906fd00e0eff820870330d481c4229ee285 (diff)
parent: 41e9adb44137c8839d0d7e131802de198b0e7168 (diff)
download: Qt-e7eb7bdf63791ed03257f2f23b1f515e4d89e054.zip
Qt-e7eb7bdf63791ed03257f2f23b1f515e4d89e054.tar.gz
Qt-e7eb7bdf63791ed03257f2f23b1f515e4d89e054.tar.bz2
29 files changed, 4385 insertions, 541 deletions
diff --git a/doc/src/legal/3rdparty.qdoc b/doc/src/legal/3rdparty.qdoc
index d608038..8d0cd2a 100644
--- a/doc/src/legal/3rdparty.qdoc
+++ b/doc/src/legal/3rdparty.qdoc
@@ -454,4 +454,43 @@
     OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 
     See \c src/3rdparty/webkit/JavaScriptCore/wtf/dtoa.cpp for license details.
+
+    \section1 Pixman (\c pixman) version 0.17.11
+
+    \e{pixman is a library that provides low-level pixel manipulation
+    features such as image compositing and trapezoid rasterization.} -- quoted
+    from \c src/3rdparty/pixman/README
+
+    We are only using the pixman-arm-neon-asm.h and pixman-arm-neon-asm.S
+    source files which have the following copyright and license header:
+
+    \hr
+
+    Copyright © 2009 Nokia Corporation
+
+    Permission is hereby granted, free of charge, to any person obtaining a
+    copy of this software and associated documentation files (the "Software"),
+    to deal in the Software without restriction, including without limitation
+    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+    and/or sell copies of the Software, and to permit persons to whom the
+    Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice (including the next
+    paragraph) shall be included in all copies or substantial portions of the
+    Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+
+    Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+
+    \hr
+
+    See \c src/3rdparty/pixman/pixman-arm-neon-asm.h and
+    \c src/3rdparty/pixman/pixman-arm-neon-asm.S
 */
diff --git a/src/3rdparty/pixman/README b/src/3rdparty/pixman/README
new file mode 100644
index 0000000..843b069
--- /dev/null
+++ b/src/3rdparty/pixman/README
@@ -0,0 +1,26 @@
+pixman is a library that provides low-level pixel manipulation
+features such as image compositing and trapezoid rasterization.
+
+Please submit bugs & patches to the libpixman bugzilla:
+
+       https://bugs.freedesktop.org/enter_bug.cgi?product=pixman
+
+All questions regarding this software should be directed to either the 
+Xorg mailing list:
+
+       http://lists.freedesktop.org/mailman/listinfo/xorg
+
+or the cairo mailing list:
+
+       http://lists.freedesktop.org/mailman/listinfo/cairo
+
+The master development code repository can be found at:
+
+	git://anongit.freedesktop.org/git/pixman
+
+	http://gitweb.freedesktop.org/?p=pixman;a=summary
+
+For more information on the git code manager, see:
+
+	http://wiki.x.org/wiki/GitPage
+
diff --git a/src/3rdparty/pixman/pixman-arm-neon-asm.S b/src/3rdparty/pixman/pixman-arm-neon-asm.S
new file mode 100644
index 0000000..eb8cc4c
--- /dev/null
+++ b/src/3rdparty/pixman/pixman-arm-neon-asm.S
@@ -0,0 +1,1709 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ *  - pixman_composite_over_8888_0565_asm_neon
+ *  - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .fpu neon
+    .arch armv7a
+    .altmacro
+
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
+ * example in linux if unaligned memory accesses are not configured to
+ * generate exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to workaround some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch introduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of function can't support advanced prefetch and fall back
+ *       to the simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
+ * performs OVER compositing operation. Function fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * Template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * NEON registers allocation in general is recommended to be the following:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following registers allocation:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5            - contain loaded destination pixels (they are needed)
+ * d28, d29           - place for storing the result (destination pixels)
+ */
+
+/*
+ * Step one. We need to have some code to do some arithmetics on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. Common template macro can optionally
+ * make our life a bit easier by doing R, G, B, A color components
+ * deinterleaving for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
+ * actually use d0 register for blue channel (a vector of eight 8-bit
+ * values), d1 register for green, d2 for red and d3 for alpha. This
+ * simple conversion can be also done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ *  vuzp.8 d0, d1
+ *  vuzp.8 d2, d3
+ *  vuzp.8 d1, d3
+ *  vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ *  vzip.8 d0, d2
+ *  vzip.8 d1, d3
+ *  vzip.8 d2, d3
+ *  vzip.8 d0, d1
+ *
+ * But pixel can be loaded directly in planar format using VLD4.8 NEON
+ * instruction. It is 1 cycle slower than VLD1.32, so this is not always
+ * desirable, that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8      /* d6 = pixel >> 8, narrowed: RRRRRGGG */
+    vshrn.u16   d7, q2, #3      /* d7 = pixel >> 3, narrowed: GGGGGGBB */
+    vsli.u16    q2, q2, #5      /* q2 = pixel << 5, original low 5 bits kept */
+    vsri.u8     d6, d6, #5      /* red: keep top 5 bits, low 3 = its top 3 bits */
+    vmvn.8      d3, d3      /* invert source alpha */
+    vsri.u8     d7, d7, #6      /* green: keep top 6 bits, low 2 = its top 2 bits */
+    vshrn.u16   d30, q2, #2     /* blue: 8 bits out of the two blue copies in q2 */
+    /* now start alpha blending: scale destination by inverted source alpha,
+       leaving d20 - red, d23 - green, d22 - blue for the '*_tail' macro */
+    vmull.u8    q10, d3, d6     /* q10 = inverted alpha * red   (16-bit) */
+    vmull.u8    q11, d3, d7     /* q11 = inverted alpha * green (16-bit) */
+    vmull.u8    q12, d3, d30    /* q12 = inverted alpha * blue  (16-bit) */
+    vrshr.u16   q13, q10, #8    /* rounding x >> 8, first step of x / 255 */
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13   /* d20 = (x + (x >> 8)) >> 8 with rounding */
+    vraddhn.u16 d23, q11, q3    /* d23 = scaled green */
+    vraddhn.u16 d22, q12, q15   /* d22 = scaled blue */
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+    /* ... continue alpha blending: add source to the scaled destination */
+    vqadd.u8    d16, d2, d20    /* d16 = red, saturating add */
+    vqadd.u8    q9, q0, q11     /* d18 = blue, d19 = green, saturating add */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8    /* red into the top byte of each 16-bit lane */
+    vshll.u8    q8, d19, #8     /* green into the top byte */
+    vshll.u8    q9, d18, #8     /* blue into the top byte */
+    vsri.u16    q14, q8, #5     /* insert green below the top 5 (red) bits */
+    vsri.u16    q14, q9, #11    /* insert blue below the top 11 bits */
+.endm
+
+/*
+ * OK, now we got almost everything that we need. Using the above two
+ * macros, the work can be done right. But now we want to optimize
+ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ *   head
+ *   while (...) {
+ *     tail
+ *     head
+ *   }
+ *   tail
+ *
+ * It may look a bit weird, but this setup allows to hide instruction
+ * latencies better and also utilize dual-issue capability more
+ * efficiently (make pairs of load-store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ *   pixman_composite_over_8888_0565_process_pixblock_tail
+ *   vst1.16     {d28, d29}, [DST_W, :128]!
+ *   vld1.16     {d4, d5}, [DST_R, :128]!
+ *   vld4.8      {d0, d1, d2, d3}, [SRC]!
+ *   pixman_composite_over_8888_0565_process_pixblock_head
+ *   cache_preload 8, 8
+ *
+ * Now it also got some VLD/VST instructions. We simply can't move from
+ * processing one block of pixels to the other one with just arithmetics.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in a bulk. Additionally, destination buffer is already
+ * 16 bytes aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is 'cache_preload' macro. It is used for prefetching
+ * data into CPU L2 cache and improve performance when dealing with large
+ * images which are far larger than cache size. It uses one argument
+ * (actually two, but they need to be the same here) - number of pixels
+ * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
+ * details about this macro. Moreover, if good performance is needed
+ * the code from this macro needs to be copied into '*_tail_head' macro
+ * and mixed with the rest of code for optimal instructions scheduling.
+ * We are actually doing it below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head /* scheduled tail + head; streams differ by indent */
+        vqadd.u8    d16, d2, d20    /* tail: uses the previous block's d2 */
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* next 8 destination pixels */
+        vqadd.u8    q9, q0, q11     /* tail: last use of old q0 before the source reload */
+    vshrn.u16   d6, q2, #8
+    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* next 8 source pixels, planar */
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+        vshll.u8    q14, d16, #8
+                                    PF add PF_X, PF_X, #8 /* PF lines: the prefetch stream */
+        vshll.u8    q8, d19, #8
+                                    PF tst PF_CTL, #0xF
+    vsri.u8     d6, d6, #5
+                                    PF addne PF_X, PF_X, #8
+    vmvn.8      d3, d3      /* invert source alpha */
+                                    PF subne PF_CTL, PF_CTL, #1
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    vmull.u8    q10, d3, d6
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vsri.u16    q14, q8, #5
+                                    PF cmp PF_X, ORIG_W
+        vshll.u8    q9, d18, #8
+    vrshr.u16   q13, q10, #8
+                                    PF subge PF_X, PF_X, ORIG_W
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+                                    PF subges PF_CTL, PF_CTL, #0x10
+        vsri.u16    q14, q9, #11
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vraddhn.u16 d22, q12, q15
+        vst1.16     {d28, d29}, [DST_W, :128]!  /* store the finished previous block */
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_0565_process_pixblock_tail
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* store 8 finished r5g6b5 pixels */
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* load next 8 destination pixels */
+    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* byte-planar load: d0 - blue ... d3 - alpha, as '*_head' expects */
+    pixman_composite_over_8888_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using 'generate_composite_function' macro
+ * to put all the stuff together. We are specifying the name of the function
+ * which we want to get, number of bits per pixel for the source, mask and
+ * destination (0 if unused, like mask in this case). Next come some bit
+ * flags:
+ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
+ *                             and written, for write-only buffer we would use
+ *                             FLAG_DST_WRITEONLY flag instead
+ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ *                             and separate color channels for 32bpp format.
+ * The next things are:
+ *  - the number of pixels processed per iteration (8 in this case, because
+ *    that's the maximum what can fit into four 64-bit NEON registers).
+ *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
+ *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
+ *    prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros, these are 'default_init',
+ * 'default_cleanup' here which are empty (but it is possible to have custom
+ * init/cleanup macros to be able to save/restore some extra NEON registers
+ * like d8-d15 or do anything else) followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance, measured in pixel blocks */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg: result pixels are produced in d28, d29 */ \
+    4,  /* dst_r_basereg: destination pixels are loaded into d4, d5 */ \
+    0,  /* src_basereg: source pixels are loaded into d0 - d3 */ \
+    24  /* mask_basereg: no mask is used here (mask bpp is 0) */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8      /* d6 = pixel >> 8, narrowed: RRRRRGGG */
+    vshrn.u16   d7, q2, #3      /* d7 = pixel >> 3, narrowed: GGGGGGBB */
+    vsli.u16    q2, q2, #5      /* q2 = pixel << 5, original low 5 bits kept */
+    vsri.u8     d6, d6, #5      /* red widened to 8 bits */
+    vsri.u8     d7, d7, #6      /* green widened to 8 bits */
+    vshrn.u16   d30, q2, #2     /* blue widened to 8 bits */
+    /* now start alpha blending with d3 (already inverted by the init macro),
+       leaving d20 - red, d23 - green, d22 - blue for the '*_tail' macro */
+    vmull.u8    q10, d3, d6     /* q10 = d3 * red   (16-bit) */
+    vmull.u8    q11, d3, d7     /* q11 = d3 * green (16-bit) */
+    vmull.u8    q12, d3, d30    /* q12 = d3 * blue  (16-bit) */
+    vrshr.u16   q13, q10, #8    /* rounding x >> 8, first step of x / 255 */
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13   /* d20 = (x + (x >> 8)) >> 8 with rounding */
+    vraddhn.u16 d23, q11, q3    /* d23 = scaled green */
+    vraddhn.u16 d22, q12, q15   /* d22 = scaled blue */
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+    /* ... continue alpha blending: add d0 - d2 to the scaled destination */
+    vqadd.u8    d16, d2, d20    /* d16 = red, saturating add */
+    vqadd.u8    q9, q0, q11     /* d18 = blue, d19 = green, saturating add */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8    /* red into the top byte of each 16-bit lane */
+    vshll.u8    q8, d19, #8     /* green into the top byte */
+    vshll.u8    q9, d18, #8     /* blue into the top byte */
+    vsri.u16    q14, q8, #5     /* insert green below the top 5 (red) bits */
+    vsri.u16    q14, q9, #11    /* insert blue below the top 11 bits */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+    pixman_composite_over_n_0565_process_pixblock_tail
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* next 8 destination pixels; no source load in this macro */
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* store the block finished by '*_tail' */
+    pixman_composite_over_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET   /* address of a stack argument */
+    vld1.32     {d3[0]}, [DUMMY]    /* one 32-bit value; presumably the solid source colour - confirm */
+    vdup.8      d0, d3[0]   /* byte 0 in every lane (added to blue by '*_tail') */
+    vdup.8      d1, d3[1]   /* byte 1 in every lane (added to green) */
+    vdup.8      d2, d3[2]   /* byte 2 in every lane (added to red) */
+    vdup.8      d3, d3[3]   /* byte 3 in every lane (alpha multiplier) */
+    vmvn.8      d3, d3      /* invert source alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, /* name; src, mask, dst bpp (0 = unused) */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance, measured in pixel blocks */ \
+    pixman_composite_over_n_0565_init, \
+    default_cleanup, \
+    pixman_composite_over_n_0565_process_pixblock_head, \
+    pixman_composite_over_n_0565_process_pixblock_tail, \
+    pixman_composite_over_n_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg: result pixels are produced in d28, d29 */ \
+    4,  /* dst_r_basereg: destination pixels are loaded into d4, d5 */ \
+    0,  /* src_basereg: d0 - d3 are filled by the init macro */ \
+    24  /* mask_basereg: no mask is used here (mask bpp is 0) */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head /* planar 8-bit channels -> top byte of 16-bit lanes */
+    vshll.u8    q8, d1, #8      /* q8  = green << 8 */
+    vshll.u8    q14, d2, #8     /* q14 = red << 8 */
+    vshll.u8    q9, d0, #8      /* q9  = blue << 8 */
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail /* pack into r5g6b5 in q14 = {d28, d29} */
+    vsri.u16    q14, q8, #5     /* insert green below the top 5 (red) bits */
+    vsri.u16    q14, q9, #11    /* insert blue below the top 11 bits */
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head /* tail + store + load + head, interleaved */
+        vsri.u16    q14, q8, #5
+                                    PF add PF_X, PF_X, #8 /* PF lines: presumably the prefetch stream */
+                                    PF tst PF_CTL, #0xF
+    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* next 8 source pixels, planar */
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vsri.u16    q14, q9, #11
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vshll.u8    q8, d1, #8
+        vst1.16     {d28, d29}, [DST_W, :128]!  /* store before q14 is overwritten below */
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vshll.u8    q14, d2, #8
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vshll.u8    q9, d0, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, /* name; src, mask, dst bpp */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance, measured in pixel blocks */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head /* no base registers given; presumably defaults apply */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head /* 8 r5g6b5 pixels in q0 -> planar d28 - d31 */
+    vshrn.u16   d30, q0, #8     /* d30 = pixel >> 8, narrowed: RRRRRGGG */
+    vshrn.u16   d29, q0, #3     /* d29 = pixel >> 3, narrowed: GGGGGGBB */
+    vsli.u16    q0, q0, #5      /* q0 = pixel << 5, original low 5 bits kept */
+    vmov.u8     d31, #255       /* d31 = 255 in every lane (fourth stored plane) */
+    vsri.u8     d30, d30, #5    /* red widened to 8 bits */
+    vsri.u8     d29, d29, #6    /* green widened to 8 bits */
+    vshrn.u16   d28, q0, #2     /* blue widened to 8 bits */
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail /* intentionally empty: '*_head' does all the work */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+    pixman_composite_src_0565_8888_process_pixblock_tail
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!     /* interleave the four planes into 8 packed pixels */
+    vld1.16    {d0, d1}, [SRC]!     /* next 8 r5g6b5 source pixels */
+    pixman_composite_src_0565_8888_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, /* name; src, mask, dst bpp */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance, measured in pixel blocks */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head /* no base registers given; presumably defaults apply */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8000_8000_process_pixblock_head /* bytewise saturating add of {d0 - d3} and {d4 - d7} */
+    vqadd.u8    q14, q0, q2     /* {d28, d29} = sat({d0, d1} + {d4, d5}) */
+    vqadd.u8    q15, q1, q3     /* {d30, d31} = sat({d2, d3} + {d6, d7}) */
+.endm
+
+.macro pixman_composite_add_8000_8000_process_pixblock_tail /* intentionally empty: '*_head' does all the work */
+.endm
+
+.macro pixman_composite_add_8000_8000_process_pixblock_tail_head /* loads + store + head, with prefetch stream */
+    vld1.8      {d0, d1, d2, d3}, [SRC]!    /* next 32 source bytes */
+                                    PF add PF_X, PF_X, #32 /* PF lines: presumably the prefetch stream */
+                                    PF tst PF_CTL, #0xF
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next 32 destination bytes */
+                                    PF addne PF_X, PF_X, #32
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store previous result before it is overwritten */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance, measured in pixel blocks */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8000_8000_process_pixblock_head, \
+    pixman_composite_add_8000_8000_process_pixblock_tail, \
+    pixman_composite_add_8000_8000_process_pixblock_tail_head /* no base registers given; presumably defaults apply */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head /* as the 8000_8000 variant, but PF_X steps by 8 */
+    vld1.8      {d0, d1, d2, d3}, [SRC]!    /* next 8 source pixels, packed */
+                                    PF add PF_X, PF_X, #8 /* PF lines: presumably the prefetch stream */
+                                    PF tst PF_CTL, #0xF
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next 8 destination pixels, packed */
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store previous result before it is overwritten */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance, measured in pixel blocks */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8000_8000_process_pixblock_head, /* bytewise add: 8bpp head reused */ \
+    pixman_composite_add_8000_8000_process_pixblock_tail, /* empty tail reused */ \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_asm_neon, 32, 0, 32, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, /* note: no prefetch distance argument in this invocation */ \
+    default_cleanup, \
+    pixman_composite_add_8000_8000_process_pixblock_head, \
+    pixman_composite_add_8000_8000_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head /* scale planar destination d4 - d7 by inverted d3 */
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* do alpha blending */
+    vmull.u8    q8, d24, d4     /* q8  = inverted alpha * d4 (16-bit) */
+    vmull.u8    q9, d24, d5     /* q9  = inverted alpha * d5 */
+    vmull.u8    q10, d24, d6    /* q10 = inverted alpha * d6 */
+    vmull.u8    q11, d24, d7    /* q11 = inverted alpha * d7 */
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail /* finish x / 255, then add d0 - d3; result in d28 - d31 */
+    vrshr.u16   q14, q8, #8     /* rounding x >> 8 for each product */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8    /* d28 = (x + (x >> 8)) >> 8 with rounding */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0, q14    /* {d28, d29} += {d0, d1}, saturating */
+    vqadd.u8    q15, q1, q15    /* {d30, d31} += {d2, d3}, saturating */
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head /* scheduled tail + store + loads + head */
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next 8 destination pixels, planar */
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8 /* PF lines: presumably the prefetch stream */
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14    /* last uses of the previous source block */
+        vqadd.u8    q15, q1, q15
+    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* next 8 source pixels, planar */
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3     /* inverted alpha kept in d22 here ('*_head' uses d24) */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the finished previous block */
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7    /* q11 = {d22, d23}: d22 is overwritten by this last product */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance, measured in pixel blocks */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head /* no base registers given; presumably defaults apply */
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_asm_neon, 32, 0, 32, /* name; src, mask, dst bpp */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, /* note: no prefetch distance argument in this invocation */ \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+    pixman_composite_over_8888_8888_process_pixblock_tail
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next 8 destination pixels; d0 - d3 are not reloaded */
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the block finished by '*_tail' */
+    pixman_composite_over_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET   /* address of a stack argument */
+    vld1.32     {d3[0]}, [DUMMY]    /* one 32-bit value; presumably the solid source colour - confirm */
+    vdup.8      d0, d3[0]   /* byte 0 replicated into every lane of d0 */
+    vdup.8      d1, d3[1]   /* byte 1 replicated into every lane of d1 */
+    vdup.8      d2, d3[2]   /* byte 2 replicated into every lane of d2 */
+    vdup.8      d3, d3[3]   /* byte 3 into d3; left uninverted, '*_head' inverts it into d24 */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_init, /* splats the solid source into d0-d3 */ \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, /* head and tail are shared with over_8888_8888 */ \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8888_process_pixblock_tail_head /* own fused step: it must not reload the source */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head /* fused step: finish block N (deeper indent) while starting block N+1 */
+        vrshr.u16   q14, q8, #8 /* tail: first half of narrowing the 16-bit products q8-q11 back to 8 bits */
+                                    PF add PF_X, PF_X, #8 /* PF-prefixed lines are interleaved prefetch bookkeeping (PF macro is defined elsewhere) */
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8 /* tail: rounding add + take high half gives the 8-bit result per channel */
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14 /* tail: saturating add of the destination pixels still held in d0-d3 */
+        vqadd.u8    q15, q1, q15
+    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]! /* head: next 8 destination pixels into d0-d3 (safe now that q0/q1 were consumed above) */
+    vmvn.8      d22, d3 /* head: bitwise inverse of the destination alpha plane */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! /* tail: store the finished block */
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4 /* head: solid colour planes d4-d7 (set by the init macro) times inverted destination alpha */
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init /* load the 32-bit solid value and splat each of its bytes into d4-d7 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = address of the stack slot at ARGS_STACK_OFFSET */
+    vld1.32     {d7[0]}, [DUMMY] /* 32-bit value into lane 0 of d7 */
+    vdup.8      d4, d7[0] /* byte 0 replicated into all 8 lanes */
+    vdup.8      d5, d7[1] /* byte 1 */
+    vdup.8      d6, d7[2] /* byte 2 */
+    vdup.8      d7, d7[3] /* byte 3, done last because d7 is also the staging register */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_reverse_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, /* head and tail are shared with over_8888_8888 */ \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg: destination is read into d0-d3, swapping roles with the solid colour */ \
+    4,  /* src_basereg: solid colour lives in d4-d7 */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_0565_process_pixblock_head /* inputs: solid source planes d8-d11, a8 mask d24, eight 16-bit destination pixels q2 */
+    /* in: scale every source plane by the mask */
+    vmull.u8    q0, d24, d8 /* widening 8x8 -> 16 bit products */
+    vmull.u8    q1, d24, d9
+    vmull.u8    q6, d24, d10
+    vmull.u8    q7, d24, d11
+    vrshr.u16   q10, q0, #8 /* vrshr + vraddhn narrow the products back to 8 bits with rounding */
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10 /* d0-d3 = masked source planes */
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13
+
+    vshrn.u16   d6, q2, #8 /* d6 = bits 15..8 of each destination pixel (top 5-bit field plus 3 bits of the middle field) */
+    vshrn.u16   d7, q2, #3 /* d7 = bits 10..3 (6-bit middle field plus 2 bits of the low field) */
+    vsli.u16    q2, q2, #5 /* shift left by 5 while keeping the low 5 bits, so the low field also sits at bits 9..5 */
+    vsri.u8     d6, d6, #5 /* keep the top 5 bits and fill the low 3 with a copy of the top bits: 5 -> 8 bit expansion */
+    vmvn.8      d3, d3 /* d3 = bitwise inverse of the masked source alpha */
+    vsri.u8     d7, d7, #6 /* keep the top 6 bits and fill the low 2: 6 -> 8 bit expansion */
+    vshrn.u16   d30, q2, #2 /* d30 = bits 9..2 of the shifted value: low field expanded to 8 bits */
+    /* now do alpha blending: destination channels times inverted alpha */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8 /* same rounding narrow as above */
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13 /* d20 <- channel from d6 */
+    vraddhn.u16 d23, q11, q3 /* d23 <- channel from d7 */
+    vraddhn.u16 d22, q12, q15 /* d22 <- channel from d30 */
+.endm
+
+.macro pixman_composite_over_n_8_0565_process_pixblock_tail /* add masked source to attenuated destination and repack to 16 bits in q14 */
+    vqadd.u8    d16, d2, d20 /* saturating add: source plane d2 + destination top-field channel */
+    vqadd.u8    q9, q0, q11 /* d18 = d0 + d22 (low-field channel), d19 = d1 + d23 (middle-field channel) */
+    /* convert to r5g6b5 */
+    vshll.u8    q14, d16, #8 /* place each 8-bit channel in the top byte of a 16-bit lane */
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5 /* insert the d19 channel below the top 5 bits */
+    vsri.u16    q14, q9, #11 /* insert the d18 channel into the low 5 bits */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head /* fused step: tail of the current block, then head of the next one */
+    pixman_composite_over_n_8_0565_process_pixblock_tail /* result for the current block ends up in q14 (d28, d29) */
+    vst1.16     {d28, d29}, [DST_W, :128]! /* store 8 finished 16-bit pixels and advance DST_W */
+    vld1.16     {d4, d5}, [DST_R, :128]! /* load the next 8 destination pixels into q2 and advance DST_R */
+    vld1.8      {d24}, [MASK]! /* load the next 8 mask bytes and advance MASK */
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+    pixman_composite_over_n_8_0565_process_pixblock_head /* start the next block */
+.endm
+
+/*
+ * This function needs a special initialization of the solid source.
+ * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
+ * offset, split into color components and replicated in d8-d11
+ * registers. Additionally, this function needs all the NEON registers,
+ * so it has to save d8-d15 registers which are callee saved according
+ * to ABI. These registers are restored from 'cleanup' macro. All the
+ * other NEON registers are caller saved, so can be clobbered freely
+ * without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address is computed before vpush changes sp */
+    vpush       {d8-d15} /* save the callee-saved NEON registers */
+    vld1.32     {d11[0]}, [DUMMY] /* 32-bit solid value into lane 0 of d11 */
+    vdup.8      d8, d11[0] /* byte 0 replicated into all 8 lanes */
+    vdup.8      d9, d11[1] /* byte 1 */
+    vdup.8      d10, d11[2] /* byte 2 */
+    vdup.8      d11, d11[3] /* byte 3, done last because d11 is also the staging register */
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup /* undo the register save done by the matching init macro */
+    vpop        {d8-d15} /* restore the callee-saved NEON registers */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_0565_init, /* saves d8-d15 and splats the solid source */ \
+    pixman_composite_over_n_8_0565_cleanup, /* restores d8-d15 */ \
+    pixman_composite_over_n_8_0565_process_pixblock_head, \
+    pixman_composite_over_n_8_0565_process_pixblock_tail, \
+    pixman_composite_over_n_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head /* intentionally empty: a plain copy needs no arithmetic */
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail /* intentionally empty */
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head /* copy step: store the block in flight, fetch the next */
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! /* write 32 bytes (16 pixels of 16 bits) and advance DST_W */
+    vld1.16 {d0, d1, d2, d3}, [SRC]! /* read the next 32 bytes and advance SRC */
+    cache_preload 16, 16 /* prefetch helper defined elsewhere in the file */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_0565_process_pixblock_head, \
+    pixman_composite_src_0565_0565_process_pixblock_tail, \
+    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg: the copy stores straight from the registers it loaded into */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head /* intentionally empty: the fill pattern is prepared once by the init macro */
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail /* intentionally empty */
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head /* fill step: write one block of the constant pattern */
+    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]! /* store 32 bytes from d0-d3 and advance DST_W */
+.endm
+
+.macro pixman_composite_src_n_8_init /* fill d0-d3 with copies of the low byte of the solid value */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = address of the stack slot at ARGS_STACK_OFFSET */
+    vld1.32     {d0[0]}, [DUMMY] /* 32-bit value into lane 0 of d0 */
+    vsli.u64    d0, d0, #8 /* shift-left-and-insert keeps the low 8 bits: byte 0 is copied into byte 1 */
+    vsli.u64    d0, d0, #16 /* bytes 0-1 copied into bytes 2-3 */
+    vsli.u64    d0, d0, #32 /* bytes 0-3 copied into bytes 4-7 */
+    vmov        d1, d0 /* q0 = 16 identical bytes */
+    vmov        q1, q0 /* q1 likewise, giving 32 bytes in d0-d3 */
+.endm
+
+.macro pixman_composite_src_n_8_cleanup /* intentionally empty: init saved nothing */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_neon, 0, 0, 8, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_8_init, /* builds the 32-byte fill pattern in d0-d3 */ \
+    pixman_composite_src_n_8_cleanup, \
+    pixman_composite_src_n_8_process_pixblock_head, \
+    pixman_composite_src_n_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head /* intentionally empty: the fill pattern is prepared once by the init macro */
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail /* intentionally empty */
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head /* fill step: write one block of the constant pattern */
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! /* store 32 bytes (16 pixels of 16 bits) and advance DST_W */
+.endm
+
+.macro pixman_composite_src_n_0565_init /* fill d0-d3 with copies of the low 16 bits of the solid value */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = address of the stack slot at ARGS_STACK_OFFSET */
+    vld1.32     {d0[0]}, [DUMMY] /* 32-bit value into lane 0 of d0 */
+    vsli.u64    d0, d0, #16 /* shift-left-and-insert keeps the low 16 bits: halfword 0 is copied into halfword 1 */
+    vsli.u64    d0, d0, #32 /* halfwords 0-1 copied into halfwords 2-3 */
+    vmov        d1, d0 /* q0 = 8 identical halfwords */
+    vmov        q1, q0 /* q1 likewise, giving 16 pixels in d0-d3 */
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup /* intentionally empty: init saved nothing */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_0565_init, /* builds the 16-pixel fill pattern in d0-d3 */ \
+    pixman_composite_src_n_0565_cleanup, \
+    pixman_composite_src_n_0565_process_pixblock_head, \
+    pixman_composite_src_n_0565_process_pixblock_tail, \
+    pixman_composite_src_n_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head /* intentionally empty: the fill pattern is prepared once by the init macro */
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail /* intentionally empty */
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head /* fill step: write one block of the constant pattern */
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! /* store 32 bytes (8 pixels of 32 bits) and advance DST_W */
+.endm
+
+.macro pixman_composite_src_n_8888_init /* fill d0-d3 with copies of the 32-bit solid value */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = address of the stack slot at ARGS_STACK_OFFSET */
+    vld1.32     {d0[0]}, [DUMMY] /* 32-bit value into lane 0 of d0 */
+    vsli.u64    d0, d0, #32 /* shift-left-and-insert keeps the low 32 bits: word 0 is copied into word 1 */
+    vmov        d1, d0 /* q0 = 4 identical words */
+    vmov        q1, q0 /* q1 likewise, giving 8 pixels in d0-d3 */
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup /* intentionally empty: init saved nothing */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    0, /* prefetch distance */ \
+    pixman_composite_src_n_8888_init, /* builds the 8-pixel fill pattern in d0-d3 */ \
+    pixman_composite_src_n_8888_cleanup, \
+    pixman_composite_src_n_8888_process_pixblock_head, \
+    pixman_composite_src_n_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head /* intentionally empty: a plain copy needs no arithmetic */
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail /* intentionally empty */
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head /* copy step: store the block in flight, fetch the next */
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! /* write 32 bytes (8 pixels of 32 bits) and advance DST_W */
+    vld1.32 {d0, d1, d2, d3}, [SRC]! /* read the next 32 bytes and advance SRC */
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_8888_process_pixblock_head, \
+    pixman_composite_src_8888_8888_process_pixblock_tail, \
+    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg: the copy stores straight from the registers it loaded into */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head /* force the top byte of each 32-bit pixel in d0-d3 to 0xFF */
+    vorr     q0, q0, q2 /* q2 holds 0xFF000000 in every 32-bit lane (set by the init macro) */
+    vorr     q1, q1, q2
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail /* intentionally empty: the head already produced the final pixels */
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head /* fused step: store the block in flight, fetch and fix up the next */
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! /* write 8 finished pixels and advance DST_W */
+    vld1.32 {d0, d1, d2, d3}, [SRC]! /* read the next 8 source pixels and advance SRC */
+    vorr     q0, q0, q2 /* same operation as the head macro: set the top byte of every pixel */
+    vorr     q1, q1, q2
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+.endm
+
+.macro pixman_composite_src_x888_8888_init /* build the constant 0xFF000000 in every 32-bit lane of q2 */
+    vmov.u8  q2, #0xFF /* all 16 bytes = 0xFF */
+    vshl.u32 q2, q2, #24 /* per 32-bit lane: only the top byte stays set */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_x888_8888_init, /* prepares the 0xFF000000 constant in q2 */ \
+    default_cleanup, \
+    pixman_composite_src_x888_8888_process_pixblock_head, \
+    pixman_composite_src_x888_8888_process_pixblock_tail, \
+    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
+    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in: scale every source plane by the mask */
+    vmull.u8    q0, d24, d8 /* widening 8x8 -> 16 bit products */
+    vmull.u8    q1, d24, d9
+    vmull.u8    q6, d24, d10
+    vmull.u8    q7, d24, d11
+    vrshr.u16   q10, q0, #8 /* vrshr + vraddhn narrow the products back to 8 bits with rounding */
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13
+    vmvn.8      d24, d3  /* get inverted alpha (the mask in d24 is no longer needed) */
+    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
+    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4 /* destination planes times inverted alpha, left as 16-bit products for the tail */
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail /* narrow the products q8-q11 from the head and add the masked source */
+    vrshr.u16   q14, q8, #8 /* vrshr + vraddhn narrow the products back to 8 bits with rounding */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0, q14 /* saturating add of the masked source planes d0-d3, result in d28-d31 */
+    vqadd.u8    q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head /* fused step: tail of the current block, then head of the next one */
+    pixman_composite_over_n_8_8888_process_pixblock_tail /* result for the current block ends up in d28-d31 */
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! /* store 8 finished pixels, interleaved, and advance DST_W */
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! /* load the next 8 destination pixels, deinterleaved, and advance DST_R */
+    vld1.8      {d24}, [MASK]! /* load the next 8 mask bytes and advance MASK */
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+    pixman_composite_over_n_8_8888_process_pixblock_head /* start the next block */
+.endm
+
+.macro pixman_composite_over_n_8_8888_init /* save d8-d15, then splat the bytes of the solid source into d8-d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address is computed before vpush changes sp */
+    vpush       {d8-d15} /* save the registers this function clobbers (restored by the cleanup macro) */
+    vld1.32     {d11[0]}, [DUMMY] /* 32-bit solid value into lane 0 of d11 */
+    vdup.8      d8, d11[0] /* byte 0 replicated into all 8 lanes */
+    vdup.8      d9, d11[1] /* byte 1 */
+    vdup.8      d10, d11[2] /* byte 2 */
+    vdup.8      d11, d11[3] /* byte 3, done last because d11 is also the staging register */
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup /* undo the register save done by the matching init macro */
+    vpop        {d8-d15} /* restore d8-d15 */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8888_init, /* saves d8-d15 and splats the solid source */ \
+    pixman_composite_over_n_8_8888_cleanup, /* restores d8-d15 */ \
+    pixman_composite_over_n_8_8888_process_pixblock_head, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}
+     *         dest in          {d4,  d5,  d6,  d7 }
+     *         mask in          {d24, d25, d26, d27}
+     * output: updated src in   {d0,  d1,  d2,  d3 }
+     *         updated mask in  {d24, d25, d26, d3 }
+     */
+    vmull.u8    q0,  d24, d8 /* source plane times the matching mask plane (component-wise) */
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q7,  d27, d11
+    vmull.u8    q9,  d11, d25 /* source alpha (d11) times each colour plane of the mask */
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8 /* vrshr + vraddhn narrow the products back to 8 bits with rounding */
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vrshr.u16   q10, q7,  #8
+    vraddhn.u16 d24, q12, q11 /* updated mask planes overwrite the original mask registers */
+    vraddhn.u16 d25, q9,  q8
+    vraddhn.u16 d26, q13, q6
+    vraddhn.u16 d3,  q7,  q10 /* d3 serves as both updated source alpha and updated mask alpha */
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in {d28, d29, d30, d31}
+     */
+    vmvn.8      d24, d24 /* invert each updated mask plane */
+    vmvn.8      d25, d25
+    vmull.u8    q8,  d24, d4 /* destination plane times inverted mask plane, left as 16-bit products for the tail */
+    vmull.u8    q9,  d25, d5
+    vmvn.8      d26, d26
+    vmvn.8      d27, d3 /* the alpha plane uses the inverse of d3 */
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q14, q8,  #8 /* vrshr + vraddhn narrow the products q8-q11 back to 8 bits with rounding */
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q6,  q10, #8
+    vrshr.u16   q7,  q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6,  q10
+    vraddhn.u16 d31, q7,  q11
+    vqadd.u8    q14, q0,  q14 /* saturating add of the updated source planes d0-d3, result in d28-d31 */
+    vqadd.u8    q15, q1,  q15
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head /* fused step: tail work (deeper indent) interleaved with loads for the next block */
+        vrshr.u16   q14, q8, #8 /* tail: same instruction sequence as the tail macro */
+        vrshr.u16   q15, q9, #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! /* next 8 destination pixels, deinterleaved */
+        vrshr.u16   q6, q10, #8
+        vrshr.u16   q7, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q6, q10
+        vraddhn.u16 d31, q7, q11
+    vld4.8      {d24, d25, d26, d27}, [MASK]! /* next 8 mask pixels, deinterleaved */
+        vqadd.u8    q14, q0, q14 /* tail: finished pixels now in d28-d31 */
+        vqadd.u8    q15, q1, q15
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head /* start the next block; the head does not write q14/q15 */
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! /* store the finished block after the head has been issued */
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init /* save d8-d15, then splat the bytes of the solid source into d8-d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address is computed before vpush changes sp */
+    vpush       {d8-d15} /* save the registers this function clobbers (restored by the cleanup macro) */
+    vld1.32     {d11[0]}, [DUMMY] /* 32-bit solid value into lane 0 of d11 */
+    vdup.8      d8, d11[0] /* byte 0 replicated into all 8 lanes */
+    vdup.8      d9, d11[1] /* byte 1 */
+    vdup.8      d10, d11[2] /* byte 2 */
+    vdup.8      d11, d11[3] /* byte 3, done last because d11 is also the staging register */
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup /* undo the register save done by the matching init macro */
+    vpop        {d8-d15} /* restore d8-d15 */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_8888_ca_init, /* saves d8-d15 and splats the solid source */ \
+    pixman_composite_over_n_8888_8888_ca_cleanup, /* restores d8-d15 */ \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+    /* expecting the replicated solid source alpha in d11 (see the init macro) */
+    /* d8, d9 and d10 are not read by this macro */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24, d25, d26, d27 */
+    vmull.u8    q0, d24, d11 /* 32 mask bytes times the source alpha, as 16-bit products */
+    vmull.u8    q1, d25, d11
+    vmull.u8    q6, d26, d11
+    vmull.u8    q7, d27, d11
+    vrshr.u16   q10, q0, #8 /* vrshr + vraddhn narrow the products back to 8 bits with rounding */
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13
+    vqadd.u8    q14, q0, q2 /* saturating add with the destination (q2 = d4, d5), result in d28-d31 */
+    vqadd.u8    q15, q1, q3 /* q3 = d6, d7 */
+.endm
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail /* intentionally empty: the head already produces the final result */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head /* fused step: store the block in flight, load and process the next */
+    pixman_composite_add_n_8_8_process_pixblock_tail /* expands to nothing, kept for symmetry */
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! /* store 32 finished bytes and advance DST_W */
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]! /* load the next 32 destination bytes and advance DST_R */
+    vld1.8      {d24, d25, d26, d27}, [MASK]! /* load the next 32 mask bytes and advance MASK */
+    cache_preload 32, 32 /* prefetch helper defined elsewhere in the file */
+    pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init /* save d8-d15, then replicate byte 3 of the solid source into d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* address is computed before vpush changes sp */
+    vpush       {d8-d15} /* save the registers this function clobbers (restored by the cleanup macro) */
+    vld1.32     {d11[0]}, [DUMMY] /* 32-bit solid value into lane 0 of d11 */
+    vdup.8      d11, d11[3] /* only byte 3 is kept; d8-d10 are not initialised here */
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup /* undo the register save done by the matching init macro */
+    vpop        {d8-d15} /* restore d8-d15 */
+.endm
+
+generate_composite_function \
+    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8_init, /* saves d8-d15 and prepares the source alpha in d11 */ \
+    pixman_composite_add_n_8_8_cleanup, /* restores d8-d15 */ \
+    pixman_composite_add_n_8_8_process_pixblock_head, \
+    pixman_composite_add_n_8_8_process_pixblock_tail, \
+    pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8, d24, d0 /* 32 mask bytes times the matching source bytes, as 16-bit products */
+    vmull.u8    q9, d25, d1
+    vmull.u8    q10, d26, d2
+    vmull.u8    q11, d27, d3
+    vrshr.u16   q0, q8, #8 /* vrshr + vraddhn narrow the products back to 8 bits with rounding */
+    vrshr.u16   q1, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d0, q0, q8
+    vraddhn.u16 d1, q1, q9
+    vraddhn.u16 d2, q12, q10
+    vraddhn.u16 d3, q13, q11
+    vqadd.u8    q14, q0, q2 /* saturating add with the destination (q2 = d4, d5), result in d28-d31 */
+    vqadd.u8    q15, q1, q3 /* q3 = d6, d7 */
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail /* intentionally empty: the head already produces the final result */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head /* fused step: store the block in flight, load and process the next */
+    pixman_composite_add_8_8_8_process_pixblock_tail /* expands to nothing, kept for symmetry */
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]! /* store 32 finished bytes and advance DST_W */
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]! /* load the next 32 destination bytes and advance DST_R */
+    vld1.8      {d24, d25, d26, d27}, [MASK]! /* load the next 32 mask bytes and advance MASK */
+    vld1.8      {d0, d1, d2, d3}, [SRC]! /* load the next 32 source bytes and advance SRC */
+    cache_preload 32, 32 /* prefetch helper defined elsewhere in the file */
+    pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init /* intentionally empty: the pixblock macros here use only d0-d7 and d16-d31, so nothing is saved */
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup /* intentionally empty: init saved nothing */
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, /* function name, then presumably src/mask/dst bits per pixel -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8_8_8_init, /* empty hook */ \
+    pixman_composite_add_8_8_8_cleanup, /* empty hook */ \
+    pixman_composite_add_8_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8, d27, d0 /* only the fourth mask plane (d27) is used: it scales every source plane */
+    vmull.u8    q9, d27, d1
+    vmull.u8    q10, d27, d2
+    vmull.u8    q11, d27, d3
+    vrshr.u16   q0, q8, #8 /* vrshr + vraddhn narrow the products back to 8 bits with rounding */
+    vrshr.u16   q1, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d0, q0, q8
+    vraddhn.u16 d1, q1, q9
+    vraddhn.u16 d2, q12, q10
+    vraddhn.u16 d3, q13, q11
+    vqadd.u8    q14, q0, q2 /* saturating add with the destination (q2 = d4, d5), result in d28-d31 */
+    vqadd.u8    q15, q1, q3 /* q3 = d6, d7 */
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail /* intentionally empty: the head already produces the final result */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head /* fused step: store the block in flight, load and process the next */
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail /* expands to nothing, kept for symmetry */
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]! /* store 8 finished pixels, interleaved, and advance DST_W */
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]! /* load the next 8 destination pixels, deinterleaved */
+    vld4.8      {d24, d25, d26, d27}, [MASK]! /* load the next 8 mask pixels, deinterleaved */
+    vld4.8      {d0, d1, d2, d3}, [SRC]! /* load the next 8 source pixels, deinterleaved */
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+    pixman_composite_add_8888_8888_8888_process_pixblock_head
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, /* function name, then presumably src/mask/dst bits per pixel -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, /* single-scanline variant built from the same add_8888_8888_8888 pixblock macros */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, /* note: this generator takes no prefetch distance argument */ \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* solid mask is in d15 */
+
+    /* 'in' */
+    vmull.u8    q8, d15, d3 /* every source plane times the mask in d15, as 16-bit products */
+    vmull.u8    q6, d15, d2
+    vmull.u8    q5, d15, d1
+    vmull.u8    q4, d15, d0
+    vrshr.u16   q13, q8, #8 /* vrshr + vraddhn narrow the products back to 8 bits with rounding */
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q11, q5, #8
+    vrshr.u16   q10, q4, #8
+    vraddhn.u16 d3, q8, q13 /* d0-d3 are overwritten with the masked source */
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d1, q5, q11
+    vraddhn.u16 d0, q4, q10
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4 /* destination planes times inverted alpha, left as 16-bit products for the tail */
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail /* narrow the products q8-q11 from the head and add the masked source */
+    vrshr.u16   q14, q8, #8 /* vrshr + vraddhn narrow the products back to 8 bits with rounding */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0, q14 /* saturating add of the masked source planes d0-d3, result in d28-d31 */
+    vqadd.u8    q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head /* fused step: tail of the current block, then head of the next one */
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* next 8 destination pixels; the tail macro below does not read d4-d7 */
+    pixman_composite_over_8888_n_8888_process_pixblock_tail /* result for the current block ends up in d28-d31 */
+    vld4.8     {d0, d1, d2, d3}, [SRC]! /* next 8 source pixels, loaded after the tail has consumed q0/q1 */
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+    pixman_composite_over_8888_n_8888_process_pixblock_head /* start the next block; the head does not write q14/q15 */
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* store the finished block after the head has been issued */
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init /* save d8-d15, then replicate byte 3 of the solid mask into d15 */
+    add         DUMMY, sp, #48 /* NOTE(review): literal offset, unlike the ARGS_STACK_OFFSET-based loads in the other init macros. presumably the mask argument slot -- confirm */
+    vpush       {d8-d15} /* address above is computed before vpush changes sp */
+    vld1.32     {d15[0]}, [DUMMY] /* 32-bit value into lane 0 of d15 */
+    vdup.8      d15, d15[3] /* only byte 3 is kept, replicated into all 8 lanes */
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup /* undo the register save done by the matching init macro */
+    vpop        {d8-d15} /* restore d8-d15 */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_8888_init, /* saves d8-d15 and prepares the solid mask in d15 */ \
+    pixman_composite_over_8888_n_8888_cleanup, /* restores d8-d15 */ \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head /* like the over_8888_n_8888 fused step, plus a per-pixel mask load */
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* next 8 destination pixels; the tail macro below does not read d4-d7 */
+    pixman_composite_over_8888_n_8888_process_pixblock_tail /* result for the current block ends up in d28-d31 */
+    vld4.8     {d0, d1, d2, d3}, [SRC]! /* next 8 source pixels, deinterleaved */
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+    vld4.8     {d12, d13, d14, d15}, [MASK]! /* next 8 mask pixels, deinterleaved: the fourth plane lands in d15, the register the shared head reads */
+    pixman_composite_over_8888_n_8888_process_pixblock_head /* start the next block; the head does not write q14/q15 */
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* store the finished block after the head has been issued */
+.endm
+
+.macro pixman_composite_over_8888_8888_8888_init /* only saves registers: the mask is loaded per block, not prepared here */
+    vpush       {d8-d15} /* save d8-d15 (restored by the cleanup macro) */
+.endm
+
+.macro pixman_composite_over_8888_8888_8888_cleanup /* undo the register save done by the matching init macro */
+    vpop        {d8-d15} /* restore d8-d15 */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, /* function name, then presumably src/mask/dst bits per pixel -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_8888_8888_init, \
+    pixman_composite_over_8888_8888_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, /* head and tail are shared with over_8888_n_8888 */ \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg: mask planes are loaded into d12-d15 */
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, /* single-scanline variant built from the same over_8888_8888_8888 macros */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    pixman_composite_over_8888_8888_8888_init, /* note: this generator takes no prefetch distance argument */ \
+    pixman_composite_over_8888_8888_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg: mask planes are loaded into d12-d15 */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head /* like the over_8888_n_8888 fused step, plus an 8-bit mask load */
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* next 8 destination pixels; the tail macro below does not read d4-d7 */
+    pixman_composite_over_8888_n_8888_process_pixblock_tail /* result for the current block ends up in d28-d31 */
+    vld4.8     {d0, d1, d2, d3}, [SRC]! /* next 8 source pixels, deinterleaved */
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+    vld1.8     {d15}, [MASK]! /* next 8 mask bytes go straight into d15, the register the shared head reads */
+    pixman_composite_over_8888_n_8888_process_pixblock_head /* start the next block; the head does not write q14/q15 */
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* store the finished block after the head has been issued */
+.endm
+
+.macro pixman_composite_over_8888_8_8888_init /* only saves registers: the mask is loaded per block, not prepared here */
+    vpush       {d8-d15} /* save d8-d15 (restored by the cleanup macro) */
+.endm
+
+.macro pixman_composite_over_8888_8_8888_cleanup /* undo the register save done by the matching init macro */
+    vpop        {d8-d15} /* restore d8-d15 */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, /* function name, then presumably src/mask/dst bits per pixel -- confirm in the generator macro */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_8_8888_init, \
+    pixman_composite_over_8888_8_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, /* head and tail are shared with over_8888_n_8888 */ \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, /* explicit comma: do not rely on whitespace to separate macro arguments */ \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    15  /* mask_basereg: the 8-bit mask is loaded into d15 */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head /* intentionally empty: a plain copy needs no arithmetic */
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail /* intentionally empty */
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head /* copy step: store the block in flight, fetch the next */
+    vst3.8 {d0, d1, d2}, [DST_W]! /* write 8 three-byte pixels, interleaved, and advance DST_W (no alignment hint) */
+    vld3.8 {d0, d1, d2}, [SRC]! /* read the next 8 three-byte pixels, deinterleaved, and advance SRC */
+    cache_preload 8, 8 /* prefetch helper defined elsewhere in the file */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, /* function name, then presumably src/mask/dst bits per pixel (0 = not read from memory) -- confirm in the generator macro */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0888_process_pixblock_head, \
+    pixman_composite_src_0888_0888_process_pixblock_tail, \
+    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg: the copy stores straight from the registers it loaded into */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head /* reverse the channel order of the loaded 3-byte pixels */
+    vswp   d0, d2 /* exchange the first and third planes, the middle plane d1 stays in place */
+.endm
+
+/* Intentionally empty: the channel swap is completed in the 'head' macro. */
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+/*
+ * Steady-state loop body: store the current block as four interleaved byte
+ * planes (d0-d2 plus d3 as the fourth byte of every pixel), then load the
+ * next three-plane block from the source and swap d0/d2 for it, which is
+ * the 'head' step of the next iteration.
+ *
+ * NOTE(review): cache_preload only emits code for PREFETCH_TYPE_ADVANCED,
+ * which generate_composite_function does not keep for a 24bpp source, so
+ * the call below looks like a no-op here - confirm.
+ */
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+    vst4.8 {d0, d1, d2, d3}, [DST_W]!
+    vld3.8 {d0, d1, d2}, [SRC]!
+    vswp   d0, d2
+    cache_preload 8, 8
+.endm
+
+/*
+ * 'init' hook: clear d3 once. The pixel block macros of this operation only
+ * load d0-d2, so d3 supplies a constant zero as the fourth byte of every
+ * stored pixel.
+ */
+.macro pixman_composite_src_0888_8888_rev_init
+    veor   d3, d3, d3
+.endm
+
+/*
+ * Instantiate pixman_composite_src_0888_8888_rev_asm_neon: 24bpp source, no
+ * mask, write-only 32bpp destination stored in de-interleaved (vst4) form.
+ * A custom 'init' macro is used together with the default cleanup.
+ */
+generate_composite_function \
+    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_0888_8888_rev_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * First half of the 0888 -> 0565 packing: widen the bytes of planes d1 and
+ * d2 to 16 bits and shift them left by 8, so each value sits in the top
+ * byte of its 16-bit lane (results in q8 and q9).
+ */
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+    vshll.u8    q8, d1, #8
+    vshll.u8    q9, d2, #8
+.endm
+
+/*
+ * Second half of the packing: q14 starts as plane d0 in the top byte of
+ * each 16-bit lane; q8 is then inserted shifted right by 5 and q9 shifted
+ * right by 11. Each output lane therefore holds the top 5 bits of d0 in
+ * bits 15-11, the top 6 bits of d1 in bits 10-5 and the top 5 bits of d2
+ * in bits 4-0.
+ */
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+    vshll.u8    q14, d0, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+/*
+ * Steady-state loop body. The deeper-indented instructions are the 'tail'
+ * of the current block (pack into q14 and store d28/d29); the others are
+ * the load and 'head' of the next block. The two instruction streams are
+ * interleaved by hand, so the statement order must be kept as is: d0 is
+ * consumed by the first vshll before vld3 overwrites it, and q8/q9 are
+ * consumed by the vsri instructions before being recomputed.
+ */
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+        vshll.u8    q14, d0, #8
+    vld3.8 {d0, d1, d2}, [SRC]!
+        vsri.u16    q14, q8, #5
+        vsri.u16    q14, q9, #11
+    vshll.u8    q8, d1, #8
+        vst1.16 {d28, d29}, [DST_W, :128]!
+    vshll.u8    q9, d2, #8
+.endm
+
+/*
+ * Instantiate pixman_composite_src_0888_0565_rev_asm_neon: 24bpp source, no
+ * mask, write-only 16bpp destination. The packed result is produced in q14
+ * (d28/d29), hence dst_w_basereg = 28.
+ */
+generate_composite_function \
+    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Multiply each of the first three byte planes (d0, d1, d2) by the fourth
+ * plane (d3), producing 16-bit products in q8, q9 and q10.
+ * NOTE(review): d3 is presumably the alpha channel, making this a
+ * premultiplication step - confirm against the pixel format definition.
+ */
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+/*
+ * Reduce the 16-bit products back to bytes: each product is shifted right
+ * by 8 with rounding, added to the original product with rounding, and the
+ * high byte of the sum is kept.
+ * NOTE(review): this looks like the usual rounded division by 255 - confirm.
+ *
+ * The results are written in reversed plane order (d0 product -> d30,
+ * d1 -> d29, d2 -> d28) and d3 is swapped into d31, so d28-d31 hold the
+ * output block with the first and third channels exchanged.
+ */
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d30, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d28, q13, q10
+.endm
+
+/*
+ * Steady-state loop body, interleaved by hand in three columns:
+ *   - deeper-indented NEON instructions: 'tail' of the current block and
+ *     the store of d28-d31;
+ *   - less-indented NEON instructions: load of the next block and its
+ *     'head' multiplications;
+ *   - right-hand column: the source-only prefetch sequence, written out
+ *     inline (same instructions, in the same order, as cache_preload 8, 8
+ *     emits when only the source is prefetched).
+ * The statement order matters: d3 is swapped into d31 before vld4 reloads
+ * it, and q8-q10 are consumed by vraddhn before vmull overwrites them.
+ */
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    vld4.8 {d0, d1, d2, d3}, [SRC]!
+        vraddhn.u16 d30, q11, q8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d28, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+/*
+ * Instantiate pixman_composite_src_pixbuf_8888_asm_neon: 32bpp source, no
+ * mask, write-only 32bpp destination, both handled in de-interleaved form.
+ * Source data is loaded into d0-d3 and the result is stored from d28-d31.
+ */
+generate_composite_function \
+    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
diff --git a/src/3rdparty/pixman/pixman-arm-neon-asm.h b/src/3rdparty/pixman/pixman-arm-neon-asm.h
new file mode 100644
index 0000000..56c3fae
--- /dev/null
+++ b/src/3rdparty/pixman/pixman-arm-neon-asm.h
@@ -0,0 +1,906 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions, based on a common template.
+ * Any combinations of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp, 32bpp color formats are supported.
+ *
+ * This macro takes care of:
+ *  - handling of leading and trailing unaligned pixels
+ *  - doing most of the work related to L2 cache preload
+ *  - encouraging the use of software pipelining for better instruction
+ *    scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros, which should implement basic code chunks responsible for
+ * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
+ * examples.
+ *
+ * TODO:
+ *  - try overlapped pixel method (from Ian Rickards) when processing
+ *    exactly two blocks of pixels
+ *  - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ *
+ * FLAG_DST_WRITEONLY is 0: it is the absence of FLAG_DST_READWRITE rather
+ * than a bit of its own. generate_composite_function only tests the
+ * FLAG_DST_READWRITE and FLAG_DEINTERLEAVE_32BPP bits.
+ */
+.set FLAG_DST_WRITEONLY,       0
+.set FLAG_DST_READWRITE,       1
+.set FLAG_DEINTERLEAVE_32BPP,  2
+
+/*
+ * Offset in stack where mask and source pointer/stride can be accessed
+ * from 'init' macro. This is useful for doing special handling for solid mask.
+ *
+ * The value 40 matches the ten 32-bit registers saved by the
+ * 'push {r4-r12, lr}' at the entry of generate_composite_function, which
+ * itself reads these arguments with the literal offsets 40, 44, 48 and 52.
+ */
+.set ARGS_STACK_OFFSET,        40
+
+/*
+ * Constants for selecting preferable prefetch type.
+ *
+ * The numeric order is significant: generate_composite_function compares
+ * these values with '<' and '>' when choosing the prefetch type and the
+ * register allocation.
+ */
+.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+/*
+ * Emit 'op.elem_size {d<reg1>}, [mem_operand]!': a post-incrementing access
+ * to one D register. reg1 is a register number, not a register name. When
+ * abits > 0 it is appended to the address as an alignment qualifier.
+ */
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1}, [&mem_operand&]!
+.endif
+.endm
+
+/*
+ * Same as pixldst1, but for a list of two D registers given by the register
+ * numbers reg1 and reg2.
+ */
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+.endif
+.endm
+
+/*
+ * Same as pixldst1, but for a list of four D registers given by the register
+ * numbers reg1 .. reg4.
+ */
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+.endif
+.endm
+
+/*
+ * Post-incrementing access to a single lane (index idx, lane width
+ * elem_size) of register d<reg1>. The abits argument is accepted so that the
+ * signature matches the other helpers, but it is not used: no alignment
+ * qualifier is ever emitted here.
+ */
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
+    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+.endm
+
+/*
+ * Post-incrementing access to a list of three D registers. Unlike
+ * pixldst1/2/4 there is no abits parameter, so no alignment qualifier can be
+ * requested.
+ */
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+.endm
+
+/*
+ * Post-incrementing access to lane idx of each of three D registers (one
+ * element per register). No alignment qualifier is supported.
+ */
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+.endm
+
+/*
+ * Generic load/store of 'numbytes' bytes of pixel data with instruction
+ * 'op' (pixld passes vld1, pixst passes vst1). The registers are chosen
+ * relative to 'basereg' so that every supported size has its own slot:
+ *   32 bytes -> d(basereg+4) .. d(basereg+7)
+ *   16 bytes -> d(basereg+2), d(basereg+3)
+ *    8 bytes -> d(basereg+1)
+ *    4 bytes -> bytes 4-7 of d(basereg+0)
+ *    2 bytes -> bytes 2-3 of d(basereg+0)
+ *    1 byte  -> byte 1 of d(basereg+0)
+ * For the sizes below 8 bytes a single wide lane access is used when
+ * RESPECT_STRICT_ALIGNMENT is 0 or elem_size already equals the access
+ * width; otherwise the same bytes are transferred lane by lane using
+ * narrower lanes. Any other numbytes value stops assembly with an error.
+ */
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+.if numbytes == 32
+    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+                              %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif numbytes == 16
+    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+.elseif numbytes == 8
+    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+.elseif numbytes == 4
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
+        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+    .elseif elem_size == 16
+        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+    .endif
+.elseif numbytes == 2
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+    .endif
+.elseif numbytes == 1
+    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+/*
+ * Load 'numpix' pixels of 'bpp' bits each from [mem_operand] (with
+ * post-increment) into the register window starting at d<basereg>.
+ * Nothing is emitted when bpp is 0, i.e. when the image is not used.
+ *
+ *  - 8 pixels of 32bpp with DEINTERLEAVE_32BPP_ENABLED: vld4.8 into
+ *    d(basereg+4) .. d(basereg+7), one byte plane per register;
+ *  - 24bpp: vld3.8 into d(basereg+3) .. d(basereg+5) for 8 pixels, or into
+ *    individual lanes of d(basereg+0) .. d(basereg+2) for 4, 2 or 1 pixels
+ *    (lanes 4-7, 2-3 and 1 respectively); abits is ignored on these paths;
+ *  - everything else: plain vld1 through pixldst.
+ */
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+/*
+ * Store counterpart of pixld: write 'numpix' pixels of 'bpp' bits each from
+ * the register window starting at d<basereg> to [mem_operand] (with
+ * post-increment). The register and lane selection is identical to pixld,
+ * with vst4/vst3/vst1 used in place of vld4/vld3/vld1, so data loaded by
+ * pixld with the same arguments is stored back from the same slots.
+ * Nothing is emitted when bpp is 0.
+ */
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+/*
+ * pixld with an alignment hint: abits is set to the size of the access in
+ * bits (bpp * numpix), capped at 128.
+ */
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixld numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+/*
+ * pixst with an alignment hint: abits is set to the size of the access in
+ * bits (bpp * numpix), capped at 128.
+ */
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixst numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+/*
+ * vuzp.8 on two D registers given as register numbers, so that callers can
+ * pass computed values such as %(basereg+1).
+ */
+.macro vuzp8 reg1, reg2
+    vuzp.8 d&reg1, d&reg2
+.endm
+
+/*
+ * vzip.8 on two D registers given as register numbers, so that callers can
+ * pass computed values such as %(basereg+1).
+ */
+.macro vzip8 reg1, reg2
+    vzip.8 d&reg1, d&reg2
+.endm
+
+/*
+ * deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers
+ *
+ * Operates on d(basereg+0) .. d(basereg+3). Expands to nothing unless bpp
+ * is 32 and DEINTERLEAVE_32BPP_ENABLED is set for the current function.
+ */
+.macro pixdeinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vuzp8 %(basereg+0), %(basereg+1)
+    vuzp8 %(basereg+2), %(basereg+3)
+    vuzp8 %(basereg+1), %(basereg+3)
+    vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/*
+ * interleave B, G, R, A channels for eight 32bpp pixels in 4 registers
+ *
+ * Operates on d(basereg+0) .. d(basereg+3). Expands to nothing unless bpp
+ * is 32 and DEINTERLEAVE_32BPP_ENABLED is set for the current function.
+ */
+.macro pixinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vzip8 %(basereg+0), %(basereg+2)
+    vzip8 %(basereg+1), %(basereg+3)
+    vzip8 %(basereg+2), %(basereg+3)
+    vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * cache preload logic is mostly independent from the rest of pixels
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. Prefetch distance is
+ * handled in an 'incremental' way: it starts from 0 and advances to the
+ * optimal distance over time. After reaching optimal prefetch distance,
+ * it is kept constant. There are some checks which prevent prefetching
+ * unneeded pixel lines below the image (but it still can prefetch a bit
+ * more data on the right side of the image - not a big issue and may
+ * be actually helpful when rendering text glyphs). Additional trick is
+ * the use of LDR instruction for prefetch instead of PLD when moving to
+ * the next line, the point is that we have a high chance of getting TLB
+ * miss in this case, and PLD would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working with
+ * fully cached data). But in reality, due to having a separate pipeline and
+ * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
+ * execute simultaneously with NEON and be completely shadowed by it. Thus
+ * we get no performance overhead at all (*). This looks like a very nice
+ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
+ * but still can implement some rather advanced prefetch logic in software
+ * for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixels processing like simple copy. Anyway, having prefetch is a must
+ * when working with the graphics data.
+ */
+/*
+ * Conditional instruction prefix: 'PF <mnemonic> <operands>' emits the
+ * instruction only when the current function uses PREFETCH_TYPE_ADVANCED,
+ * and expands to nothing otherwise. 'a' receives the mnemonic and 'x' all
+ * the remaining text of the line.
+ */
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+    a x
+.endif
+.endm
+
+/*
+ * One step of the advanced prefetcher. All instructions go through PF, so
+ * the macro emits nothing unless PREFETCH_TYPE_ADVANCED is active, and also
+ * nothing when there is no source, no destination read and no mask.
+ *
+ * std_increment   - number of pixels PF_X always advances by (may be 0)
+ * boost_increment - extra pixels PF_X advances by while the low 4 bits of
+ *                   PF_CTL are non-zero; those bits are decremented each
+ *                   time the boost is applied
+ *
+ * After advancing, one pld is issued per used image at pixel index PF_X.
+ * When PF_X has reached ORIG_W, it wraps around by ORIG_W and 0x10 is
+ * subtracted from PF_CTL; if that result is still non-negative, the
+ * prefetch pointers are moved to the next scanline with a byte load
+ * (ldrb with writeback) rather than pld.
+ */
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if regs_shortage
+    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+.endif
+.if std_increment != 0
+    PF add PF_X, PF_X, #std_increment
+.endif
+    /* apply the boost while the distance counter in PF_CTL[3:0] is non-zero */
+    PF tst PF_CTL, #0xF
+    PF addne PF_X, PF_X, #boost_increment
+    PF subne PF_CTL, PF_CTL, #1
+    /* flags from this compare are consumed by the 'ge' instructions below */
+    PF cmp PF_X, ORIG_W
+.if src_bpp_shift >= 0
+    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+.endif
+    PF subge PF_X, PF_X, ORIG_W
+    /* updates the flags: the loads below run only if the result is >= 0 */
+    PF subges PF_CTL, PF_CTL, #0x10
+.if src_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endif
+.if dst_r_bpp != 0
+    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+.endif
+.if mask_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+.endif
+.endif
+.endm
+
+/*
+ * Simple prefetch: when PREFETCH_TYPE_SIMPLE is active, issue one pld per
+ * used image at a fixed byte offset ahead of the current SRC, DST_R and
+ * MASK pointers. The offset is PREFETCH_DISTANCE_SIMPLE scaled by the
+ * image's bytes per pixel. Expands to nothing for any other prefetch type.
+ */
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+.if src_bpp > 0
+    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+.endif
+.if dst_r_bpp > 0
+    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+/*
+ * Macro which is used to process leading pixels until destination
+ * pointer is properly aligned (at 16 bytes boundary). When destination
+ * buffer uses 24bpp format, no alignment is attempted: the whole body is
+ * skipped and only the final label is emitted.
+ *
+ * The work is done in three phases:
+ *  1. for each power-of-two byte count 'lowbit' that is at least one pixel
+ *     and less than a full block, load that many bytes worth of pixels if
+ *     the corresponding bit is set in the destination address (for
+ *     lowbit == 16 the test is omitted, so that chunk is always loaded);
+ *  2. run head/tail once over the partially filled registers;
+ *  3. store the results back using the same address bit tests, this time
+ *     on DST_W, which still holds the original unaligned address.
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+                                        process_pixblock_tail, \
+                                        process_pixblock_tail_head
+.if dst_w_bpp != 24
+    /* nothing to do if the destination is already 16-byte aligned */
+    tst         DST_R, #0xF
+    beq         2f
+
+.irp lowbit, 1, 2, 4, 8, 16
+/* NOTE(review): 'skip1' is declared but not referenced in this macro */
+local skip1
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_R, #lowbit
+    beq         1f
+.endif
+    pixld       (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+    /* destination is not read: just advance the read pointer */
+    add         DST_R, DST_R, #lowbit
+.endif
+    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+    sub         W, W, #(lowbit * 8 / dst_w_bpp)
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    process_pixblock_tail
+
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_W, #lowbit
+    beq         1f
+.endif
+    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+1:
+.endif
+.endr
+.endif
+2:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing performs operation on
+ * pixblock_size pixels, anything smaller than this has to be loaded
+ * and stored in a special way. Loading and storing of pixel data is
+ * performed in such a way that we fill some 'slots' in the NEON
+ * registers (some slots naturally are unused), then perform compositing
+ * operation as usual. In the end, the data is taken from these 'slots'
+ * and saved to memory.
+ *
+ * The remaining count is taken from the low bits of W: each set bit
+ * selects one power-of-two sized chunk (16, 8, 4, 2, 1 pixels, limited to
+ * sizes below pixblock_size). W itself is not modified here, so the same
+ * bit tests drive both the load pass and the store pass.
+ *
+ * cache_preload_flag - set to 0 to suppress prefetch
+ * dst_aligned_flag   - selects whether destination buffer
+ *                      is aligned (non-zero enables the alignment-hinted
+ *                      pixld_a/pixst_a variants for destination accesses)
+ */
+.macro process_trailing_pixels cache_preload_flag, \
+                               dst_aligned_flag, \
+                               process_pixblock_head, \
+                               process_pixblock_tail, \
+                               process_pixblock_tail_head
+    /* skip everything if W is a multiple of pixblock_size */
+    tst         W, #(pixblock_size - 1)
+    beq         2f
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+    pixld       chunk_size, src_bpp, src_basereg, SRC
+    pixld       chunk_size, mask_bpp, mask_basereg, MASK
+.if dst_aligned_flag != 0
+    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if cache_preload_flag != 0
+    PF add      PF_X, PF_X, #chunk_size
+.endif
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+.if cache_preload_flag != 0
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+.endif
+    process_pixblock_tail
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+.if dst_aligned_flag != 0
+    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+1:
+.endif
+.endr
+2:
+.endm
+
+/*
+ * Macro, which performs all the needed operations to switch to the next
+ * scanline and start the next loop iteration unless all the scanlines
+ * are already processed.
+ *
+ * W is restored to the full width (from ORIG_W, or from the stack together
+ * with H when regs_shortage is set). Each used pointer is then advanced by
+ * its stride and, except for 24bpp formats, moved back by the W pixels
+ * that were just consumed.
+ * NOTE(review): for 24bpp the move back is presumably already folded into
+ * the stride by the 'sub ..._STRIDE, ..._STRIDE, W' pairs in
+ * generate_composite_function - confirm.
+ *
+ * The flags set by 'subs H' are not touched by the following mov/str, so
+ * the final 'bge' branches to start_of_loop_label while the decremented H
+ * is still >= 0.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+.if regs_shortage
+    ldrd        W, [sp] /* load W and H (width and height) from stack */
+.else
+    mov         W, ORIG_W
+.endif
+    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
+.if src_bpp != 0
+    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24)
+    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+    sub         SRC, SRC, W, lsl #src_bpp_shift
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+    sub         MASK, MASK, W, lsl #mask_bpp_shift
+.endif
+    subs        H, H, #1
+    mov         DST_R, DST_W
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.endif
+    bge         start_of_loop_label
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * d0, d1, d2, d3     - reserved for loading source pixel data
+ * d4, d5, d6, d7     - reserved for loading destination pixel data
+ * d24, d25, d26, d27 - reserved for loading mask pixel data
+ * d28, d29, d30, d31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags, \
+                                   pixblock_size_, \
+                                   prefetch_distance, \
+                                   init, \
+                                   cleanup, \
+                                   process_pixblock_head, \
+                                   process_pixblock_tail, \
+                                   process_pixblock_tail_head, \
+                                   dst_w_basereg_ = 28, \
+                                   dst_r_basereg_ = 4, \
+                                   src_basereg_   = 0, \
+                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    push        {r4-r12, lr}        /* save all registers */
+
+/*
+ * Select prefetch type for this function. If prefetch distance is
+ * set to 0, prefetch is disabled altogether (NONE). Otherwise, if one
+ * of the color formats is 24bpp, SIMPLE prefetch has to be used instead
+ * of ADVANCED.
+ */
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if prefetch_distance == 0
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+/*
+ * Assign symbolic names to registers
+ */
+    W           .req        r0      /* width (is updated during processing) */
+    H           .req        r1      /* height (is updated during processing) */
+    DST_W       .req        r2      /* destination buffer pointer for writes */
+    DST_STRIDE  .req        r3      /* destination image stride */
+    SRC         .req        r4      /* source buffer pointer */
+    SRC_STRIDE  .req        r5      /* source image stride */
+    DST_R       .req        r6      /* destination buffer pointer for reads */
+
+    MASK        .req        r7      /* mask pointer */
+    MASK_STRIDE .req        r8      /* mask stride */
+
+    PF_CTL      .req        r9      /* combined lines counter and prefetch */
+                                    /* distance increment counter */
+    PF_X        .req        r10     /* pixel index in a scanline for current */
+                                    /* prefetch position */
+    PF_SRC      .req        r11     /* pointer to source scanline start */
+                                    /* for prefetch purposes */
+    PF_DST      .req        r12     /* pointer to destination scanline start */
+                                    /* for prefetch purposes */
+    PF_MASK     .req        r14     /* pointer to mask scanline start */
+                                    /* for prefetch purposes */
+/*
+ * Check whether we have enough registers for all the local variables.
+ * If we don't have enough registers, original width and height are
+ * kept on top of stack (and 'regs_shortage' variable is set to indicate
+ * this for the rest of code). Even if there are enough registers, the
+ * allocation scheme may be a bit different depending on whether source
+ * or mask is not used.
+ */
+.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
+    ORIG_W      .req        r10     /* saved original width */
+    DUMMY       .req        r12     /* temporary register */
+    .set        regs_shortage, 0
+.elseif mask_bpp == 0
+    ORIG_W      .req        r7      /* saved original width */
+    DUMMY       .req        r8      /* temporary register */
+    .set        regs_shortage, 0
+.elseif src_bpp == 0
+    ORIG_W      .req        r4      /* saved original width */
+    DUMMY       .req        r5      /* temporary register */
+    .set        regs_shortage, 0
+.else
+    ORIG_W      .req        r1      /* saved original width */
+    DUMMY       .req        r1      /* temporary register */
+    .set        regs_shortage, 1
+.endif
+
+    .set mask_bpp_shift, -1
+.if src_bpp == 32
+    .set src_bpp_shift, 2
+.elseif src_bpp == 24
+    .set src_bpp_shift, 0
+.elseif src_bpp == 16
+    .set src_bpp_shift, 1
+.elseif src_bpp == 8
+    .set src_bpp_shift, 0
+.elseif src_bpp == 0
+    .set src_bpp_shift, -1
+.else
+    .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+    .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+    .set mask_bpp_shift, -1
+.else
+    .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+    .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+    .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+    .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+    .set dst_bpp_shift, 0
+.else
+    .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if prefetch_distance < 0 || prefetch_distance > 15
+    .error "invalid prefetch distance (prefetch_distance)"
+.endif
+
+.if src_bpp > 0
+    ldr         SRC, [sp, #40]
+.endif
+.if mask_bpp > 0
+    ldr         MASK, [sp, #48]
+.endif
+    PF mov      PF_X, #0
+.if src_bpp > 0
+    ldr         SRC_STRIDE, [sp, #44]
+.endif
+.if mask_bpp > 0
+    ldr         MASK_STRIDE, [sp, #52]
+.endif
+    mov         DST_R, DST_W
+
+.if src_bpp == 24
+    sub         SRC_STRIDE, SRC_STRIDE, W
+    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+    sub         MASK_STRIDE, MASK_STRIDE, W
+    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+    sub         DST_STRIDE, DST_STRIDE, W
+    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+    PF mov      PF_SRC, SRC
+    PF mov      PF_DST, DST_R
+    PF mov      PF_MASK, MASK
+    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+    PF mov      PF_CTL, H, lsl #4
+    PF add      PF_CTL, #(prefetch_distance - 0x10)
+
+    init
+.if regs_shortage
+    push        {r0, r1}
+.endif
+    subs        H, H, #1
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.else
+    mov         ORIG_W, W
+.endif
+    blt         9f
+    cmp         W, #(pixblock_size * 2)
+    blt         8f
+/*
+ * This is the start of the pipelined loop, which is optimized for
+ * long scanlines
+ */
+0:
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    pixld       pixblock_size, src_bpp, \
+                (src_basereg - pixblock_size * src_bpp / 64), SRC
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    PF add      PF_X, PF_X, #pixblock_size
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    subs        W, W, #(pixblock_size * 2)
+    blt         2f
+1:
+    process_pixblock_tail_head
+    cache_preload_simple
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 1, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 0b
+
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+/*
+ * This is the start of the loop, designed to process images with small width
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch are used.
+ */
+8:
+    /* Process exactly pixblock_size pixels if needed */
+    tst         W, #pixblock_size
+    beq         1f
+    pixld       pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    pixld       pixblock_size, src_bpp, \
+                (src_basereg - pixblock_size * src_bpp / 64), SRC
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    process_pixblock_tail
+    pixst       pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+1:
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 8b
+9:
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      ORIG_W
+    .unreq      W
+    .unreq      H
+    .unreq      SRC_STRIDE
+    .unreq      DST_STRIDE
+    .unreq      MASK_STRIDE
+    .unreq      PF_CTL
+    .unreq      PF_X
+    .unreq      PF_SRC
+    .unreq      PF_DST
+    .unreq      PF_MASK
+    .unreq      DUMMY
+    .endfunc
+.endm
+
+/*
+ * A simplified variant of the function generation template that processes a
+ * single scanline (used to implement the pixman combine functions).
+ */
+.macro generate_composite_function_single_scanline fname, \
+                                                   src_bpp_, \
+                                                   mask_bpp_, \
+                                                   dst_w_bpp_, \
+                                                   flags, \
+                                                   pixblock_size_, \
+                                                   init, \
+                                                   cleanup, \
+                                                   process_pixblock_head, \
+                                                   process_pixblock_tail, \
+                                                   process_pixblock_tail_head, \
+                                                   dst_w_basereg_ = 28, \
+                                                   dst_r_basereg_ = 4, \
+                                                   src_basereg_   = 0, \
+                                                   mask_basereg_  = 24
+    /* On entry: r0 = W, r1 = DST_W, r2 = SRC, r3 = MASK (see the aliases below) */
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE  /* prefetch type for this template: none */
+/*
+ * Publish the macro arguments as assembler symbols so that the
+ * other macros invoked below can read them
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+/*
+ * Assign symbolic names to registers
+ */
+    W           .req        r0      /* width (is updated during processing) */
+    DST_W       .req        r1      /* destination buffer pointer for writes */
+    SRC         .req        r2      /* source buffer pointer */
+    DST_R       .req        ip      /* destination buffer pointer for reads */
+    MASK        .req        r3      /* mask pointer */
+    /* dst_r_bpp is dst_w_bpp when FLAG_DST_READWRITE is set in flags, otherwise 0 */
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+    init
+    mov         DST_R, DST_W  /* reads and writes start at the same destination address */
+
+    cmp         W, #pixblock_size
+    blt         8f  /* scanline narrower than one block: trailing-pixels path only */
+
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+    /* Skip the pipelined loop unless at least one full block is still left */
+    subs        W, W, #pixblock_size
+    blt         7f
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    pixld       pixblock_size, src_bpp, \
+                (src_basereg - pixblock_size * src_bpp / 64), SRC
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    subs        W, W, #pixblock_size
+    blt         2f  /* fewer than two full blocks: no tail_head iterations */
+1:
+    process_pixblock_tail_head
+    subs        W, W, #pixblock_size
+    bge         1b  /* repeat while another full block remains */
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+7:
+    /* Process the remaining trailing pixels in the scanline (dst aligned) */
+    process_trailing_pixels 0, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+    bx         lr  /* exit */
+8:
+    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+    bx          lr  /* exit */
+    /* Drop the register aliases defined at the top of this macro */
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      W
+    .endfunc
+.endm
+
+.macro default_init  /* expands to nothing; presumably passed as init when no setup is needed */
+.endm
+
+.macro default_cleanup  /* expands to nothing; presumably passed as cleanup when nothing must be undone */
+.endm
diff --git a/src/gui/image/qimage.cpp b/src/gui/image/qimage.cpp
index 94307de..233c58d 100644
--- a/src/gui/image/qimage.cpp
+++ b/src/gui/image/qimage.cpp
@@ -2988,19 +2988,19 @@ static void convert_Indexed8_to_X32(QImageData *dest, const QImageData *src, Qt:
         colorTable.resize(256);
         for (int i=0; i<256; ++i)
             colorTable[i] = qRgb(i, i, i);
-
     }
 
     int w = src->width;
     const uchar *src_data = src->data;
     uchar *dest_data = dest->data;
+    int tableSize = colorTable.size() - 1;
     for (int y = 0; y < src->height; y++) {
         uint *p = (uint *)dest_data;
         const uchar *b = src_data;
         uint *end = p + w;
 
         while (p < end)
-            *p++ = colorTable.at(*b++);
+            *p++ = colorTable.at(qMin<int>(tableSize, *b++));
 
         src_data += src->bytes_per_line;
         dest_data += dest->bytes_per_line;
diff --git a/src/gui/image/qpixmap_raster.cpp b/src/gui/image/qpixmap_raster.cpp
index 0b1c18d..b183d0d 100644
--- a/src/gui/image/qpixmap_raster.cpp
+++ b/src/gui/image/qpixmap_raster.cpp
@@ -182,6 +182,7 @@ void QRasterPixmapData::fromImage(const QImage &sourceImage,
             QImage::Format opaqueFormat = QNativeImage::systemFormat();
             QImage::Format alphaFormat = QImage::Format_ARGB32_Premultiplied;
 
+#ifndef QT_HAVE_NEON
             switch (opaqueFormat) {
             case QImage::Format_RGB16:
                 alphaFormat = QImage::Format_ARGB8565_Premultiplied;
@@ -189,6 +190,7 @@ void QRasterPixmapData::fromImage(const QImage &sourceImage,
             default: // We don't care about the others...
                 break;
             }
+#endif
 
             if (!sourceImage.hasAlphaChannel()
                 || ((flags & Qt::NoOpaqueDetection) == 0
@@ -238,6 +240,7 @@ void QRasterPixmapData::fill(const QColor &color)
         if (alpha != 255) {
             if (!image.hasAlphaChannel()) {
                 QImage::Format toFormat;
+#ifndef QT_HAVE_NEON
                 if (image.format() == QImage::Format_RGB16)
                     toFormat = QImage::Format_ARGB8565_Premultiplied;
                 else if (image.format() == QImage::Format_RGB666)
@@ -247,6 +250,7 @@ void QRasterPixmapData::fill(const QColor &color)
                 else if (image.format() == QImage::Format_RGB444)
                     toFormat = QImage::Format_ARGB4444_Premultiplied;
                 else
+#endif
                     toFormat = QImage::Format_ARGB32_Premultiplied;
                 image = QImage(image.width(), image.height(), toFormat);
             }
diff --git a/src/gui/itemviews/qitemdelegate.cpp b/src/gui/itemviews/qitemdelegate.cpp
index cba213b..d5f6fd2 100644
--- a/src/gui/itemviews/qitemdelegate.cpp
+++ b/src/gui/itemviews/qitemdelegate.cpp
@@ -69,6 +69,7 @@
 #include <qdebug.h>
 #include <qlocale.h>
 #include <qdialog.h>
+#include <qmath.h>
 
 #include <limits.h>
 
@@ -1148,7 +1149,8 @@ QRect QItemDelegate::textRectangle(QPainter * /*painter*/, const QRect &rect,
     d->textLayout.setTextOption(d->textOption);
     d->textLayout.setFont(font);
     d->textLayout.setText(QItemDelegatePrivate::replaceNewLine(text));
-    const QSize size = d->doTextLayout(rect.width()).toSize();
+    QSizeF fpSize = d->doTextLayout(rect.width());
+    const QSize size = QSize(qCeil(fpSize.width()), qCeil(fpSize.height()));
     // ###: textRectangle should take style option as argument
     const int textMargin = QApplication::style()->pixelMetric(QStyle::PM_FocusFrameHMargin) + 1;
     return QRect(0, 0, size.width() + 2 * textMargin, size.height());
diff --git a/src/gui/painting/painting.pri b/src/gui/painting/painting.pri
index a6cc9c7..ed8ee76 100644
--- a/src/gui/painting/painting.pri
+++ b/src/gui/painting/painting.pri
@@ -91,6 +91,8 @@ SOURCES += \
 
         HEADERS +=                                      \
                 painting/qpaintengine_raster_p.h        \
+                painting/qdrawhelper_p.h                \
+                painting/qblendfunctions_p.h            \
                 painting/qrasterdefs_p.h                \
                 painting/qgrayraster_p.h
 
@@ -379,11 +381,23 @@ symbian {
         QMAKE_CXXFLAGS.ARMCC *= -O3
 }
 
-neon {
+neon:*-g++* {
     DEFINES += QT_HAVE_NEON
     HEADERS += painting/qdrawhelper_neon_p.h
     SOURCES += painting/qdrawhelper_neon.cpp
     QMAKE_CXXFLAGS *= -mfpu=neon
+    # NEON assembly sources; fed to the neon_compiler extra compiler defined below
+    DRAWHELPER_NEON_ASM_FILES = ../3rdparty/pixman/pixman-arm-neon-asm.S painting/qdrawhelper_neon_asm.S
+    # neon_compiler: run the C++ compiler driver with -c on each file and add the object to OBJECTS
+    neon_compiler.commands = $$QMAKE_CXX -c
+    neon_compiler.commands += $(CXXFLAGS) $(INCPATH) ${QMAKE_FILE_IN} -o ${QMAKE_FILE_OUT}
+    neon_compiler.dependency_type = TYPE_C
+    neon_compiler.output = ${QMAKE_VAR_OBJECTS_DIR}${QMAKE_FILE_BASE}$${first(QMAKE_EXT_OBJ)}
+    neon_compiler.input = DRAWHELPER_NEON_ASM_FILES
+    neon_compiler.variable_out = OBJECTS
+    neon_compiler.name = compiling[neon] ${QMAKE_FILE_IN}
+    silent:neon_compiler.commands = @echo compiling[neon] ${QMAKE_FILE_IN} && $$neon_compiler.commands
+    QMAKE_EXTRA_COMPILERS += neon_compiler
 }
 
 contains(QT_CONFIG, zlib) {
diff --git a/src/gui/painting/qblendfunctions.cpp b/src/gui/painting/qblendfunctions.cpp
index dc33896..24908ce 100644
--- a/src/gui/painting/qblendfunctions.cpp
+++ b/src/gui/painting/qblendfunctions.cpp
@@ -40,7 +40,7 @@
 ****************************************************************************/
 
 #include <qmath.h>
-#include "qdrawhelper_p.h"
+#include "qblendfunctions_p.h"
 
 QT_BEGIN_NAMESPACE
 
@@ -88,6 +88,8 @@ static inline quint16 convert_argb32_to_rgb16(quint32 spix)
 
 struct Blend_RGB16_on_RGB16_NoAlpha {
     inline void write(quint16 *dst, quint16 src) { *dst = src; }
+
+    inline void flush(void *) {}
 };
 
 struct Blend_RGB16_on_RGB16_ConstAlpha {
@@ -100,6 +102,8 @@ struct Blend_RGB16_on_RGB16_ConstAlpha {
         *dst = BYTE_MUL_RGB16(src, m_alpha) + BYTE_MUL_RGB16(*dst, m_ialpha);
     }
 
+    inline void flush(void *) {}
+
     quint32 m_alpha;
     quint32 m_ialpha;
 };
@@ -114,6 +118,8 @@ struct Blend_ARGB24_on_RGB16_SourceAlpha {
             *dst = s;
         }
     }
+
+    inline void flush(void *) {}
 };
 
 struct Blend_ARGB24_on_RGB16_SourceAndConstAlpha {
@@ -132,6 +138,8 @@ struct Blend_ARGB24_on_RGB16_SourceAndConstAlpha {
         }
     }
 
+    inline void flush(void *) {}
+
     quint32 m_alpha;
 };
 
@@ -145,6 +153,8 @@ struct Blend_ARGB32_on_RGB16_SourceAlpha {
             *dst = s;
         }
     }
+
+    inline void flush(void *) {}
 };
 
 struct Blend_ARGB32_on_RGB16_SourceAndConstAlpha {
@@ -163,99 +173,11 @@ struct Blend_ARGB32_on_RGB16_SourceAndConstAlpha {
         }
     }
 
+    inline void flush(void *) {}
+
     quint32 m_alpha;
 };
 
-template <typename SRC, typename T>
-void qt_scale_image_16bit(uchar *destPixels, int dbpl,
-                          const uchar *srcPixels, int sbpl,
-                          const QRectF &targetRect,
-                          const QRectF &srcRect,
-                          const QRect &clip,
-                          T blender)
-{
-    qreal sx = targetRect.width() / (qreal) srcRect.width();
-    qreal sy = targetRect.height() / (qreal) srcRect.height();
-
-    int ix = 0x00010000 / sx;
-    int iy = 0x00010000 / sy;
-
-//     qDebug() << "scale:" << endl
-//              << " - target" << targetRect << endl
-//              << " - source" << srcRect << endl
-//              << " - clip" << clip << endl
-//              << " - sx=" << sx << " sy=" << sy << " ix=" << ix << " iy=" << iy;
-
-    int cx1 = clip.x();
-    int cx2 = clip.x() + clip.width();
-    int cy1 = clip.top();
-    int cy2 = clip.y() + clip.height();
-
-    int tx1 = qRound(targetRect.left());
-    int tx2 = qRound(targetRect.right());
-    int ty1 = qRound(targetRect.top());
-    int ty2 = qRound(targetRect.bottom());
-
-    if (tx2 < tx1)
-        qSwap(tx2, tx1);
-
-    if (ty2 < ty1)
-        qSwap(ty2, ty1);
-
-    if (tx1 < cx1)
-        tx1 = cx1;
-
-    if (tx2 >= cx2)
-        tx2 = cx2;
-
-    if (tx1 >= tx2)
-        return;
-
-    if (ty1 < cy1)
-        ty1 = cy1;
-
-    if (ty2 >= cy2)
-       ty2 = cy2;
-
-    if (ty1 >= ty2)
-        return;
-
-    int h = ty2 - ty1;
-    int w = tx2 - tx1;
-
-
-    quint32 basex;
-    quint32 srcy;
-
-    if (sx < 0) {
-        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1;
-        basex = quint32(srcRect.right() * 65536) + dstx;
-    } else {
-        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1;
-        basex = quint32(srcRect.left() * 65536) + dstx;
-    }
-    if (sy < 0) {
-        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1;
-        srcy = quint32(srcRect.bottom() * 65536) + dsty;
-    } else {
-        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1;
-        srcy = quint32(srcRect.top() * 65536) + dsty;
-    }
-
-    quint16 *dst = ((quint16 *) (destPixels + ty1 * dbpl)) + tx1;
-
-    while (h--) {
-        const SRC *src = (const SRC *) (srcPixels + (srcy >> 16) * sbpl);
-        int srcx = basex;
-        for (int x=0; x<w; ++x) {
-            blender.write(&dst[x], src[srcx >> 16]);
-            srcx += ix;
-        }
-        dst = (quint16 *)(((uchar *) dst) + dbpl);
-        srcy += iy;
-    }
-}
-
 void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    const QRectF &targetRect,
@@ -447,10 +369,10 @@ static void qt_blend_argb24_on_rgb16(uchar *destPixels, int dbpl,
 
 
 
-static void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
-                                                 const uchar *srcPixels, int sbpl,
-                                                 int w, int h,
-                                                 int const_alpha)
+void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
+                                          const uchar *srcPixels, int sbpl,
+                                          int w, int h,
+                                          int const_alpha)
 {
     quint16 *dst = (quint16 *) destPixels;
     const quint32 *src = (const quint32 *) srcPixels;
@@ -643,6 +565,8 @@ void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
 
 struct Blend_RGB32_on_RGB32_NoAlpha {
     inline void write(quint32 *dst, quint32 src) { *dst = src; }
+
+    inline void flush(void *) {}
 };
 
 struct Blend_RGB32_on_RGB32_ConstAlpha {
@@ -655,6 +579,8 @@ struct Blend_RGB32_on_RGB32_ConstAlpha {
         *dst = BYTE_MUL(src, m_alpha) + BYTE_MUL(*dst, m_ialpha);
     }
 
+    inline void flush(void *) {}
+
     quint32 m_alpha;
     quint32 m_ialpha;
 };
@@ -663,6 +589,8 @@ struct Blend_ARGB32_on_ARGB32_SourceAlpha {
     inline void write(quint32 *dst, quint32 src) {
         *dst = src + BYTE_MUL(*dst, qAlpha(~src));
     }
+
+    inline void flush(void *) {}
 };
 
 struct Blend_ARGB32_on_ARGB32_SourceAndConstAlpha {
@@ -676,98 +604,12 @@ struct Blend_ARGB32_on_ARGB32_SourceAndConstAlpha {
         *dst = src + BYTE_MUL(*dst, qAlpha(~src));
     }
 
+    inline void flush(void *) {}
+
     quint32 m_alpha;
     quint32 m_ialpha;
 };
 
-template <typename T> void qt_scale_image_32bit(uchar *destPixels, int dbpl,
-                                                const uchar *srcPixels, int sbpl,
-                                                const QRectF &targetRect,
-                                                const QRectF &srcRect,
-                                                const QRect &clip,
-                                                T blender)
-{
-    qreal sx = targetRect.width() / (qreal) srcRect.width();
-    qreal sy = targetRect.height() / (qreal) srcRect.height();
-
-    int ix = 0x00010000 / sx;
-    int iy = 0x00010000 / sy;
-
-//     qDebug() << "scale:" << endl
-//              << " - target" << targetRect << endl
-//              << " - source" << srcRect << endl
-//              << " - clip" << clip << endl
-//              << " - sx=" << sx << " sy=" << sy << " ix=" << ix << " iy=" << iy;
-
-    int cx1 = clip.x();
-    int cx2 = clip.x() + clip.width();
-    int cy1 = clip.top();
-    int cy2 = clip.y() + clip.height();
-
-    int tx1 = qRound(targetRect.left());
-    int tx2 = qRound(targetRect.right());
-    int ty1 = qRound(targetRect.top());
-    int ty2 = qRound(targetRect.bottom());
-
-    if (tx2 < tx1)
-        qSwap(tx2, tx1);
-
-    if (ty2 < ty1)
-        qSwap(ty2, ty1);
-
-    if (tx1 < cx1)
-        tx1 = cx1;
-
-    if (tx2 >= cx2)
-        tx2 = cx2;
-
-    if (tx1 >= tx2)
-        return;
-
-    if (ty1 < cy1)
-        ty1 = cy1;
-
-    if (ty2 >= cy2)
-       ty2 = cy2;
-
-    if (ty1 >= ty2)
-        return;
-
-    int h = ty2 - ty1;
-    int w = tx2 - tx1;
-
-    quint32 basex;
-    quint32 srcy;
-
-    if (sx < 0) {
-        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1;
-        basex = quint32(srcRect.right() * 65536) + dstx;
-    } else {
-        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1;
-        basex = quint32(srcRect.left() * 65536) + dstx;
-    }
-    if (sy < 0) {
-        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1;
-        srcy = quint32(srcRect.bottom() * 65536) + dsty;
-    } else {
-        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1;
-        srcy = quint32(srcRect.top() * 65536) + dsty;
-    }
-
-    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;
-
-    while (h--) {
-        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
-        int srcx = basex;
-        for (int x=0; x<w; ++x) {
-            blender.write(&dst[x], src[srcx >> 16]);
-            srcx += ix;
-        }
-        dst = (quint32 *)(((uchar *) dst) + dbpl);
-        srcy += iy;
-    }
-}
-
 void qt_scale_image_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    const QRectF &targetRect,
@@ -818,244 +660,6 @@ void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
     }
 }
 
-struct QTransformImageVertex
-{
-    qreal x, y, u, v; // destination coordinates (x, y) and source coordinates (u, v)
-};
-
-template <class SrcT, class DestT, class Blender>
-void qt_transform_image_rasterize(DestT *destPixels, int dbpl,
-                                  const SrcT *srcPixels, int sbpl,
-                                  const QTransformImageVertex &topLeft, const QTransformImageVertex &bottomLeft,
-                                  const QTransformImageVertex &topRight, const QTransformImageVertex &bottomRight,
-                                  const QRect &sourceRect,
-                                  const QRect &clip,
-                                  qreal topY, qreal bottomY,
-                                  int dudx, int dvdx, int dudy, int dvdy, int u0, int v0,
-                                  Blender blender)
-{
-    int fromY = qMax(qRound(topY), clip.top());
-    int toY = qMin(qRound(bottomY), clip.top() + clip.height());
-    if (fromY >= toY)
-        return;
-
-    qreal leftSlope = (bottomLeft.x - topLeft.x) / (bottomLeft.y - topLeft.y);
-    qreal rightSlope = (bottomRight.x - topRight.x) / (bottomRight.y - topRight.y);
-    int dx_l = int(leftSlope * 0x10000);
-    int dx_r = int(rightSlope * 0x10000);
-    int x_l = int((topLeft.x + (0.5 + fromY - topLeft.y) * leftSlope + 0.5) * 0x10000);
-    int x_r = int((topRight.x + (0.5 + fromY - topRight.y) * rightSlope + 0.5) * 0x10000);
-
-    int fromX, toX, x1, x2, u, v, i, ii;
-    DestT *line;
-    for (int y = fromY; y < toY; ++y) {
-        line = reinterpret_cast<DestT *>(reinterpret_cast<uchar *>(destPixels) + y * dbpl);
-
-        fromX = qMax(x_l >> 16, clip.left());
-        toX = qMin(x_r >> 16, clip.left() + clip.width());
-        if (fromX < toX) {
-            // Because of rounding, we can get source coordinates outside the source image.
-            // Clamp these coordinates to the source rect to avoid segmentation fault and
-            // garbage on the screen.
-
-            // Find the first pixel on the current scan line where the source coordinates are within the source rect.
-            x1 = fromX;
-            u = x1 * dudx + y * dudy + u0;
-            v = x1 * dvdx + y * dvdy + v0;
-            for (; x1 < toX; ++x1) {
-                int uu = u >> 16;
-                int vv = v >> 16;
-                if (uu >= sourceRect.left() && uu < sourceRect.left() + sourceRect.width()
-                    && vv >= sourceRect.top() && vv < sourceRect.top() + sourceRect.height()) {
-                    break;
-                }
-                u += dudx;
-                v += dvdx;
-            }
-
-            // Find the last pixel on the current scan line where the source coordinates are within the source rect.
-            x2 = toX;
-            u = (x2 - 1) * dudx + y * dudy + u0;
-            v = (x2 - 1) * dvdx + y * dvdy + v0;
-            for (; x2 > x1; --x2) {
-                int uu = u >> 16;
-                int vv = v >> 16;
-                if (uu >= sourceRect.left() && uu < sourceRect.left() + sourceRect.width()
-                    && vv >= sourceRect.top() && vv < sourceRect.top() + sourceRect.height()) {
-                    break;
-                }
-                u -= dudx;
-                v -= dvdx;
-            }
-
-            // Set up values at the beginning of the scan line.
-            u = fromX * dudx + y * dudy + u0;
-            v = fromX * dvdx + y * dvdy + v0;
-            line += fromX;
-
-            // Beginning of the scan line, with per-pixel checks.
-            i = x1 - fromX;
-            while (i) {
-                int uu = qBound(sourceRect.left(), u >> 16, sourceRect.left() + sourceRect.width() - 1);
-                int vv = qBound(sourceRect.top(), v >> 16, sourceRect.top() + sourceRect.height() - 1);
-                blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + vv * sbpl)[uu]);
-                u += dudx;
-                v += dvdx;
-                ++line;
-                --i;
-            }
-
-            // Middle of the scan line, without checks.
-            // Manual loop unrolling.
-            i = x2 - x1;
-            ii = i >> 3;
-            while (ii) {
-                blender.write(&line[0], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
-                blender.write(&line[1], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
-                blender.write(&line[2], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
-                blender.write(&line[3], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
-                blender.write(&line[4], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
-                blender.write(&line[5], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
-                blender.write(&line[6], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
-                blender.write(&line[7], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
-                line += 8;
-                --ii;
-            }
-            switch (i & 7) {
-                case 7: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
-                case 6: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
-                case 5: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
-                case 4: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
-                case 3: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
-                case 2: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
-                case 1: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
-            }
-
-            // End of the scan line, with per-pixel checks.
-            i = toX - x2;
-            while (i) {
-                int uu = qBound(sourceRect.left(), u >> 16, sourceRect.left() + sourceRect.width() - 1);
-                int vv = qBound(sourceRect.top(), v >> 16, sourceRect.top() + sourceRect.height() - 1);
-                blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + vv * sbpl)[uu]);
-                u += dudx;
-                v += dvdx;
-                ++line;
-                --i;
-            }
-        }
-        x_l += dx_l;
-        x_r += dx_r;
-    }
-}
-
-template <class SrcT, class DestT, class Blender>
-void qt_transform_image(DestT *destPixels, int dbpl,
-                        const SrcT *srcPixels, int sbpl,
-                        const QRectF &targetRect,
-                        const QRectF &sourceRect,
-                        const QRect &clip,
-                        const QTransform &targetRectTransform,
-                        Blender blender)
-{
-    enum Corner
-    {
-        TopLeft,
-        TopRight,
-        BottomRight,
-        BottomLeft
-    };
-
-    // map source rectangle to destination.
-    QTransformImageVertex v[4];
-    v[TopLeft].u = v[BottomLeft].u = sourceRect.left();
-    v[TopLeft].v = v[TopRight].v = sourceRect.top();
-    v[TopRight].u = v[BottomRight].u = sourceRect.right();
-    v[BottomLeft].v = v[BottomRight].v = sourceRect.bottom();
-    targetRectTransform.map(targetRect.left(), targetRect.top(), &v[TopLeft].x, &v[TopLeft].y);
-    targetRectTransform.map(targetRect.right(), targetRect.top(), &v[TopRight].x, &v[TopRight].y);
-    targetRectTransform.map(targetRect.left(), targetRect.bottom(), &v[BottomLeft].x, &v[BottomLeft].y);
-    targetRectTransform.map(targetRect.right(), targetRect.bottom(), &v[BottomRight].x, &v[BottomRight].y);
-
-    // find topmost vertex.
-    int topmost = 0;
-    for (int i = 1; i < 4; ++i) {
-        if (v[i].y < v[topmost].y)
-            topmost = i;
-    }
-    // rearrange array such that topmost vertex is at index 0.
-    switch (topmost) {
-    case 1:
-        {
-            QTransformImageVertex t = v[0];
-            for (int i = 0; i < 3; ++i)
-                v[i] = v[i+1];
-            v[3] = t;
-        }
-        break;
-    case 2:
-        qSwap(v[0], v[2]);
-        qSwap(v[1], v[3]);
-        break;
-    case 3:
-        {
-            QTransformImageVertex t = v[3];
-            for (int i = 3; i > 0; --i)
-                v[i] = v[i-1];
-            v[0] = t;
-        }
-        break;
-    }
-
-    // if necessary, swap vertex 1 and 3 such that 1 is to the left of 3.
-    qreal dx1 = v[1].x - v[0].x;
-    qreal dy1 = v[1].y - v[0].y;
-    qreal dx2 = v[3].x - v[0].x;
-    qreal dy2 = v[3].y - v[0].y;
-    if (dx1 * dy2 - dx2 * dy1 > 0)
-        qSwap(v[1], v[3]);
-
-    QTransformImageVertex u = {v[1].x - v[0].x, v[1].y - v[0].y, v[1].u - v[0].u, v[1].v - v[0].v};
-    QTransformImageVertex w = {v[2].x - v[0].x, v[2].y - v[0].y, v[2].u - v[0].u, v[2].v - v[0].v};
-
-    qreal det = u.x * w.y - u.y * w.x;
-    if (det == 0)
-        return;
-
-    qreal invDet = 1.0 / det;
-    qreal m11, m12, m21, m22, mdx, mdy;
-
-    m11 = (u.u * w.y - u.y * w.u) * invDet;
-    m12 = (u.x * w.u - u.u * w.x) * invDet;
-    m21 = (u.v * w.y - u.y * w.v) * invDet;
-    m22 = (u.x * w.v - u.v * w.x) * invDet;
-    mdx = v[0].u - m11 * v[0].x - m12 * v[0].y;
-    mdy = v[0].v - m21 * v[0].x - m22 * v[0].y;
-
-    int dudx = int(m11 * 0x10000);
-    int dvdx = int(m21 * 0x10000);
-    int dudy = int(m12 * 0x10000);
-    int dvdy = int(m22 * 0x10000);
-    int u0 = qCeil((0.5 * m11 + 0.5 * m12 + mdx) * 0x10000) - 1;
-    int v0 = qCeil((0.5 * m21 + 0.5 * m22 + mdy) * 0x10000) - 1;
-
-    int x1 = qFloor(sourceRect.left());
-    int y1 = qFloor(sourceRect.top());
-    int x2 = qCeil(sourceRect.right());
-    int y2 = qCeil(sourceRect.bottom());
-    QRect sourceRectI(x1, y1, x2 - x1, y2 - y1);
-
-    // rasterize trapezoids.
-    if (v[1].y < v[3].y) {
-        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[0], v[3], sourceRectI, clip, v[0].y, v[1].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
-        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[0], v[3], sourceRectI, clip, v[1].y, v[3].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
-        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[3], v[2], sourceRectI, clip, v[3].y, v[2].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
-    } else {
-        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[0], v[3], sourceRectI, clip, v[0].y, v[3].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
-        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[3], v[2], sourceRectI, clip, v[3].y, v[1].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
-        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[3], v[2], sourceRectI, clip, v[1].y, v[2].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
-    }
-}
-
 void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
                                        const uchar *srcPixels, int sbpl,
                                        const QRectF &targetRect,
diff --git a/src/gui/painting/qblendfunctions_p.h b/src/gui/painting/qblendfunctions_p.h
new file mode 100644
index 0000000..ad754b0
--- /dev/null
+++ b/src/gui/painting/qblendfunctions_p.h
@@ -0,0 +1,497 @@
+/****************************************************************************
+**
+** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#ifndef QBLENDFUNCTIONS_P_H
+#define QBLENDFUNCTIONS_P_H
+
+#include <qmath.h>
+#include "qdrawhelper_p.h"
+
+QT_BEGIN_NAMESPACE
+
+//
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the Qt API.  It exists purely as an
+// implementation detail.  This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+template <typename SRC, typename T> // SRC: source pixel type; T: blender providing write() and flush()
+void qt_scale_image_16bit(uchar *destPixels, int dbpl,
+                          const uchar *srcPixels, int sbpl,
+                          const QRectF &targetRect,
+                          const QRectF &srcRect,
+                          const QRect &clip,
+                          T blender)
+{ // Scales srcRect onto targetRect (limited to clip) in a 16-bit destination, one source sample per pixel.
+    qreal sx = targetRect.width() / (qreal) srcRect.width();
+    qreal sy = targetRect.height() / (qreal) srcRect.height();
+    // Source step per destination pixel (ix) and per destination row (iy), in 16.16 fixed point.
+    int ix = 0x00010000 / sx;
+    int iy = 0x00010000 / sy;
+
+//     qDebug() << "scale:" << endl
+//              << " - target" << targetRect << endl
+//              << " - source" << srcRect << endl
+//              << " - clip" << clip << endl
+//              << " - sx=" << sx << " sy=" << sy << " ix=" << ix << " iy=" << iy;
+    // Clip bounds; cx2 and cy2 are one past the last clipped column/row.
+    int cx1 = clip.x();
+    int cx2 = clip.x() + clip.width();
+    int cy1 = clip.top();
+    int cy2 = clip.y() + clip.height();
+    // Target edges rounded to whole destination pixels.
+    int tx1 = qRound(targetRect.left());
+    int tx2 = qRound(targetRect.right());
+    int ty1 = qRound(targetRect.top());
+    int ty2 = qRound(targetRect.bottom());
+    // The edges may arrive in reverse order (right < left or bottom < top); put them in order.
+    if (tx2 < tx1)
+        qSwap(tx2, tx1);
+
+    if (ty2 < ty1)
+        qSwap(ty2, ty1);
+    // Intersect the target with the clip and return early when nothing is left to draw.
+    if (tx1 < cx1)
+        tx1 = cx1;
+
+    if (tx2 >= cx2)
+        tx2 = cx2;
+
+    if (tx1 >= tx2)
+        return;
+
+    if (ty1 < cy1)
+        ty1 = cy1;
+
+    if (ty2 >= cy2)
+       ty2 = cy2;
+
+    if (ty1 >= ty2)
+        return;
+
+    int h = ty2 - ty1;
+    int w = tx2 - tx1;
+
+    // 16.16 fixed-point source position for the first destination pixel (basex) and first row (srcy).
+    quint32 basex;
+    quint32 srcy;
+    // Offsets are measured from the centre (+0.5) of the first pixel; for a negative scale from the far edge.
+    if (sx < 0) {
+        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1;
+        basex = quint32(srcRect.right() * 65536) + dstx;
+    } else {
+        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1;
+        basex = quint32(srcRect.left() * 65536) + dstx;
+    }
+    if (sy < 0) {
+        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1;
+        srcy = quint32(srcRect.bottom() * 65536) + dsty;
+    } else {
+        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1;
+        srcy = quint32(srcRect.top() * 65536) + dsty;
+    }
+    // First destination pixel of the clipped target; dbpl and sbpl are strides in bytes.
+    quint16 *dst = ((quint16 *) (destPixels + ty1 * dbpl)) + tx1;
+
+    while (h--) {
+        const SRC *src = (const SRC *) (srcPixels + (srcy >> 16) * sbpl);
+        int srcx = basex;
+        int x = 0;
+        for (; x<w-7; x+=8) { // eight pixels per iteration
+            blender.write(&dst[x], src[srcx >> 16]); srcx += ix;
+            blender.write(&dst[x+1], src[srcx >> 16]); srcx += ix;
+            blender.write(&dst[x+2], src[srcx >> 16]); srcx += ix;
+            blender.write(&dst[x+3], src[srcx >> 16]); srcx += ix;
+            blender.write(&dst[x+4], src[srcx >> 16]); srcx += ix;
+            blender.write(&dst[x+5], src[srcx >> 16]); srcx += ix;
+            blender.write(&dst[x+6], src[srcx >> 16]); srcx += ix;
+            blender.write(&dst[x+7], src[srcx >> 16]); srcx += ix;
+        }
+        for (; x<w; ++x) { // remaining pixels of the row
+            blender.write(&dst[x], src[srcx >> 16]);
+            srcx += ix;
+        }
+        blender.flush(&dst[x]); // x == w here, so this points one past the last pixel written on this row
+        dst = (quint16 *)(((uchar *) dst) + dbpl); // next destination row
+        srcy += iy;
+    }
+}
+
+template <typename T> void qt_scale_image_32bit(uchar *destPixels, int dbpl, // T: blender providing write() and flush()
+                                                const uchar *srcPixels, int sbpl,
+                                                const QRectF &targetRect,
+                                                const QRectF &srcRect,
+                                                const QRect &clip,
+                                                T blender)
+{ // Scales srcRect onto targetRect (limited to clip) in a 32-bit destination, one source sample per pixel.
+    qreal sx = targetRect.width() / (qreal) srcRect.width();
+    qreal sy = targetRect.height() / (qreal) srcRect.height();
+    // Source step per destination pixel (ix) and per destination row (iy), in 16.16 fixed point.
+    int ix = 0x00010000 / sx;
+    int iy = 0x00010000 / sy;
+
+//     qDebug() << "scale:" << endl
+//              << " - target" << targetRect << endl
+//              << " - source" << srcRect << endl
+//              << " - clip" << clip << endl
+//              << " - sx=" << sx << " sy=" << sy << " ix=" << ix << " iy=" << iy;
+    // Clip bounds; cx2 and cy2 are one past the last clipped column/row.
+    int cx1 = clip.x();
+    int cx2 = clip.x() + clip.width();
+    int cy1 = clip.top();
+    int cy2 = clip.y() + clip.height();
+    // Target edges rounded to whole destination pixels.
+    int tx1 = qRound(targetRect.left());
+    int tx2 = qRound(targetRect.right());
+    int ty1 = qRound(targetRect.top());
+    int ty2 = qRound(targetRect.bottom());
+    // The edges may arrive in reverse order (right < left or bottom < top); put them in order.
+    if (tx2 < tx1)
+        qSwap(tx2, tx1);
+
+    if (ty2 < ty1)
+        qSwap(ty2, ty1);
+    // Intersect the target with the clip and return early when nothing is left to draw.
+    if (tx1 < cx1)
+        tx1 = cx1;
+
+    if (tx2 >= cx2)
+        tx2 = cx2;
+
+    if (tx1 >= tx2)
+        return;
+
+    if (ty1 < cy1)
+        ty1 = cy1;
+
+    if (ty2 >= cy2)
+       ty2 = cy2;
+
+    if (ty1 >= ty2)
+        return;
+
+    int h = ty2 - ty1;
+    int w = tx2 - tx1;
+    // 16.16 fixed-point source position for the first destination pixel (basex) and first row (srcy).
+    quint32 basex;
+    quint32 srcy;
+    // Offsets are measured from the centre (+0.5) of the first pixel; for a negative scale from the far edge.
+    if (sx < 0) {
+        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1;
+        basex = quint32(srcRect.right() * 65536) + dstx;
+    } else {
+        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1;
+        basex = quint32(srcRect.left() * 65536) + dstx;
+    }
+    if (sy < 0) {
+        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1;
+        srcy = quint32(srcRect.bottom() * 65536) + dsty;
+    } else {
+        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1;
+        srcy = quint32(srcRect.top() * 65536) + dsty;
+    }
+    // First destination pixel of the clipped target; dbpl and sbpl are strides in bytes.
+    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;
+
+    while (h--) {
+        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
+        int srcx = basex;
+        int x = 0;
+        for (; x<w; ++x) {
+            blender.write(&dst[x], src[srcx >> 16]);
+            srcx += ix;
+        }
+        blender.flush(&dst[x]); // x == w here, so this points one past the last pixel written on this row
+        dst = (quint32 *)(((uchar *) dst) + dbpl); // next destination row
+        srcy += iy;
+    }
+}
+
+struct QTransformImageVertex // one corner of the quad set up by qt_transform_image()
+{
+    qreal x, y, u, v; // destination coordinates (x, y) and source coordinates (u, v)
+};
+
+template <class SrcT, class DestT, class Blender> // Blender must provide write() and flush()
+void qt_transform_image_rasterize(DestT *destPixels, int dbpl,
+                                  const SrcT *srcPixels, int sbpl,
+                                  const QTransformImageVertex &topLeft, const QTransformImageVertex &bottomLeft,
+                                  const QTransformImageVertex &topRight, const QTransformImageVertex &bottomRight,
+                                  const QRect &sourceRect,
+                                  const QRect &clip,
+                                  qreal topY, qreal bottomY,
+                                  int dudx, int dvdx, int dudy, int dvdy, int u0, int v0,
+                                  Blender blender)
+{ // Fills the destination rows from topY to bottomY between the edge topLeft-bottomLeft and the edge topRight-bottomRight.
+    int fromY = qMax(qRound(topY), clip.top());
+    int toY = qMin(qRound(bottomY), clip.top() + clip.height());
+    if (fromY >= toY)
+        return;
+    // Edge slopes (dx per row) and edge x at the vertical centre of row fromY, all in 16.16 fixed point.
+    qreal leftSlope = (bottomLeft.x - topLeft.x) / (bottomLeft.y - topLeft.y);
+    qreal rightSlope = (bottomRight.x - topRight.x) / (bottomRight.y - topRight.y);
+    int dx_l = int(leftSlope * 0x10000);
+    int dx_r = int(rightSlope * 0x10000);
+    int x_l = int((topLeft.x + (0.5 + fromY - topLeft.y) * leftSlope + 0.5) * 0x10000);
+    int x_r = int((topRight.x + (0.5 + fromY - topRight.y) * rightSlope + 0.5) * 0x10000);
+
+    int fromX, toX, x1, x2, u, v, i, ii;
+    DestT *line;
+    for (int y = fromY; y < toY; ++y) {
+        line = reinterpret_cast<DestT *>(reinterpret_cast<uchar *>(destPixels) + y * dbpl); // dbpl is in bytes
+        // Horizontal span [fromX, toX) of this row, limited to the clip.
+        fromX = qMax(x_l >> 16, clip.left());
+        toX = qMin(x_r >> 16, clip.left() + clip.width());
+        if (fromX < toX) {
+            // Because of rounding, we can get source coordinates outside the source image.
+            // Clamp these coordinates to the source rect to avoid segmentation fault and
+            // garbage on the screen.
+
+            // Find the first pixel on the current scan line where the source coordinates are within the source rect.
+            x1 = fromX;
+            u = x1 * dudx + y * dudy + u0; // 16.16 source coordinates for destination pixel (x1, y)
+            v = x1 * dvdx + y * dvdy + v0;
+            for (; x1 < toX; ++x1) {
+                int uu = u >> 16;
+                int vv = v >> 16;
+                if (uu >= sourceRect.left() && uu < sourceRect.left() + sourceRect.width()
+                    && vv >= sourceRect.top() && vv < sourceRect.top() + sourceRect.height()) {
+                    break;
+                }
+                u += dudx;
+                v += dvdx;
+            }
+
+            // Find the last pixel on the current scan line where the source coordinates are within the source rect.
+            x2 = toX;
+            u = (x2 - 1) * dudx + y * dudy + u0;
+            v = (x2 - 1) * dvdx + y * dvdy + v0;
+            for (; x2 > x1; --x2) {
+                int uu = u >> 16;
+                int vv = v >> 16;
+                if (uu >= sourceRect.left() && uu < sourceRect.left() + sourceRect.width()
+                    && vv >= sourceRect.top() && vv < sourceRect.top() + sourceRect.height()) {
+                    break;
+                }
+                u -= dudx;
+                v -= dvdx;
+            }
+
+            // Set up values at the beginning of the scan line.
+            u = fromX * dudx + y * dudy + u0;
+            v = fromX * dvdx + y * dvdy + v0;
+            line += fromX;
+
+            // Beginning of the scan line, with per-pixel checks.
+            i = x1 - fromX;
+            while (i) {
+                int uu = qBound(sourceRect.left(), u >> 16, sourceRect.left() + sourceRect.width() - 1);
+                int vv = qBound(sourceRect.top(), v >> 16, sourceRect.top() + sourceRect.height() - 1);
+                blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + vv * sbpl)[uu]);
+                u += dudx;
+                v += dvdx;
+                ++line;
+                --i;
+            }
+
+            // Middle of the scan line, without checks.
+            // Manual loop unrolling.
+            i = x2 - x1;
+            ii = i >> 3; // number of complete groups of eight pixels
+            while (ii) {
+                blender.write(&line[0], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
+                blender.write(&line[1], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
+                blender.write(&line[2], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
+                blender.write(&line[3], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
+                blender.write(&line[4], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
+                blender.write(&line[5], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
+                blender.write(&line[6], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
+                blender.write(&line[7], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx;
+
+                line += 8;
+
+                --ii;
+            }
+            switch (i & 7) { // the remaining 0..7 pixels; every case deliberately falls through to the next one
+                case 7: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
+                case 6: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
+                case 5: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
+                case 4: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
+                case 3: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
+                case 2: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
+                case 1: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line;
+            }
+
+            // End of the scan line, with per-pixel checks.
+            i = toX - x2;
+            while (i) {
+                int uu = qBound(sourceRect.left(), u >> 16, sourceRect.left() + sourceRect.width() - 1);
+                int vv = qBound(sourceRect.top(), v >> 16, sourceRect.top() + sourceRect.height() - 1);
+                blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + vv * sbpl)[uu]);
+                u += dudx;
+                v += dvdx;
+                ++line;
+                --i;
+            }
+
+            blender.flush(line); // line now points one past the last pixel written on this row
+        }
+        x_l += dx_l; // step both edges down to the next row
+        x_r += dx_r;
+    }
+}
+
+template <class SrcT, class DestT, class Blender> // Blender is handed on to qt_transform_image_rasterize()
+void qt_transform_image(DestT *destPixels, int dbpl,
+                        const SrcT *srcPixels, int sbpl,
+                        const QRectF &targetRect,
+                        const QRectF &sourceRect,
+                        const QRect &clip,
+                        const QTransform &targetRectTransform,
+                        Blender blender)
+{ // Draws sourceRect onto the quad obtained by mapping targetRect through targetRectTransform.
+    enum Corner
+    {
+        TopLeft,
+        TopRight,
+        BottomRight,
+        BottomLeft
+    };
+
+    // pair each source corner (u, v) with the transformed target corner (x, y).
+    QTransformImageVertex v[4];
+    v[TopLeft].u = v[BottomLeft].u = sourceRect.left();
+    v[TopLeft].v = v[TopRight].v = sourceRect.top();
+    v[TopRight].u = v[BottomRight].u = sourceRect.right();
+    v[BottomLeft].v = v[BottomRight].v = sourceRect.bottom();
+    targetRectTransform.map(targetRect.left(), targetRect.top(), &v[TopLeft].x, &v[TopLeft].y);
+    targetRectTransform.map(targetRect.right(), targetRect.top(), &v[TopRight].x, &v[TopRight].y);
+    targetRectTransform.map(targetRect.left(), targetRect.bottom(), &v[BottomLeft].x, &v[BottomLeft].y);
+    targetRectTransform.map(targetRect.right(), targetRect.bottom(), &v[BottomRight].x, &v[BottomRight].y);
+
+    // find topmost vertex.
+    int topmost = 0;
+    for (int i = 1; i < 4; ++i) {
+        if (v[i].y < v[topmost].y)
+            topmost = i;
+    }
+    // rearrange array such that topmost vertex is at index 0.
+    switch (topmost) {
+    case 1:
+        {
+            QTransformImageVertex t = v[0];
+            for (int i = 0; i < 3; ++i)
+                v[i] = v[i+1];
+            v[3] = t;
+        }
+        break;
+    case 2:
+        qSwap(v[0], v[2]);
+        qSwap(v[1], v[3]);
+        break;
+    case 3:
+        {
+            QTransformImageVertex t = v[3];
+            for (int i = 3; i > 0; --i)
+                v[i] = v[i-1];
+            v[0] = t;
+        }
+        break;
+    }
+
+    // if necessary, swap vertex 1 and 3 such that 1 is to the left of 3.
+    qreal dx1 = v[1].x - v[0].x;
+    qreal dy1 = v[1].y - v[0].y;
+    qreal dx2 = v[3].x - v[0].x;
+    qreal dy2 = v[3].y - v[0].y;
+    if (dx1 * dy2 - dx2 * dy1 > 0)
+        qSwap(v[1], v[3]);
+    // differences from vertex 0 to vertices 1 and 2, in destination (x, y) and source (u, v) coordinates.
+    QTransformImageVertex u = {v[1].x - v[0].x, v[1].y - v[0].y, v[1].u - v[0].u, v[1].v - v[0].v};
+    QTransformImageVertex w = {v[2].x - v[0].x, v[2].y - v[0].y, v[2].u - v[0].u, v[2].v - v[0].v};
+    // a zero determinant cannot be inverted below, so nothing is drawn in that case.
+    qreal det = u.x * w.y - u.y * w.x;
+    if (det == 0)
+        return;
+    // map from destination to source: u = m11 * x + m12 * y + mdx, v = m21 * x + m22 * y + mdy.
+    qreal invDet = 1.0 / det;
+    qreal m11, m12, m21, m22, mdx, mdy;
+
+    m11 = (u.u * w.y - u.y * w.u) * invDet;
+    m12 = (u.x * w.u - u.u * w.x) * invDet;
+    m21 = (u.v * w.y - u.y * w.v) * invDet;
+    m22 = (u.x * w.v - u.v * w.x) * invDet;
+    mdx = v[0].u - m11 * v[0].x - m12 * v[0].y;
+    mdy = v[0].v - m21 * v[0].x - m22 * v[0].y;
+    // the same map in 16.16 fixed point; u0 and v0 include a half-pixel offset in x and y.
+    int dudx = int(m11 * 0x10000);
+    int dvdx = int(m21 * 0x10000);
+    int dudy = int(m12 * 0x10000);
+    int dvdy = int(m22 * 0x10000);
+    int u0 = qCeil((0.5 * m11 + 0.5 * m12 + mdx) * 0x10000) - 1;
+    int v0 = qCeil((0.5 * m21 + 0.5 * m22 + mdy) * 0x10000) - 1;
+    // integer bounding box of the source rectangle; the rasterizer clamps source coordinates to it.
+    int x1 = qFloor(sourceRect.left());
+    int y1 = qFloor(sourceRect.top());
+    int x2 = qCeil(sourceRect.right());
+    int y2 = qCeil(sourceRect.bottom());
+    QRect sourceRectI(x1, y1, x2 - x1, y2 - y1);
+
+    // rasterize trapezoids: the quad is cut into three bands at the y of vertex 1 and of vertex 3.
+    if (v[1].y < v[3].y) {
+        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[0], v[3], sourceRectI, clip, v[0].y, v[1].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
+        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[0], v[3], sourceRectI, clip, v[1].y, v[3].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
+        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[3], v[2], sourceRectI, clip, v[3].y, v[2].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
+    } else {
+        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[0], v[3], sourceRectI, clip, v[0].y, v[3].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
+        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[3], v[2], sourceRectI, clip, v[3].y, v[1].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
+        qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[3], v[2], sourceRectI, clip, v[1].y, v[2].y, dudx, dvdx, dudy, dvdy, u0, v0, blender);
+    }
+}
+
+QT_END_NAMESPACE
+
+#endif // QBLENDFUNCTIONS_P_H
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 1f75ec7..917b910 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -175,7 +175,7 @@ Q_STATIC_TEMPLATE_FUNCTION uint * QT_FASTCALL destFetch(uint *buffer, QRasterBuf
 
 # define SPANFUNC_POINTER_DESTFETCH(Arg) destFetch<Arg>
 
-static const DestFetchProc destFetchProc[QImage::NImageFormats] =
+static DestFetchProc destFetchProc[QImage::NImageFormats] =
 {
     0, // Format_Invalid
     destFetchMono, // Format_Mono,
@@ -323,7 +323,7 @@ Q_STATIC_TEMPLATE_FUNCTION void QT_FASTCALL destStore(QRasterBuffer *rasterBuffe
 
 # define SPANFUNC_POINTER_DESTSTORE(DEST) destStore<DEST>
 
-static const DestStoreProc destStoreProc[QImage::NImageFormats] =
+static DestStoreProc destStoreProc[QImage::NImageFormats] =
 {
     0, // Format_Invalid
     destStoreMono, // Format_Mono,
@@ -2827,7 +2827,7 @@ static void QT_FASTCALL rasterop_SourceAndNotDestination(uint *dest,
     }
 }
 
-static const CompositionFunctionSolid functionForModeSolid_C[] = {
+static CompositionFunctionSolid functionForModeSolid_C[] = {
         comp_func_solid_SourceOver,
         comp_func_solid_DestinationOver,
         comp_func_solid_Clear,
@@ -2865,7 +2865,7 @@ static const CompositionFunctionSolid functionForModeSolid_C[] = {
 
 static const CompositionFunctionSolid *functionForModeSolid = functionForModeSolid_C;
 
-static const CompositionFunction functionForMode_C[] = {
+static CompositionFunction functionForMode_C[] = {
         comp_func_SourceOver,
         comp_func_DestinationOver,
         comp_func_Clear,
@@ -7961,6 +7961,20 @@ void qInitDrawhelperAsm()
             qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon;
             qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_neon;
             qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_neon;
+            qBlendFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_rgb16_neon;
+            qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB16] = qt_blend_rgb16_on_argb32_neon;
+
+            qScaleFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_rgb16_neon;
+            qScaleFunctions[QImage::Format_RGB16][QImage::Format_RGB16] = qt_scale_image_rgb16_on_rgb16_neon;
+
+            qTransformFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_transform_image_argb32_on_rgb16_neon;
+            qTransformFunctions[QImage::Format_RGB16][QImage::Format_RGB16] = qt_transform_image_rgb16_on_rgb16_neon;
+
+            qDrawHelper[QImage::Format_RGB16].alphamapBlit = qt_alphamapblit_quint16_neon;
+
+            functionForMode_C[QPainter::CompositionMode_SourceOver] = qt_blend_argb32_on_argb32_scanline_neon;
+            destFetchProc[QImage::Format_RGB16] = qt_destFetchRGB16_neon;
+            destStoreProc[QImage::Format_RGB16] = qt_destStoreRGB16_neon;
         }
 #endif
 
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index 77c5202..ee5f24a 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -40,10 +40,13 @@
 ****************************************************************************/
 
 #include <private/qdrawhelper_p.h>
+#include <private/qblendfunctions_p.h>
+#include <private/qmath_p.h>
 
 #ifdef QT_HAVE_NEON
 
 #include <private/qdrawhelper_neon_p.h>
+#include <private/qpaintengine_raster_p.h>
 #include <arm_neon.h>
 
 QT_BEGIN_NAMESPACE
@@ -87,60 +90,142 @@ static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, u
     return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
 }
 
-void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
-                                    const uchar *srcPixels, int sbpl,
-                                    int w, int h,
-                                    int const_alpha)
+extern "C" void // called from qt_blend_argb32_on_rgb16_neon with dbpl / 2 and sbpl / 4 (strides in pixels)
+pixman_composite_over_8888_0565_asm_neon (int32_t   w,
+                                          int32_t   h,
+                                          uint16_t *dst,
+                                          int32_t   dst_stride,
+                                          uint32_t *src,
+                                          int32_t   src_stride);
+// Called from qt_blend_argb32_on_argb32_neon with dbpl / 4 and sbpl / 4 (strides in 32-bit pixels).
+extern "C" void
+pixman_composite_over_8888_8888_asm_neon (int32_t   w,
+                                          int32_t   h,
+                                          uint32_t *dst,
+                                          int32_t   dst_stride,
+                                          uint32_t *src,
+                                          int32_t   src_stride);
+// Called from qt_blend_rgb16_on_argb32_neon with dbpl / 4 and sbpl / 2 (strides in pixels).
+extern "C" void
+pixman_composite_src_0565_8888_asm_neon (int32_t   w,
+                                         int32_t   h,
+                                         uint32_t *dst,
+                                         int32_t   dst_stride,
+                                         uint16_t *src,
+                                         int32_t   src_stride);
+// Called from qt_alphamapblit_quint16_neon: src is the colour, mask the alpha map, and 0 is passed for unused.
+extern "C" void
+pixman_composite_over_n_8_0565_asm_neon (int32_t    w,
+                                         int32_t    h,
+                                         uint16_t  *dst,
+                                         int32_t    dst_stride,
+                                         uint32_t   src,
+                                         int32_t    unused,
+                                         uint8_t   *mask,
+                                         int32_t    mask_stride);
+// NOTE(review): dst is declared const, yet qt_blend_argb32_on_argb32_scanline_neon passes its destination here - confirm against the assembly.
+extern "C" void
+pixman_composite_scanline_over_asm_neon (int32_t         w,
+                                         const uint32_t *dst,
+                                         const uint32_t *src);
+
+// qblendfunctions.cpp
+void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
+                                          const uchar *srcPixels, int sbpl,
+                                          int w, int h,
+                                          int const_alpha);
+
+void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl, // blends 16-bit source pixels onto a 32-bit destination
+                                   const uchar *srcPixels, int sbpl,
+                                   int w, int h,
+                                   int const_alpha) // 256 selects the pixman path at the end of the function
 {
-    const uint *src = (const uint *) srcPixels;
-    uint *dst = (uint *) destPixels;
-    uint16x8_t half = vdupq_n_u16(0x80);
-    uint16x8_t full = vdupq_n_u16(0xff);
-    if (const_alpha == 256) {
-        for (int y = 0; y < h; ++y) {
-            int x = 0;
-            for (; x < w-3; x += 4) {
-                uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
-                if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) {
-                    // all opaque
-                    vst1q_u32((uint32_t *)&dst[x], src32);
-                } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
-                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
+    dbpl /= 4; // from here on: destination stride in 32-bit pixels instead of bytes
+    sbpl /= 2; // from here on: source stride in 16-bit pixels instead of bytes
 
-                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
-                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
+    quint32 *dst = (quint32 *) destPixels;
+    quint16 *src = (quint16 *) srcPixels;
 
-                    const uint8x8_t src8_low = vget_low_u8(src8);
-                    const uint8x8_t dst8_low = vget_low_u8(dst8);
+    if (const_alpha != 256) { // any other alpha: convert and interpolate one pixel at a time
+        quint8 a = (255 * const_alpha) >> 8; // weight of the converted source pixel
+        quint8 ia = 255 - a; // weight of the existing destination pixel
+
+        while (h--) {
+            for (int x=0; x<w; ++x)
+                dst[x] = INTERPOLATE_PIXEL_255(qt_colorConvert(src[x], dst[x]), a, dst[x], ia);
+            dst += dbpl;
+            src += sbpl;
+        }
+        return;
+    }
 
-                    const uint8x8_t src8_high = vget_high_u8(src8);
-                    const uint8x8_t dst8_high = vget_high_u8(dst8);
+    pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl); // presumably a plain 0565 -> 8888 conversion, going by its name
+}
 
-                    const uint16x8_t src16_low = vmovl_u8(src8_low);
-                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);
+extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
 
-                    const uint16x8_t src16_high = vmovl_u8(src8_high);
-                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);
+void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, // blends 32-bit source pixels onto a 16-bit destination
+                                   const uchar *srcPixels, int sbpl,
+                                   int w, int h,
+                                   int const_alpha) // 256 selects the pixman path at the end of the function
+{
+    quint16 *dst = (quint16 *) destPixels;
+    const quint32 *src = (const quint32 *) srcPixels; // the source is input only and stays const
 
-                    const uint16x8_t result16_low = qvsource_over_u16(src16_low, dst16_low, half, full);
-                    const uint16x8_t result16_high = qvsource_over_u16(src16_high, dst16_high, half, full);
+    if (const_alpha != 256) {
+        for (int y=0; y<h; ++y) {
+            int i = 0;
+            for (; i < w-7; i += 8) // full groups of eight pixels are blended in place
+                blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);
 
-                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
-                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
+            if (i < w) {
+                int tail = w - i; // 1..7 pixels left on this row
 
-                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
+                quint16 dstBuffer[8];
+                quint32 srcBuffer[8];
+                // Stage the tail in 8-entry buffers so the 8-pixel routine is never handed the partial row itself.
+                for (int j = 0; j < tail; ++j) {
+                    dstBuffer[j] = dst[i + j];
+                    srcBuffer[j] = src[i + j];
+                }
+
+                blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);
+
+                // Copy back the blended destination pixels only; the source row is never written.
+                for (int j = 0; j < tail; ++j) {
+                    dst[i + j] = dstBuffer[j];
                 }
             }
-            for (; x<w; ++x) {
-                uint s = src[x];
-                if (s >= 0xff000000)
-                    dst[x] = s;
-                else if (s != 0)
-                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
-            }
-            dst = (quint32 *)(((uchar *) dst) + dbpl);
-            src = (const quint32 *)(((const uchar *) src) + sbpl);
+
+            dst = (quint16 *)(((uchar *) dst) + dbpl);
+            src = (const quint32 *)(((const uchar *) src) + sbpl);
         }
+        return;
+    }
+    // The pixman prototype takes a non-const source pointer and strides in pixels.
+    pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, const_cast<quint32 *>(src), sbpl / 4);
+}
+
+void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    const int rowBytes = 4 * length; // byte size of one row of 32-bit pixels
+    if (const_alpha != 255) // treat the span as a one-row image; the image blend expects a 256-based alpha
+        qt_blend_argb32_on_argb32_neon((uchar *)dest, rowBytes, (uchar *)src, rowBytes, length, 1, (const_alpha * 256) / 255);
+    else // alpha of 255: hand the span straight to the pixman scanline routine
+        pixman_composite_scanline_over_asm_neon(length, dest, src);
+}
+
+void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
+                                    const uchar *srcPixels, int sbpl,
+                                    int w, int h,
+                                    int const_alpha)
+{
+    const uint *src = (const uint *) srcPixels;
+    uint *dst = (uint *) destPixels;
+    uint16x8_t half = vdupq_n_u16(0x80);
+    uint16x8_t full = vdupq_n_u16(0xff);
+    if (const_alpha == 256) {
+        pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
     } else if (const_alpha != 0) {
         const_alpha = (const_alpha * 255) >> 8;
         uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
@@ -254,6 +339,246 @@ void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
     }
 }
 
+void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer, // hands the alpha map 'bitmap' and 'color' to the pixman NEON routine, targeting a quint16 raster buffer
+                                  int x, int y, quint32 color,
+                                  const uchar *bitmap,
+                                  int mapWidth, int mapHeight, int mapStride,
+                                  const QClipData *) // unnamed parameter: the clip data is never consulted in this function
+{
+    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x; // address of destination pixel (x, y)
+    const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16); // destination stride counted in quint16 pixels, not bytes
+
+    uchar *mask = const_cast<uchar *>(bitmap); // constness is dropped only to match the argument passed to the asm routine
+
+    pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride);
+}
+
+extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);
+
+template <typename SRC, typename BlendFunc>
+struct Blend_on_RGB16_SourceAndConstAlpha_Neon { // queues source pixels and passes them to m_blender in batches of eight
+    Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
+        : m_index(0)
+        , m_blender(blender)
+        , m_const_alpha(const_alpha)
+    {
+    }
+
+    inline void write(quint16 *dst, quint32 src) // queue one source pixel; dst is the destination pixel it belongs to
+    {
+        srcBuffer[m_index++] = src; // value is converted to SRC on assignment
+
+        if (m_index == 8) { // batch is full: blend into the eight destination pixels ending at dst
+            m_blender(dst - 7, srcBuffer, m_const_alpha); // NOTE(review): assumes the eight queued writes targeted consecutive pixels dst-7..dst - confirm against callers
+            m_index = 0;
+        }
+    }
+
+    inline void flush(quint16 *dst) // blend the queued pixels (fewer than eight) into the m_index pixels located just before dst
+    {
+        if (m_index > 0) {
+            quint16 dstBuffer[8]; // scratch copy so the blender call only reads/writes this local array, not memory past the partial run
+            for (int i = 0; i < m_index; ++i)
+                dstBuffer[i] = dst[i - m_index];
+
+            m_blender(dstBuffer, srcBuffer, m_const_alpha); // entries >= m_index are uninitialized (dst) or stale (src); their results are not copied back below
+
+            for (int i = 0; i < m_index; ++i)
+                dst[i - m_index] = dstBuffer[i];
+
+            m_index = 0;
+        }
+    }
+
+    SRC srcBuffer[8]; // pending source pixels
+
+    int m_index; // number of valid entries in srcBuffer
+    BlendFunc m_blender; // callable invoked as m_blender(dst, src, const_alpha)
+    int m_const_alpha; // forwarded unchanged to m_blender
+};
+
+template <typename SRC, typename BlendFunc>
+Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
+Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha) // factory helper: callers name SRC explicitly while BlendFunc is deduced from 'blender'
+{
+    return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
+}
+
+void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, // scaled draw of quint32 source pixels onto a 16-bit destination using the NEON 8-pixel blender
+                                         const uchar *srcPixels, int sbpl,
+                                         const QRectF &targetRect,
+                                         const QRectF &sourceRect,
+                                         const QRect &clip,
+                                         int const_alpha)
+{
+    if (const_alpha == 0) // returns without touching the destination
+        return;
+
+    qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, // generic scaler; presumably drives write()/flush() of the functor built below - confirm in qt_scale_image_16bit
+        Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
+}
+
+void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
+                                   const uchar *srcPixels, int sbpl,
+                                   const QRectF &targetRect,
+                                   const QRectF &sourceRect,
+                                   const QRect &clip,
+                                   int const_alpha);
+
+void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl, // scaled draw of quint16 source pixels onto a 16-bit destination with constant alpha
+                                        const uchar *srcPixels, int sbpl,
+                                        const QRectF &targetRect,
+                                        const QRectF &sourceRect,
+                                        const QRect &clip,
+                                        int const_alpha)
+{
+    if (const_alpha == 0) // returns without touching the destination
+        return;
+
+    if (const_alpha == 256) { // 256 is delegated to the non-NEON implementation declared above
+        qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, const_alpha);
+        return;
+    }
+
+    qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, // only values other than 0 and 256 reach the NEON blender from here
+        Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
+}
+
+extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
+                                              const uchar *srcPixels, int sbpl,
+                                              const QRectF &targetRect,
+                                              const QRectF &sourceRect,
+                                              const QRect &clip,
+                                              const QTransform &targetRectTransform,
+                                              int const_alpha);
+
+void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl, // transformed draw of quint16 source pixels onto a 16-bit destination with constant alpha
+                                            const uchar *srcPixels, int sbpl,
+                                            const QRectF &targetRect,
+                                            const QRectF &sourceRect,
+                                            const QRect &clip,
+                                            const QTransform &targetRectTransform,
+                                            int const_alpha)
+{
+    if (const_alpha == 0) // returns without touching the destination
+        return;
+
+    if (const_alpha == 256) { // 256 is delegated to the non-NEON implementation declared above
+        qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
+        return;
+    }
+
+    qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl, // only values other than 0 and 256 reach the NEON blender from here
+                       reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
+        Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
+}
+
+void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, // transformed draw of quint32 source pixels onto a 16-bit destination using the NEON 8-pixel blender
+                                             const uchar *srcPixels, int sbpl,
+                                             const QRectF &targetRect,
+                                             const QRectF &sourceRect,
+                                             const QRect &clip,
+                                             const QTransform &targetRectTransform,
+                                             int const_alpha)
+{
+    if (const_alpha == 0) // returns without touching the destination
+        return;
+
+    qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl, // const_alpha == 256 is not special-cased here; the asm blender itself checks for 256
+                       reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
+        Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
+}
+
+static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src) // expands exactly 8 r5g6b5 pixels at src into 8 32-bit pixels at dst with alpha 0xff
+{
+    asm volatile (
+        "vld1.16     { d0, d1 }, [%[SRC]]\n\t" /* q0 = 8 16-bit source pixels */
+
+        /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
+           and put data into d4 - red, d3 - green, d2 - blue */
+        "vshrn.u16   d4,  q0,  #8\n\t" /* d4 = high byte of each pixel: RRRRRGGG */
+        "vshrn.u16   d3,  q0,  #3\n\t" /* d3 = bits 3..10 of each pixel: GGGGGGBB */
+        "vsli.u16    q0,  q0,  #5\n\t" /* duplicate the blue field 5 bits higher so bits 2..9 hold blue plus its top bits */
+        "vsri.u8     d4,  d4,  #5\n\t" /* replace the low 3 bits of red with its top 3 bits */
+        "vsri.u8     d3,  d3,  #6\n\t" /* replace the low 2 bits of green with its top 2 bits */
+        "vshrn.u16   d2,  q0,  #2\n\t" /* d2 = blue expanded to 8 bits */
+
+        /* fill d5 - alpha with 0xff */
+        "mov         r2, #255\n\t" /* r2 is used as scratch and is listed in the clobbers */
+        "vdup.8      d5, r2\n\t"
+
+        "vst4.8      { d2, d3, d4, d5 }, [%[DST]]" /* interleaved store: bytes B, G, R, A per pixel */
+        : : [DST]"r" (dst), [SRC]"r" (src)
+        : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
+    );
+}
+
+uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length) // converts 'length' 16-bit pixels starting at (x, y) into 32-bit pixels in buffer; returns buffer
+{
+    const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x; // first source pixel
+
+    int i = 0;
+    for (; i < length - 7; i += 8) // full groups of 8 are converted directly from the scanline into buffer
+        convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
+
+    if (i < length) { // 1..7 pixels remain: go through local 8-entry arrays so nothing past 'length' is read or written
+        quint16 srcBuffer[8];
+        quint32 dstBuffer[8];
+
+        int tail = length - i;
+        for (int j = 0; j < tail; ++j)
+            srcBuffer[j] = data[i + j];
+
+        convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer); // entries >= tail are uninitialized; their results are not copied out
+
+        for (int j = 0; j < tail; ++j)
+            buffer[i + j] = dstBuffer[j];
+    }
+
+    return buffer;
+}
+
+static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src) // packs exactly 8 32-bit pixels at src into 8 r5g6b5 pixels at dst; the alpha byte is loaded but not used
+{
+    asm volatile (
+        "vld4.8      { d0, d1, d2, d3 }, [%[SRC]]\n\t" /* de-interleave bytes 0..3 of each pixel into d0..d3; d2, d1, d0 are used below as red, green, blue */
+
+        /* convert to r5g6b5 and store it into {d28, d29} */
+        "vshll.u8    q14, d2, #8\n\t" /* red into the top byte of each 16-bit lane */
+        "vshll.u8    q8,  d1, #8\n\t" /* green widened the same way */
+        "vshll.u8    q9,  d0, #8\n\t" /* blue widened the same way */
+        "vsri.u16    q14, q8, #5\n\t" /* insert green below the top 5 bits (red) */
+        "vsri.u16    q14, q9, #11\n\t" /* insert blue below the top 11 bits (red + green) */
+
+        "vst1.16     { d28, d29 }, [%[DST]]" /* q14 = d28, d29: 8 packed pixels */
+        : : [DST]"r" (dst), [SRC]"r" (src)
+        : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
+    );
+}
+
+void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length) // converts 'length' 32-bit pixels from buffer into 16-bit pixels starting at (x, y)
+{
+    quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x; // first destination pixel
+
+    int i = 0;
+    for (; i < length - 7; i += 8) // full groups of 8 are converted directly into the scanline
+        convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
+
+    if (i < length) { // 1..7 pixels remain: go through local 8-entry arrays so nothing past 'length' is read or written
+        quint32 srcBuffer[8];
+        quint16 dstBuffer[8];
+
+        int tail = length - i;
+        for (int j = 0; j < tail; ++j)
+            srcBuffer[j] = buffer[i + j];
+
+        convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer); // entries >= tail are uninitialized; their results are not copied out
+
+        for (int j = 0; j < tail; ++j)
+            data[i + j] = dstBuffer[j];
+    }
+}
+
 QT_END_NAMESPACE
 
 #endif // QT_HAVE_NEON
diff --git a/src/gui/painting/qdrawhelper_neon_asm.S b/src/gui/painting/qdrawhelper_neon_asm.S
new file mode 100644
index 0000000..9992817
--- /dev/null
+++ b/src/gui/painting/qdrawhelper_neon_asm.S
@@ -0,0 +1,192 @@
+/****************************************************************************
+**
+** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.altmacro
+
+/* void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha) */
+
+    .func blend_8_pixels_argb32_on_rgb16_neon
+    .global blend_8_pixels_argb32_on_rgb16_neon
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden blend_8_pixels_argb32_on_rgb16_neon
+    .type blend_8_pixels_argb32_on_rgb16_neon, %function
+#endif
+blend_8_pixels_argb32_on_rgb16_neon:
+    vld4.8      { d0, d1, d2, d3 }, [r1] /* de-interleave 8 source pixels: d0 - blue, d1 - green, d2 - red, d3 - alpha */
+    vld1.16     { d4, d5 }, [r0] /* q2 = 8 r5g6b5 destination pixels */
+
+    cmp         r2, #256 /* const_alpha == 256: skip the scaling of the source */
+    beq         .blend_32_inner
+
+    vdup.8      d6, r2 /* low 8 bits of const_alpha in every byte lane */
+
+    /* multiply by const_alpha (all four source channels, alpha included) */
+    vmull.u8    q8,   d6, d0
+    vmull.u8    q9,   d6, d1
+    vmull.u8    q10,  d6, d2
+    vmull.u8    q11,  d6, d3
+
+    vshrn.u16   d0,  q8, #8 /* narrowing shift: keep the high byte of each product, no rounding */
+    vshrn.u16   d1,  q9, #8
+    vshrn.u16   d2, q10, #8
+    vshrn.u16   d3, q11, #8
+
+.blend_32_inner:
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+    vsri.u8     d6, d6, #5
+    vmvn.8      d3, d3 /* d3 = 255 - source alpha (interleaved with the conversion above) */
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+
+    pld [r0, #128] /* preload dst + 128 */
+
+    /* now do alpha blending, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6 /* dst red   * inverse alpha */
+    vmull.u8    q11, d3, d7 /* dst green * inverse alpha */
+    vmull.u8    q12, d3, d30 /* dst blue  * inverse alpha */
+    vrshr.u16   q13, q10, #8 /* vrshr + vraddhn: the usual rounded divide-by-255 of each product */
+    vrshr.u16   q3,  q11, #8 /* q3 (d6, d7) is reused as scratch; d6/d7 were consumed by the vmull above */
+    vrshr.u16   q15, q12, #8 /* q15 (d30, d31) is reused as scratch; d30 was consumed by the vmull above */
+    vraddhn.u16 d20, q10, q13 /* d20 = scaled dst red */
+    vraddhn.u16 d23, q11, q3 /* d23 = scaled dst green */
+    vraddhn.u16 d22, q12, q15 /* d22 = scaled dst blue */
+    vqadd.u8    d16, d2, d20 /* red = src red + scaled dst red, saturating */
+    vqadd.u8    q9, q0, q11 /* d18 = d0 + d22 (blue), d19 = d1 + d23 (green), saturating */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8
+    vshll.u8    q8, d19, #8 /* overwrites d16, which was already consumed on the previous line */
+    vshll.u8    q9, d18, #8 /* overwrites d18/d19; d19 was already consumed on the previous line */
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+
+    vst1.16     { d28, d29 }, [r0] /* write the 8 blended pixels back over dst */
+
+    bx          lr
+
+    .endfunc
+
+/* void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha) */
+
+    .func blend_8_pixels_rgb16_on_rgb16_neon
+    .global blend_8_pixels_rgb16_on_rgb16_neon
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden blend_8_pixels_rgb16_on_rgb16_neon
+    .type blend_8_pixels_rgb16_on_rgb16_neon, %function
+#endif
+blend_8_pixels_rgb16_on_rgb16_neon:
+    vld1.16     { d0, d1 }, [r0] /* q0 = 8 destination pixels */
+    vld1.16     { d2, d3 }, [r1] /* q1 = 8 source pixels */
+
+    rsb         r3, r2, #256 /* r3 = 256 - const_alpha */
+    vdup.8      d4, r2 /* d4 = low 8 bits of const_alpha in every byte lane */
+    vdup.8      d5, r3 /* d5 = low 8 bits of (256 - const_alpha); const_alpha == 0 would wrap to 0, and 256 would wrap d4 to 0 */
+
+    /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6,  q0,  #8
+    vshrn.u16   d7,  q0,  #3
+    vsli.u16    q0,  q0,  #5
+    vsri.u8     d6,  d6,  #5
+    vsri.u8     d7,  d7,  #6
+    vshrn.u16   d30, q0,  #2
+
+    /* same for the source pixels in {d2, d3}: d26 - red, d27 - green, d28 - blue */
+    vshrn.u16   d26, q1,  #8
+    vshrn.u16   d27, q1,  #3
+    vsli.u16    q1,  q1,  #5
+    vsri.u8     d26, d26, #5
+    vsri.u8     d27, d27, #6
+    vshrn.u16   d28, q1,  #2
+
+    /* multiply dst by inv const_alpha */
+    vmull.u8    q10, d5,  d6
+    vmull.u8    q11, d5,  d7
+    vmull.u8    q12, d5,  d30
+
+    vshrn.u16   d6,  q10, #8 /* keep the high byte of each product, no rounding */
+    vshrn.u16   d7,  q11, #8
+    vshrn.u16   d30, q12, #8
+
+    /* multiply src by const_alpha */
+    vmull.u8    q10,  d4, d26
+    vmull.u8    q11,  d4, d27
+    vmull.u8    q12,  d4, d28
+
+    vshrn.u16   d26, q10, #8 /* keep the high byte of each product, no rounding */
+    vshrn.u16   d27, q11, #8
+    vshrn.u16   d28, q12, #8
+
+    /* preload dst + 128 */
+    pld [r0, #128]
+
+    /* add components, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vadd.u8     d16, d26, d6
+    vadd.u8     d19, d27, d7
+    vadd.u8     d18, d28, d30
+
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8 /* q14 = d28, d29; the source blue in d28 was already consumed above */
+    vshll.u8    q8,  d19, #8 /* overwrites d16, already consumed on the previous line */
+    vshll.u8    q9,  d18, #8 /* overwrites d18/d19; d19 was already consumed on the previous line */
+    vsri.u16    q14,  q8, #5
+    vsri.u16    q14,  q9, #11
+
+    vst1.16     { d28, d29 }, [r0] /* write the 8 blended pixels back over dst */
+
+    bx          lr
+
+    .endfunc
diff --git a/src/gui/painting/qdrawhelper_neon_p.h b/src/gui/painting/qdrawhelper_neon_p.h
index 1994441..d6a4509 100644
--- a/src/gui/painting/qdrawhelper_neon_p.h
+++ b/src/gui/painting/qdrawhelper_neon_p.h
@@ -69,6 +69,64 @@ void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
                                   int w, int h,
                                   int const_alpha);
 
+void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
+                                   const uchar *srcPixels, int sbpl,
+                                   int w, int h,
+                                   int const_alpha);
+
+void qt_blend_argb32_on_argb32_scanline_neon(uint *dest,
+                                             const uint *src,
+                                             int length,
+                                             uint const_alpha);
+
+void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
+                                   const uchar *srcPixels, int sbpl,
+                                   int w, int h,
+                                   int const_alpha);
+
+void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
+                                  int x, int y, quint32 color,
+                                  const uchar *bitmap,
+                                  int mapWidth, int mapHeight, int mapStride,
+                                  const QClipData *clip);
+
+void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
+                                         const uchar *srcPixels, int sbpl,
+                                         const QRectF &targetRect,
+                                         const QRectF &sourceRect,
+                                         const QRect &clip,
+                                         int const_alpha);
+
+void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
+                                        const uchar *srcPixels, int sbpl,
+                                        const QRectF &targetRect,
+                                        const QRectF &sourceRect,
+                                        const QRect &clip,
+                                        int const_alpha);
+
+void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
+                                             const uchar *srcPixels, int sbpl,
+                                             const QRectF &targetRect,
+                                             const QRectF &sourceRect,
+                                             const QRect &clip,
+                                             const QTransform &targetRectTransform,
+                                             int const_alpha);
+
+void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
+                                            const uchar *srcPixels, int sbpl,
+                                            const QRectF &targetRect,
+                                            const QRectF &sourceRect,
+                                            const QRect &clip,
+                                            const QTransform &targetRectTransform,
+                                            int const_alpha);
+
+uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer,
+                                          QRasterBuffer *rasterBuffer,
+                                          int x, int y, int length);
+
+void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer,
+                                        int x, int y, const uint *buffer, int length);
+
 #endif // QT_HAVE_NEON
 
 QT_END_NAMESPACE
diff --git a/src/gui/painting/qpaintbuffer.cpp b/src/gui/painting/qpaintbuffer.cpp
index 39b76c8..ca2077f 100644
--- a/src/gui/painting/qpaintbuffer.cpp
+++ b/src/gui/painting/qpaintbuffer.cpp
@@ -269,24 +269,304 @@ void QPaintBuffer::draw(QPainter *painter, int frame) const
     printf("\n");
 #endif
 
-    if (painter && !painter->isActive())
-        return;
+    processCommands(painter, frameStartIndex(frame), frameEndIndex(frame));
+
+#ifdef QPAINTBUFFER_DEBUG_DRAW
+    qDebug() << "QPaintBuffer::draw() -------------------------------- DONE!";
+#endif
+}
+
+int QPaintBuffer::frameStartIndex(int frame) const // index of the first command belonging to 'frame'
+{
+    return (frame == 0) ? 0 : d_ptr->frames.at(frame - 1); // frame 0 starts at command 0; later frames start at the boundary stored in frames[frame - 1]
+}
+
+int QPaintBuffer::frameEndIndex(int frame) const // index one past the last command belonging to 'frame'
+{
+    return (frame == d_ptr->frames.size()) ? d_ptr->commands.size() : d_ptr->frames.at(frame); // the last frame runs to the end of the command list; otherwise use the stored boundary
+}
+
+int QPaintBuffer::processCommands(QPainter *painter, int begin, int end) const // replays commands [begin, end) on painter; returns the net Cmd_Save minus Cmd_Restore count of that range
+{
+    if (!painter || !painter->isActive()) // nothing is replayed and 0 is returned for a null or inactive painter
+        return 0;
 
     QPaintEngineEx *xengine = painter->paintEngine()->isExtended()
                               ? (QPaintEngineEx *) painter->paintEngine() : 0;
     if (xengine) {
         QPaintEngineExReplayer player;
-        player.draw(*this, painter, frame);
+        player.processCommands(*this, painter, begin, end); // extended paint engine: replay through QPaintEngineExReplayer
     } else {
         QPainterReplayer player;
-        player.draw(*this, painter, frame);
+        player.processCommands(*this, painter, begin, end); // otherwise replay through the plain QPainterReplayer
     }
 
-#ifdef QPAINTBUFFER_DEBUG_DRAW
-    qDebug() << "QPaintBuffer::draw() -------------------------------- DONE!";
-#endif
+    int depth = 0; // save/restore balance, computed in a separate pass after the replay
+    for (int i = begin; i < end; ++i) {
+        const QPaintBufferCommand &cmd = d_ptr->commands.at(i);
+        if (cmd.id == QPaintBufferPrivate::Cmd_Save)
+            ++depth;
+        else if (cmd.id == QPaintBufferPrivate::Cmd_Restore)
+            --depth;
+    }
+    return depth; // can be negative when the range holds more restores than saves
 }
 
+QString QPaintBuffer::commandDescription(int command) const // returns a human-readable description of the command at index 'command'
+{
+    QString desc;
+    QDebug debug(&desc); // everything streamed into 'debug' ends up in desc
+
+    const QPaintBufferCommand &cmd = d_ptr->commands.at(command);
+
+    switch (cmd.id) {
+    case QPaintBufferPrivate::Cmd_Save: {
+        debug << "Cmd_Save";
+        break; }
+
+    case QPaintBufferPrivate::Cmd_Restore: {
+        debug << "Cmd_Restore";
+        break; }
+
+    case QPaintBufferPrivate::Cmd_SetBrush: {
+        QBrush brush = qVariantValue<QBrush>(d_ptr->variants.at(cmd.offset));
+        debug << "Cmd_SetBrush: " << brush;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_SetBrushOrigin: {
+        debug << "Cmd_SetBrushOrigin: " << d_ptr->variants.at(cmd.offset).toPointF();
+        break; }
+
+    case QPaintBufferPrivate::Cmd_SetCompositionMode: {
+        QPainter::CompositionMode mode = (QPainter::CompositionMode) cmd.extra;
+        debug << "ExCmd_SetCompositionMode, mode: " << mode;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_SetOpacity: {
+        debug << "ExCmd_SetOpacity: " << d_ptr->variants.at(cmd.offset).toDouble();
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawVectorPath: {
+        debug << "ExCmd_DrawVectorPath: size: " << cmd.size
+//                 << ", hints:" << d->ints[cmd.offset2+cmd.size]
+                 << "pts/elms:" << cmd.offset << cmd.offset2;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_StrokeVectorPath: {
+        QPen pen = qVariantValue<QPen>(d_ptr->variants.at(cmd.extra));
+        debug << "ExCmd_StrokeVectorPath: size: " << cmd.size
+//                 << ", hints:" << d->ints[cmd.offset2+cmd.size]
+                 << "pts/elms:" << cmd.offset << cmd.offset2 << pen;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_FillVectorPath: {
+        QBrush brush = qVariantValue<QBrush>(d_ptr->variants.at(cmd.extra));
+        debug << "ExCmd_FillVectorPath: size: " << cmd.size
+//                 << ", hints:" << d->ints[cmd.offset2+cmd.size]
+                 << "pts/elms:" << cmd.offset << cmd.offset2 << brush;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_FillRectBrush: {
+        QBrush brush = qVariantValue<QBrush>(d_ptr->variants.at(cmd.extra));
+        QRectF *rect = (QRectF *)(d_ptr->floats.constData() + cmd.offset);
+        debug << "ExCmd_FillRectBrush, offset: " << cmd.offset << " rect: " << *rect << " brush: " << brush;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_FillRectColor: {
+        QColor color = qVariantValue<QColor>(d_ptr->variants.at(cmd.extra));
+        QRectF *rect = (QRectF *)(d_ptr->floats.constData() + cmd.offset);
+        debug << "ExCmd_FillRectColor, offset: " << cmd.offset << " rect: " << *rect << " color: " << color; // label names this command, not Cmd_FillRectBrush
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawPolygonF: {
+        debug << "ExCmd_DrawPolygonF, offset: " << cmd.offset << " size: " << cmd.size
+                 << " mode: " << cmd.extra
+                 << d_ptr->floats.at(cmd.offset)
+                 << d_ptr->floats.at(cmd.offset+1);
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawPolygonI: {
+        debug << "ExCmd_DrawPolygonI, offset: " << cmd.offset << " size: " << cmd.size
+                 << " mode: " << cmd.extra
+                 << d_ptr->ints.at(cmd.offset)
+                 << d_ptr->ints.at(cmd.offset+1);
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawEllipseF: {
+        debug << "ExCmd_DrawEllipseF, offset: " << cmd.offset;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawLineF: {
+        debug << "ExCmd_DrawLineF, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawLineI: {
+        debug << "ExCmd_DrawLineI, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawPointsF: {
+        debug << "ExCmd_DrawPointsF, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawPointsI: {
+        debug << "ExCmd_DrawPointsI, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawPolylineF: {
+        debug << "ExCmd_DrawPolylineF, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawPolylineI: {
+        debug << "ExCmd_DrawPolylineI, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawRectF: {
+        debug << "ExCmd_DrawRectF, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawRectI: {
+        debug << "ExCmd_DrawRectI, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_SetClipEnabled: {
+        bool clipEnabled = d_ptr->variants.at(cmd.offset).toBool();
+        debug << "ExCmd_SetClipEnabled:" << clipEnabled;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_ClipVectorPath: {
+        QVectorPathCmd path(d_ptr, cmd);
+        debug << "ExCmd_ClipVectorPath:" << path().elementCount();
+        break; }
+
+    case QPaintBufferPrivate::Cmd_ClipRect: {
+        QRect rect(QPoint(d_ptr->ints.at(cmd.offset), d_ptr->ints.at(cmd.offset + 1)),
+                   QPoint(d_ptr->ints.at(cmd.offset + 2), d_ptr->ints.at(cmd.offset + 3)));
+        debug << "ExCmd_ClipRect:" << rect << cmd.extra;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_ClipRegion: {
+        QRegion region(d_ptr->variants.at(cmd.offset).value<QRegion>());
+        debug << "ExCmd_ClipRegion:" << region.boundingRect() << cmd.extra;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_SetPen: {
+        QPen pen = qVariantValue<QPen>(d_ptr->variants.at(cmd.offset));
+        debug << "Cmd_SetPen: " << pen;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_SetTransform: {
+        QTransform xform = qVariantValue<QTransform>(d_ptr->variants.at(cmd.offset));
+        debug << "Cmd_SetTransform, offset: " << cmd.offset << xform;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_SetRenderHints: {
+        debug << "Cmd_SetRenderHints, hints: " << cmd.extra;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_SetBackgroundMode: {
+        debug << "Cmd_SetBackgroundMode: " << cmd.extra;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawConvexPolygonF: {
+        debug << "Cmd_DrawConvexPolygonF, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawConvexPolygonI: {
+        debug << "Cmd_DrawConvexPolygonI, offset: " << cmd.offset << " size: " << cmd.size;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawEllipseI: {
+        debug << "Cmd_DrawEllipseI, offset: " << cmd.offset;
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawPixmapRect: {
+        QPixmap pm(d_ptr->variants.at(cmd.offset).value<QPixmap>());
+        QRectF r(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1),
+                 d_ptr->floats.at(cmd.extra+2), d_ptr->floats.at(cmd.extra+3));
+
+        QRectF sr(d_ptr->floats.at(cmd.extra+4), d_ptr->floats.at(cmd.extra+5),
+                  d_ptr->floats.at(cmd.extra+6), d_ptr->floats.at(cmd.extra+7));
+        debug << "Cmd_DrawPixmapRect:" << r << sr << pm.size();
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawPixmapPos: {
+        QPixmap pm(d_ptr->variants.at(cmd.offset).value<QPixmap>());
+        QPointF pos(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1));
+        debug << "Cmd_DrawPixmapPos:" << pos << pm.size();
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawTiledPixmap: {
+        QPixmap pm(d_ptr->variants.at(cmd.offset).value<QPixmap>());
+        QRectF r(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1),
+                 d_ptr->floats.at(cmd.extra+2), d_ptr->floats.at(cmd.extra+3));
+
+        QPointF offset(d_ptr->floats.at(cmd.extra+4), d_ptr->floats.at(cmd.extra+5));
+        debug << "Cmd_DrawTiledPixmap:" << r << offset << pm.size();
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawImageRect: {
+        QImage image(d_ptr->variants.at(cmd.offset).value<QImage>());
+        QRectF r(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1),
+                 d_ptr->floats.at(cmd.extra+2), d_ptr->floats.at(cmd.extra+3));
+        QRectF sr(d_ptr->floats.at(cmd.extra+4), d_ptr->floats.at(cmd.extra+5),
+                  d_ptr->floats.at(cmd.extra+6), d_ptr->floats.at(cmd.extra+7));
+        debug << "Cmd_DrawImageRect:" << r << sr << image.size();
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawImagePos: {
+        QImage image(d_ptr->variants.at(cmd.offset).value<QImage>());
+        QPointF pos(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1));
+        debug << "Cmd_DrawImagePos:" << pos << image.size();
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawText: {
+        QPointF pos(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1));
+        QList<QVariant> variants(d_ptr->variants.at(cmd.offset).value<QList<QVariant> >());
+
+        QFont font(variants.at(0).value<QFont>());
+        QString text(variants.at(1).value<QString>());
+
+        debug << "Cmd_DrawText:" << pos << text << font.family();
+        break; }
+
+    case QPaintBufferPrivate::Cmd_DrawTextItem: {
+        QPointF pos(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1));
+        QTextItemIntCopy *tiCopy = reinterpret_cast<QTextItemIntCopy *>(qVariantValue<void *>(d_ptr->variants.at(cmd.offset)));
+        QTextItemInt &ti = (*tiCopy)();
+        QString text(ti.text());
+
+        QFont font(ti.font()); // NOTE(review): font and justificationWidth are computed but never streamed to 'debug' below
+        font.setUnderline(false);
+        font.setStrikeOut(false);
+        font.setOverline(false);
+
+        const QTextItemInt &si = static_cast<const QTextItemInt &>(ti);
+        qreal justificationWidth = 0;
+        if (si.justified)
+            justificationWidth = si.width.toReal();
+
+        debug << "Cmd_DrawTextItem:" << pos << " " << text;
+        break; }
+    case QPaintBufferPrivate::Cmd_SystemStateChanged: {
+        QRegion systemClip(d_ptr->variants.at(cmd.offset).value<QRegion>());
+
+        debug << "Cmd_SystemStateChanged:" << systemClip;
+        break; }
+    case QPaintBufferPrivate::Cmd_Translate: {
+        QPointF delta(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1));
+        debug << "Cmd_Translate:" << delta;
+        break; }
+    case QPaintBufferPrivate::Cmd_DrawStaticText: {
+        QPointF delta(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1)); // NOTE(review): delta is read but not included in the output
+        QVariantList variants(d_ptr->variants.at(cmd.offset).value<QVariantList>());
+
+        QStaticText text(variants.at(0).value<QStaticText>());
+        debug << "Cmd_DrawStaticText:" << text.text();
+        break; }
+    } // no default case: command ids not listed above add nothing to desc
+
+    return desc;
+}
 
 QRectF QPaintBuffer::boundingRect() const
 {
@@ -1110,15 +1390,12 @@ void QPainterReplayer::setupTransform(QPainter *_painter)
     painter->setTransform(m_world_matrix);
 }
 
-void QPainterReplayer::draw(const QPaintBuffer &buffer, QPainter *_painter, int frame)
+void QPainterReplayer::processCommands(const QPaintBuffer &buffer, QPainter *p, int begin, int end)
 {
     d = buffer.d_ptr;
-    setupTransform(_painter);
-
-    int frameStart = (frame == 0) ? 0 : d->frames.at(frame-1);
-    int frameEnd = (frame == d->frames.size()) ? d->commands.size() : d->frames.at(frame);
+    painter = p;
 
-    for (int cmdIndex=frameStart; cmdIndex<frameEnd; ++cmdIndex) {
+    for (int cmdIndex = begin; cmdIndex < end; ++cmdIndex) {
         const QPaintBufferCommand &cmd = d->commands.at(cmdIndex);
         process(cmd);
     }
diff --git a/src/gui/painting/qpaintbuffer_p.h b/src/gui/painting/qpaintbuffer_p.h
index 0fde290..4576947 100644
--- a/src/gui/painting/qpaintbuffer_p.h
+++ b/src/gui/painting/qpaintbuffer_p.h
@@ -78,6 +78,12 @@ public:
     int numFrames() const;
 
     void draw(QPainter *painter, int frame = 0) const;
+
+    int frameStartIndex(int frame) const;
+    int frameEndIndex(int frame) const;
+    int processCommands(QPainter *painter, int begin, int end) const;
+    QString commandDescription(int command) const;
+
     void setBoundingRect(const QRectF &rect);
     QRectF boundingRect() const;
 
@@ -317,7 +323,7 @@ public:
 
     void setupTransform(QPainter *painter);
     virtual void process(const QPaintBufferCommand &cmd);
-    void draw(const QPaintBuffer &buffer, QPainter *painter, int frame);
+    void processCommands(const QPaintBuffer &buffer, QPainter *painter, int begin, int end);
 
 protected:
     QPaintBufferPrivate *d;
diff --git a/src/gui/painting/qtransform.cpp b/src/gui/painting/qtransform.cpp
index 4f42a58..988d678 100644
--- a/src/gui/painting/qtransform.cpp
+++ b/src/gui/painting/qtransform.cpp
@@ -1037,8 +1037,18 @@ QDataStream & operator>>(QDataStream &s, QTransform &t)
 #ifndef QT_NO_DEBUG_STREAM
 QDebug operator<<(QDebug dbg, const QTransform &m)
 {
-    dbg.nospace() << "QTransform("
-                  << "11="  << m.m11()
+    // QTransform::TransformationType enumerators are bit flags (0x00, 0x01,
+    // 0x02, 0x04, 0x08, 0x10), not 0..5, so the table is indexed by the raw
+    // value and the unused slots in between are left empty.
+    static const char *typeStr[] = {
+        "TxNone", "TxTranslate", "TxScale", "",
+        "TxRotate", "", "", "",
+        "TxShear", "", "", "", "", "", "", "",
+        "TxProject"
+    };
+
+    dbg.nospace() << "QTransform(type=" << typeStr[m.type()] << ','
+                  << " 11=" << m.m11()
                   << " 12=" << m.m12()
                   << " 13=" << m.m13()
                   << " 21=" << m.m21()
@@ -1048,6 +1058,7 @@ QDebug operator<<(QDebug dbg, const QTransform &m)
                   << " 32=" << m.m32()
                   << " 33=" << m.m33()
                   << ')';
+
     return dbg.space();
 }
 #endif
diff --git a/src/plugins/bearer/corewlan/qcorewlanengine.h b/src/plugins/bearer/corewlan/qcorewlanengine.h
index 5e93193..11f5d96 100644
--- a/src/plugins/bearer/corewlan/qcorewlanengine.h
+++ b/src/plugins/bearer/corewlan/qcorewlanengine.h
@@ -102,7 +102,7 @@ protected:
     void getUserConfigurations();
     QString getNetworkNameFromSsid(const QString &ssid);
     QString getSsidFromNetworkName(const QString &name);
-    QStringList foundNetwork(const QString &id, const QString &ssid, const QNetworkConfiguration::StateFlags state, const QString &interfaceName);
+    QStringList foundNetwork(const QString &id, const QString &ssid, const QNetworkConfiguration::StateFlags state, const QString &interfaceName, const QNetworkConfiguration::Purpose purpose);
 };
 
 QT_END_NAMESPACE
diff --git a/src/plugins/bearer/corewlan/qcorewlanengine.mm b/src/plugins/bearer/corewlan/qcorewlanengine.mm
index a366d00..b59ccee 100644
--- a/src/plugins/bearer/corewlan/qcorewlanengine.mm
+++ b/src/plugins/bearer/corewlan/qcorewlanengine.mm
@@ -271,7 +271,7 @@ void QCoreWlanEngine::connectToId(const QString &id)
                         SecKeychainAttributeList attributeList = {3,attributes};
 
                         SecKeychainSearchRef searchRef;
-                        OSErr result = SecKeychainSearchCreateFromAttributes(NULL, kSecGenericPasswordItemClass, &attributeList, &searchRef);
+                        SecKeychainSearchCreateFromAttributes(NULL, kSecGenericPasswordItemClass, &attributeList, &searchRef);
 
                         NSString *password = @"";
                         SecKeychainItemRef searchItem;
@@ -429,7 +429,14 @@ QStringList QCoreWlanEngine::scanForSsids(const QString &interfaceName)
                         state = QNetworkConfiguration::Undefined;
                     }
                 }
-                found.append(foundNetwork(id, networkSsid, state, interfaceName));
+                QNetworkConfiguration::Purpose purpose = QNetworkConfiguration::UnknownPurpose;
+                if([[apNetwork securityMode] intValue] == kCWSecurityModeOpen) {
+                    purpose = QNetworkConfiguration::PublicPurpose;
+                } else {
+                    purpose = QNetworkConfiguration::PrivatePurpose;
+                }
+
+                found.append(foundNetwork(id, networkSsid, state, interfaceName, purpose));
 
             } //end row
         } //end error
@@ -470,13 +477,13 @@ QStringList QCoreWlanEngine::scanForSsids(const QString &interfaceName)
                 state = QNetworkConfiguration::Defined;
             }
 
-            found.append(foundNetwork(id, networkName, state, interfaceName));
+            found.append(foundNetwork(id, networkName, state, interfaceName, QNetworkConfiguration::UnknownPurpose));
         }
     }
     return found;
 }
 
-QStringList QCoreWlanEngine::foundNetwork(const QString &id, const QString &name, const QNetworkConfiguration::StateFlags state, const QString &interfaceName)
+QStringList QCoreWlanEngine::foundNetwork(const QString &id, const QString &name, const QNetworkConfiguration::StateFlags state, const QString &interfaceName, const QNetworkConfiguration::Purpose purpose)
 {
     QStringList found;
     QMutexLocker locker(&mutex);
@@ -507,6 +514,10 @@ QStringList QCoreWlanEngine::foundNetwork(const QString &id, const QString &name
             changed = true;
         }
 
+        if (ptr->purpose != purpose) {
+            ptr->purpose = purpose;
+            changed = true;
+        }
         ptr->mutex.unlock();
 
         if (changed) {
@@ -524,6 +535,7 @@ QStringList QCoreWlanEngine::foundNetwork(const QString &id, const QString &name
         ptr->state = state;
         ptr->type = QNetworkConfiguration::InternetAccessPoint;
         ptr->bearer = QLatin1String("WLAN");
+        ptr->purpose = purpose;
 
         accessPointConfigurations.insert(ptr->id, ptr);
         configurationInterface.insert(ptr->id, interfaceName);
diff --git a/src/plugins/bearer/networkmanager/qnetworkmanagerengine.cpp b/src/plugins/bearer/networkmanager/qnetworkmanagerengine.cpp
index 28ee38e..72d6838 100644
--- a/src/plugins/bearer/networkmanager/qnetworkmanagerengine.cpp
+++ b/src/plugins/bearer/networkmanager/qnetworkmanagerengine.cpp
@@ -602,7 +602,11 @@ void QNetworkManagerEngine::newAccessPoint(const QString &path, const QDBusObjec
     ptr->isValid = true;
     ptr->id = QString::number(qHash(objectPath.path()));
     ptr->type = QNetworkConfiguration::InternetAccessPoint;
-    ptr->purpose = QNetworkConfiguration::PublicPurpose;
+    // flags() is a bitmask: test the privacy bit rather than comparing the
+    // whole value, so an AP that sets additional flag bits stays private.
+    ptr->purpose = (accessPoint->flags() & NM_802_11_AP_FLAGS_PRIVACY)
+                   ? QNetworkConfiguration::PrivatePurpose
+                   : QNetworkConfiguration::PublicPurpose;
     ptr->state = QNetworkConfiguration::Undefined;
     ptr->bearer = QLatin1String("WLAN");
 
@@ -718,6 +722,7 @@ QNetworkConfigurationPrivate *QNetworkManagerEngine::parseConnection(const QStri
 
     if (connectionType == QLatin1String("802-3-ethernet")) {
         cpPriv->bearer = QLatin1String("Ethernet");
+        cpPriv->purpose = QNetworkConfiguration::PublicPurpose;
 
         foreach (const QDBusObjectPath &devicePath, interface->getDevices()) {
             QNetworkManagerInterfaceDevice device(devicePath.path());
@@ -734,7 +739,12 @@ QNetworkConfigurationPrivate *QNetworkManagerEngine::parseConnection(const QStri
         cpPriv->bearer = QLatin1String("WLAN");
 
         const QString connectionSsid = map.value("802-11-wireless").value("ssid").toString();
-
+        const QString connectionSecurity = map.value("802-11-wireless").value("security").toString();
+        if(!connectionSecurity.isEmpty()) {
+            cpPriv->purpose = QNetworkConfiguration::PrivatePurpose;
+        } else {
+            cpPriv->purpose = QNetworkConfiguration::PublicPurpose;
+        }
         for (int i = 0; i < accessPoints.count(); ++i) {
             if (connectionSsid == accessPoints.at(i)->ssid()) {
                 cpPriv->state |= QNetworkConfiguration::Discovered;
diff --git a/src/plugins/mediaservices/directshow/mediaplayer/directshowmediatype.cpp b/src/plugins/mediaservices/directshow/mediaplayer/directshowmediatype.cpp
index cf6d45b..f8f519d 100644
--- a/src/plugins/mediaservices/directshow/mediaplayer/directshowmediatype.cpp
+++ b/src/plugins/mediaservices/directshow/mediaplayer/directshowmediatype.cpp
@@ -130,9 +130,17 @@ QVideoSurfaceFormat DirectShowMediaType::formatFromType(const AM_MEDIA_TYPE &typ
                 if (header->AvgTimePerFrame > 0)
-                    format.setFrameRate(10000 /header->AvgTimePerFrame);
+                    format.setFrameRate(qreal(10000000) / header->AvgTimePerFrame); // AvgTimePerFrame is in 100 ns units; divide as qreal to keep fractional rates
 
-                format.setScanLineDirection(header->bmiHeader.biHeight < 0
-                        ? QVideoSurfaceFormat::TopToBottom
-                        : QVideoSurfaceFormat::BottomToTop);
+                switch (qt_typeLookup[i].pixelFormat) {
+                    case QVideoFrame::Format_RGB32:
+                    case QVideoFrame::Format_BGR24:
+                    case QVideoFrame::Format_RGB565:
+                    case QVideoFrame::Format_RGB555:
+                        if (header->bmiHeader.biHeight >= 0)
+                            format.setScanLineDirection(QVideoSurfaceFormat::BottomToTop);
+                        break;
+                    default:
+                        break;
+                }
 
                 return format;
             } else if (IsEqualGUID(type.formattype, FORMAT_VideoInfo2)) {
@@ -145,9 +153,17 @@ QVideoSurfaceFormat DirectShowMediaType::formatFromType(const AM_MEDIA_TYPE &typ
                 if (header->AvgTimePerFrame > 0)
-                    format.setFrameRate(10000 / header->AvgTimePerFrame);
+                    format.setFrameRate(qreal(10000000) / header->AvgTimePerFrame); // AvgTimePerFrame is in 100 ns units; divide as qreal to keep fractional rates
 
-                format.setScanLineDirection(header->bmiHeader.biHeight < 0
-                        ? QVideoSurfaceFormat::TopToBottom
-                        : QVideoSurfaceFormat::BottomToTop);
+                switch (qt_typeLookup[i].pixelFormat) {
+                    case QVideoFrame::Format_RGB32:
+                    case QVideoFrame::Format_BGR24:
+                    case QVideoFrame::Format_RGB565:
+                    case QVideoFrame::Format_RGB555:
+                        if (header->bmiHeader.biHeight >= 0)
+                            format.setScanLineDirection(QVideoSurfaceFormat::BottomToTop);
+                        break;
+                    default:
+                        break;
+                }
 
                 return format;
             }
diff --git a/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.cpp b/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.cpp
index d54d188..a5f143f 100644
--- a/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.cpp
+++ b/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.cpp
@@ -191,6 +191,8 @@ void DirectShowPlayerService::load(const QMediaContent &media, QIODevice *stream
     m_atEnd = false;
     m_metaDataControl->updateGraph(0, 0);
 
+    QCoreApplication::postEvent(this, new QEvent(QEvent::Type(VideoOutputChange)));
+
     if (m_resources.isEmpty() && !stream) {
         m_pendingTasks = 0;
         m_graphStatus = NoMedia;
@@ -464,6 +466,8 @@ void DirectShowPlayerService::doRender(QMutexLocker *locker)
             QCoreApplication::postEvent(this, new QEvent(QEvent::Type(Error)));
         }
 
+        QCoreApplication::postEvent(this, new QEvent(QEvent::Type(VideoOutputChange)));
+
         m_executedTasks |= Render;
     }
 }
@@ -1144,6 +1148,9 @@ void DirectShowPlayerService::customEvent(QEvent *event)
         QMutexLocker locker(&m_mutex);
 
         m_playerControl->updatePosition(m_position);
+    } else if (event->type() == QEvent::Type(VideoOutputChange)) {
+        if (m_videoWindowControl)
+            m_videoWindowControl->updateNativeSize();
     } else {
         QMediaService::customEvent(event);
     }
diff --git a/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.h b/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.h
index 23515d0..d3ef809 100644
--- a/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.h
+++ b/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.h
@@ -164,7 +164,8 @@ private:
         DurationChange,
         StatusChange,
         EndOfMedia,
-        PositionChange
+        PositionChange,
+        VideoOutputChange
     };
 
     enum GraphStatus
diff --git a/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.cpp b/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.cpp
index 4b9aeb8..e25dd99 100644
--- a/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.cpp
+++ b/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.cpp
@@ -51,6 +51,7 @@ Vmr9VideoWindowControl::Vmr9VideoWindowControl(QObject *parent)
     , m_filter(com_new<IBaseFilter>(CLSID_VideoMixingRenderer9, IID_IBaseFilter))
     , m_windowId(0)
     , m_dirtyValues(0)
+    , m_aspectRatioMode(Qt::KeepAspectRatio)
     , m_brightness(0)
     , m_contrast(0)
     , m_hue(0)
@@ -90,33 +91,30 @@ void Vmr9VideoWindowControl::setWinId(WId id)
 
 QRect Vmr9VideoWindowControl::displayRect() const
 {
-    QRect rect;
-
-    if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>(
-            m_filter, IID_IVMRWindowlessControl9)) {
-        RECT sourceRect;
-        RECT displayRect;
-
-        if (control->GetVideoPosition(&sourceRect, &displayRect) == S_OK) {
-            rect = QRect(
-                    displayRect.left,
-                    displayRect.bottom,
-                    displayRect.right - displayRect.left,
-                    displayRect.bottom - displayRect.top);
-        }
-        control->Release();
-    }
-    return rect;
+    return m_displayRect;
 }
 
 void Vmr9VideoWindowControl::setDisplayRect(const QRect &rect)
 {
+    m_displayRect = rect;
+
     if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>(
             m_filter, IID_IVMRWindowlessControl9)) {
         RECT sourceRect = { 0, 0, 0, 0 };
         RECT displayRect = { rect.left(), rect.top(), rect.right(), rect.bottom() };
 
         control->GetNativeVideoSize(&sourceRect.right, &sourceRect.bottom, 0, 0);
+
+        if (m_aspectRatioMode == Qt::KeepAspectRatioByExpanding) { // shrink sourceRect to a centred region with the display rect's aspect ratio
+            QSize clippedSize = rect.size();
+            clippedSize.scale(sourceRect.right, sourceRect.bottom, Qt::KeepAspectRatio); // right/bottom were filled by GetNativeVideoSize() above
+
+            sourceRect.left = (sourceRect.right - clippedSize.width()) / 2; // centre the region horizontally
+            sourceRect.top = (sourceRect.bottom - clippedSize.height()) / 2; // centre the region vertically
+            sourceRect.right = sourceRect.left + clippedSize.width();
+            sourceRect.bottom = sourceRect.top + clippedSize.height();
+        }
+
         control->SetVideoPosition(&sourceRect, &displayRect);
         control->Release();
     }
@@ -134,7 +132,6 @@ void Vmr9VideoWindowControl::setFullScreen(bool fullScreen)
 
 void Vmr9VideoWindowControl::repaint()
 {
-
     if (QWidget *widget = QWidget::find(m_windowId)) {
         HDC dc = widget->getDC();
         if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>(
@@ -164,21 +161,13 @@ QSize Vmr9VideoWindowControl::nativeSize() const
 
 Qt::AspectRatioMode Vmr9VideoWindowControl::aspectRatioMode() const
 {
-    Qt::AspectRatioMode mode = Qt::KeepAspectRatio;
-
-    if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>(
-            m_filter, IID_IVMRWindowlessControl9)) {
-        DWORD arMode;
-
-        if (control->GetAspectRatioMode(&arMode) == S_OK && arMode == VMR9ARMode_None)
-            mode = Qt::IgnoreAspectRatio;
-        control->Release();
-    }
-    return mode;
+    return m_aspectRatioMode;
 }
 
 void Vmr9VideoWindowControl::setAspectRatioMode(Qt::AspectRatioMode mode)
 {
+    m_aspectRatioMode = mode;
+
     if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>(
             m_filter, IID_IVMRWindowlessControl9)) {
         switch (mode) {
@@ -188,10 +177,15 @@ void Vmr9VideoWindowControl::setAspectRatioMode(Qt::AspectRatioMode mode)
         case Qt::KeepAspectRatio:
             control->SetAspectRatioMode(VMR9ARMode_LetterBox);
             break;
+        case Qt::KeepAspectRatioByExpanding:
+            control->SetAspectRatioMode(VMR9ARMode_LetterBox);
+            break;
         default:
             break;
         }
         control->Release();
+
+        setDisplayRect(m_displayRect);
     }
 }
 
@@ -259,6 +253,13 @@ void Vmr9VideoWindowControl::setSaturation(int saturation)
     emit saturationChanged(saturation);
 }
 
+void Vmr9VideoWindowControl::updateNativeSize() // called from DirectShowPlayerService::customEvent() on VideoOutputChange
+{
+    setDisplayRect(m_displayRect); // re-apply the cached rect; setDisplayRect() queries the native video size again
+
+    emit nativeSizeChanged();
+}
+
 void Vmr9VideoWindowControl::setProcAmpValues()
 {
     if (IVMRMixerControl9 *control = com_cast<IVMRMixerControl9>(m_filter, IID_IVMRMixerControl9)) {
diff --git a/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.h b/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.h
index bf4fb42..beac433 100644
--- a/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.h
+++ b/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.h
@@ -90,6 +90,8 @@ public:
     int saturation() const;
     void setSaturation(int saturation);
 
+    void updateNativeSize();
+
 private:
     void setProcAmpValues();
     float scaleProcAmpValue(
@@ -98,6 +100,8 @@ private:
     IBaseFilter *m_filter;
     WId m_windowId;
     DWORD m_dirtyValues;
+    Qt::AspectRatioMode m_aspectRatioMode;
+    QRect m_displayRect;
     int m_brightness;
     int m_contrast;
     int m_hue;
diff --git a/src/plugins/mediaservices/gstreamer/qgstreamervideooverlay.cpp b/src/plugins/mediaservices/gstreamer/qgstreamervideooverlay.cpp
index 427d514..f381f7f 100644
--- a/src/plugins/mediaservices/gstreamer/qgstreamervideooverlay.cpp
+++ b/src/plugins/mediaservices/gstreamer/qgstreamervideooverlay.cpp
@@ -192,21 +192,36 @@ void QGstreamerVideoOverlay::surfaceFormatChanged()
 
 void QGstreamerVideoOverlay::setScaledDisplayRect()
 {
+    QRect formatViewport = m_surface->surfaceFormat().viewport();
+
     switch (m_aspectRatioMode) {
     case Qt::KeepAspectRatio:
         {
-            QSize size = m_surface->surfaceFormat().viewport().size();
-
+            QSize size = m_surface->surfaceFormat().sizeHint();
             size.scale(m_displayRect.size(), Qt::KeepAspectRatio);
 
             QRect rect(QPoint(0, 0), size);
             rect.moveCenter(m_displayRect.center());
 
             m_surface->setDisplayRect(rect);
+            m_surface->setViewport(formatViewport);
         }
         break;
     case Qt::IgnoreAspectRatio:
         m_surface->setDisplayRect(m_displayRect);
+        m_surface->setViewport(formatViewport);
+        break;
+    case Qt::KeepAspectRatioByExpanding:
+        {
+            QSize size = m_displayRect.size();
+            size.scale(m_surface->surfaceFormat().sizeHint(), Qt::KeepAspectRatio); // display-shaped size fitted inside the format's size hint
+
+            QRect viewport(QPoint(0, 0), size);
+            viewport.moveCenter(formatViewport.center()); // centre that region on the format's viewport
+
+            m_surface->setDisplayRect(m_displayRect); // target is the full display rect
+            m_surface->setViewport(viewport); // hand the centred region to the surface as its viewport
+        }
+        break;
     };
 }
diff --git a/src/plugins/mediaservices/gstreamer/qx11videosurface.cpp b/src/plugins/mediaservices/gstreamer/qx11videosurface.cpp
index cbd5a76..70b8527 100644
--- a/src/plugins/mediaservices/gstreamer/qx11videosurface.cpp
+++ b/src/plugins/mediaservices/gstreamer/qx11videosurface.cpp
@@ -213,6 +213,16 @@ void QX11VideoSurface::setDisplayRect(const QRect &rect)
     m_displayRect = rect;
 }
 
+QRect QX11VideoSurface::viewport() const
+{
+    return m_viewport;
+}
+
+void QX11VideoSurface::setViewport(const QRect &rect)
+{
+    m_viewport = rect;
+}
+
 int QX11VideoSurface::brightness() const
 {
     return getAttribute("XV_BRIGHTNESS", m_brightnessRange.first, m_brightnessRange.second);
diff --git a/src/plugins/mediaservices/gstreamer/qx11videosurface.h b/src/plugins/mediaservices/gstreamer/qx11videosurface.h
index 1be963e..10f79a6 100644
--- a/src/plugins/mediaservices/gstreamer/qx11videosurface.h
+++ b/src/plugins/mediaservices/gstreamer/qx11videosurface.h
@@ -67,6 +67,9 @@ public:
     QRect displayRect() const;
     void setDisplayRect(const QRect &rect);
 
+    QRect viewport() const;
+    void setViewport(const QRect &rect);
+
     int brightness() const;
     void setBrightness(int brightness);
 
diff --git a/tools/qttracereplay/main.cpp b/tools/qttracereplay/main.cpp
index be7906b..101d512 100644
--- a/tools/qttracereplay/main.cpp
+++ b/tools/qttracereplay/main.cpp
@@ -49,7 +49,7 @@ class ReplayWidget : public QWidget
 {
     Q_OBJECT
 public:
-    ReplayWidget(const QString &filename, int from, int to, bool single);
+    ReplayWidget(const QString &filename, int from, int to, bool single, int frame);
 
     void paintEvent(QPaintEvent *event);
     void resizeEvent(QResizeEvent *event);
@@ -66,27 +66,96 @@ public:
     QTime timer;
 
     QList<uint> visibleUpdates;
-    QList<uint> iterationTimes;
+
+    QVector<uint> iterationTimes;
     QString filename;
 
     int from;
     int to;
 
     bool single;
+
+    int frame;
+    int currentCommand;
 };
 
 void ReplayWidget::updateRect()
 {
-    if (!visibleUpdates.isEmpty())
+    if (frame >= 0 && !updates.isEmpty())
+        update(updates.at(frame));
+    else if (!visibleUpdates.isEmpty())
         update(updates.at(visibleUpdates.at(currentFrame)));
 }
 
+const int singleFrameRepeatsPerCommand = 100;
+const int singleFrameIterations = 4;
+
 void ReplayWidget::paintEvent(QPaintEvent *)
 {
     QPainter p(this);
 
+    QTimer::singleShot(0, this, SLOT(updateRect()));
+
 //    p.setClipRegion(frames.at(currentFrame).updateRegion);
 
+    if (frame >= 0) {
+        int start = buffer.frameStartIndex(frame);
+        int end = buffer.frameEndIndex(frame);
+
+        iterationTimes.resize(end - start);
+
+        int saveRestoreStackDepth = buffer.processCommands(&p, start, start + currentCommand);
+
+        for (int i = 0; i < saveRestoreStackDepth; ++i)
+            p.restore();
+
+        const int repeats = currentIteration >= 3 ? singleFrameRepeatsPerCommand : 1;
+
+        ++currentFrame;
+        if (currentFrame == repeats) {
+            currentFrame = 0;
+            if (currentIteration >= 3) {
+                iterationTimes[currentCommand - 1] = qMin(iterationTimes[currentCommand - 1], uint(timer.elapsed()));
+                timer.restart();
+            }
+
+            if (currentIteration >= singleFrameIterations + 3) {
+                printf(" #    | ms      | description\n");
+                printf("------+---------+------------------------------------------------------------\n");
+
+		qSort(iterationTimes);
+
+		int sum = 0;
+                for (int i = 0; i < iterationTimes.size(); ++i) {
+                    int delta = iterationTimes.at(i);
+                    if (i > 0)
+                        delta -= iterationTimes.at(i-1);
+		    sum += delta;
+                    qreal deltaF = delta / qreal(repeats);
+                    printf("%.5d | %.5f | %s\n", i, deltaF, qPrintable(buffer.commandDescription(start + i)));
+                }
+                printf("Total | %.5f | Total frame time\n", sum / qreal(repeats));
+                deleteLater();
+                return;
+            }
+
+            if (start + currentCommand >= end) {
+                currentCommand = 1;
+		++currentIteration;
+                if (currentIteration == 3) {
+                    timer.start();
+                    iterationTimes.fill(uint(-1));
+                }
+		if (currentIteration >= 3 && currentIteration < singleFrameIterations + 3)
+                    printf("Profiling iteration %d of %d\n", currentIteration - 2, singleFrameIterations);
+            } else {
+                ++currentCommand;
+	    }
+        }
+
+        return;
+    }
+
     buffer.draw(&p, visibleUpdates.at(currentFrame));
 
     ++currentFrame;
@@ -138,11 +207,9 @@ void ReplayWidget::paintEvent(QPaintEvent *)
             }
         }
     }
-
-    QTimer::singleShot(0, this, SLOT(updateRect()));
 }
 
-void ReplayWidget::resizeEvent(QResizeEvent *event)
+void ReplayWidget::resizeEvent(QResizeEvent *)
 {
     visibleUpdates.clear();
 
@@ -162,13 +229,15 @@ void ReplayWidget::resizeEvent(QResizeEvent *event)
 
 }
 
-ReplayWidget::ReplayWidget(const QString &filename_, int from_, int to_, bool single_)
+ReplayWidget::ReplayWidget(const QString &filename_, int from_, int to_, bool single_, int frame_)
     : currentFrame(0)
     , currentIteration(0)
     , filename(filename_)
     , from(from_)
     , to(to_)
     , single(single_)
+    , frame(frame_)
+    , currentCommand(1)
 {
     setWindowTitle(filename);
     QFile file(filename);
@@ -216,7 +285,8 @@ int main(int argc, char **argv)
         printf("Usage:\n  > %s [OPTIONS] [traceFile]\n", argv[0]);
         printf("OPTIONS\n"
                "   --range=from-to to specify a frame range.\n"
-               "   --singlerun to do only one run (without statistics)\n");
+               "   --singlerun to do only one run (without statistics)\n"
+               "   --instrumentframe=frame to instrument a single frame\n");
         return 1;
     }
 
@@ -228,6 +298,8 @@ int main(int argc, char **argv)
 
     bool single = false;
 
+    int frame = -1;
+
     int from = 0;
     int to = -1;
     for (int i = 1; i < app.arguments().size() - 1; ++i) {
@@ -253,13 +325,22 @@ int main(int argc, char **argv)
             }
         } else if (arg == QLatin1String("--singlerun")) {
             single = true;
+        } else if (arg.startsWith(QLatin1String("--instrumentframe="))) {
+            QString rest = arg.mid(18);
+            bool ok = false;
+            int frameCandidate = rest.toInt(&ok);
+            if (ok) {
+                frame = frameCandidate;
+            } else {
+                printf("ERROR: malformed syntax in argument %s\n", qPrintable(arg));
+            }
         } else {
             printf("Unrecognized argument: %s\n", qPrintable(arg));
             return 1;
         }
     }
 
-    ReplayWidget *widget = new ReplayWidget(app.arguments().last(), from, to, single);
+    ReplayWidget *widget = new ReplayWidget(app.arguments().last(), from, to, single, frame);
 
     if (!widget->updates.isEmpty()) {
         widget->show();
author	Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@nokia.com>	2010-03-26 10:47:01 (GMT)
committer	Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@nokia.com>	2010-03-26 10:47:01 (GMT)
commit	e7eb7bdf63791ed03257f2f23b1f515e4d89e054 (patch)
tree	1d580cea9ffbf342a029c73bd2cecc106811ff22
parent	47472906fd00e0eff820870330d481c4229ee285 (diff)
parent	41e9adb44137c8839d0d7e131802de198b0e7168 (diff)
download	Qt-e7eb7bdf63791ed03257f2f23b1f515e4d89e054.zip Qt-e7eb7bdf63791ed03257f2f23b1f515e4d89e054.tar.gz Qt-e7eb7bdf63791ed03257f2f23b1f515e4d89e054.tar.bz2