diff options
author | Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@nokia.com> | 2010-03-26 10:47:01 (GMT) |
---|---|---|
committer | Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@nokia.com> | 2010-03-26 10:47:01 (GMT) |
commit | e7eb7bdf63791ed03257f2f23b1f515e4d89e054 (patch) | |
tree | 1d580cea9ffbf342a029c73bd2cecc106811ff22 | |
parent | 47472906fd00e0eff820870330d481c4229ee285 (diff) | |
parent | 41e9adb44137c8839d0d7e131802de198b0e7168 (diff) | |
download | Qt-e7eb7bdf63791ed03257f2f23b1f515e4d89e054.zip Qt-e7eb7bdf63791ed03257f2f23b1f515e4d89e054.tar.gz Qt-e7eb7bdf63791ed03257f2f23b1f515e4d89e054.tar.bz2 |
Merge branch '4.7' of git@scm.dev.nokia.troll.no:qt/oslo-staging-2 into 4.7
29 files changed, 4385 insertions, 541 deletions
diff --git a/doc/src/legal/3rdparty.qdoc b/doc/src/legal/3rdparty.qdoc index d608038..8d0cd2a 100644 --- a/doc/src/legal/3rdparty.qdoc +++ b/doc/src/legal/3rdparty.qdoc @@ -454,4 +454,43 @@ OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. See \c src/3rdparty/webkit/JavaScriptCore/wtf/dtoa.cpp for license details. + + \section1 Pixman (\c pixman) version 0.17.11 + + \e{pixman is a library that provides low-level pixel manipulation + features such as image compositing and trapezoid rasterization.} -- quoted + from \c src/3rdparty/pixman/README + + We are only using the pixman-arm-neon-asm.h and pixman-arm-neon-asm.S + source files which have the following copyright and license header: + + \hr + + Copyright © 2009 Nokia Corporation + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice (including the next + paragraph) shall be included in all copies or substantial portions of the + Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + + Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + + \hr + + See \c src/3rdparty/pixman/pixman-arm-neon-asm.h and + \c src/3rdparty/pixman/pixman-arm-neon-asm.S */ diff --git a/src/3rdparty/pixman/README b/src/3rdparty/pixman/README new file mode 100644 index 0000000..843b069 --- /dev/null +++ b/src/3rdparty/pixman/README @@ -0,0 +1,26 @@ +pixman is a library that provides low-level pixel manipulation +features such as image compositing and trapezoid rasterization. + +Please submit bugs & patches to the libpixman bugzilla: + + https://bugs.freedesktop.org/enter_bug.cgi?product=pixman + +All questions regarding this software should be directed to either the +Xorg mailing list: + + http://lists.freedesktop.org/mailman/listinfo/xorg + +or the cairo mailing list: + + http://lists.freedesktop.org/mailman/listinfo/cairo + +The master development code repository can be found at: + + git://anongit.freedesktop.org/git/pixman + + http://gitweb.freedesktop.org/?p=pixman;a=summary + +For more information on the git code manager, see: + + http://wiki.x.org/wiki/GitPage + diff --git a/src/3rdparty/pixman/pixman-arm-neon-asm.S b/src/3rdparty/pixman/pixman-arm-neon-asm.S new file mode 100644 index 0000000..eb8cc4c --- /dev/null +++ b/src/3rdparty/pixman/pixman-arm-neon-asm.S @@ -0,0 +1,1709 @@ +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * This file contains implementations of NEON optimized pixel processing + * functions. There is no full and detailed tutorial, but some functions + * (those which are exposing some new or interesting features) are + * extensively commented and can be used as examples. + * + * You may want to have a look at the comments for following functions: + * - pixman_composite_over_8888_0565_asm_neon + * - pixman_composite_over_n_8_0565_asm_neon + */ + +/* Prevent the stack from becoming executable for no reason... */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .fpu neon + .arch armv7a + .altmacro + +#include "pixman-arm-neon-asm.h" + +/* Global configuration options and preferences */ + +/* + * The code can optionally make use of unaligned memory accesses to improve + * performance of handling leading/trailing pixels for each scanline. + * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for + * example in linux if unaligned memory accesses are not configured to + * generate.exceptions. + */ +.set RESPECT_STRICT_ALIGNMENT, 1 + +/* + * Set default prefetch type. There is a choice between the following options: + * + * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work + * as NOP to workaround some HW bugs or for whatever other reason) + * + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where + * advanced prefetch intruduces heavy overhead) + * + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 + * which can run ARM and NEON instructions simultaneously so that extra ARM + * instructions do not add (many) extra cycles, but improve prefetch efficiency) + * + * Note: some types of function can't support advanced prefetch and fallback + * to simple one (those which handle 24bpp pixels) + */ +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED + +/* Prefetch distance in pixels for simple prefetch */ +.set PREFETCH_DISTANCE_SIMPLE, 64 + +/* + * Implementation of pixman_composite_over_8888_0565_asm_neon + * + * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and + * performs OVER compositing operation. Function fast_composite_over_8888_0565 + * from pixman-fast-path.c does the same in C and can be used as a reference. + * + * First we need to have some NEON assembly code which can do the actual + * operation on the pixels and provide it to the template macro. + * + * Template macro quite conveniently takes care of emitting all the necessary + * code for memory reading and writing (including quite tricky cases of + * handling unaligned leading/trailing pixels), so we only need to deal with + * the data in NEON registers. + * + * NEON registers allocation in general is recommented to be the following: + * d0, d1, d2, d3 - contain loaded source pixel data + * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) + * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used) + * d28, d29, d30, d31 - place for storing the result (destination pixels) + * + * As can be seen above, four 64-bit NEON registers are used for keeping + * intermediate pixel data and up to 8 pixels can be processed in one step + * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). + * + * This particular function uses the following registers allocation: + * d0, d1, d2, d3 - contain loaded source pixel data + * d4, d5 - contain loaded destination pixels (they are needed) + * d28, d29 - place for storing the result (destination pixels) + */ + +/* + * Step one. We need to have some code to do some arithmetics on pixel data. + * This is implemented as a pair of macros: '*_head' and '*_tail'. When used + * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, + * perform all the needed calculations and write the result to {d28, d29}. + * The rationale for having two macros and not just one will be explained + * later. In practice, any single monolitic function which does the work can + * be split into two parts in any arbitrary way without affecting correctness. + * + * There is one special trick here too. Common template macro can optionally + * make our life a bit easier by doing R, G, B, A color components + * deinterleaving for 32bpp pixel formats (and this feature is used in + * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that + * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we + * actually use d0 register for blue channel (a vector of eight 8-bit + * values), d1 register for green, d2 for red and d3 for alpha. This + * simple conversion can be also done with a few NEON instructions: + * + * Packed to planar conversion: + * vuzp.8 d0, d1 + * vuzp.8 d2, d3 + * vuzp.8 d1, d3 + * vuzp.8 d0, d2 + * + * Planar to packed conversion: + * vzip.8 d0, d2 + * vzip.8 d1, d3 + * vzip.8 d2, d3 + * vzip.8 d0, d1 + * + * But pixel can be loaded directly in planar format using VLD4.8 NEON + * instruction. It is 1 cycle slower than VLD1.32, so this is not always + * desirable, that's why deinterleaving is optional. + * + * But anyway, here is the code: + */ +.macro pixman_composite_over_8888_0565_process_pixblock_head + /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format + and put data into d6 - red, d7 - green, d30 - blue */ + vshrn.u16 d6, q2, #8 + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vsri.u8 d6, d6, #5 + vmvn.8 d3, d3 /* invert source alpha */ + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + /* now do alpha blending, storing results in 8-bit planar format + into d16 - red, d19 - green, d18 - blue */ + vmull.u8 q10, d3, d6 + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + vrshr.u16 q13, q10, #8 + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + vraddhn.u16 d22, q12, q15 +.endm + +.macro pixman_composite_over_8888_0565_process_pixblock_tail + /* ... continue alpha blending */ + vqadd.u8 d16, d2, d20 + vqadd.u8 q9, q0, q11 + /* convert the result to r5g6b5 and store it into {d28, d29} */ + vshll.u8 q14, d16, #8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +/* + * OK, now we got almost everything that we need. Using the above two + * macros, the work can be done right. But now we want to optimize + * it a bit. ARM Cortex-A8 is an in-order core, and benefits really + * a lot from good code scheduling and software pipelining. + * + * Let's construct some code, which will run in the core main loop. + * Some pseudo-code of the main loop will look like this: + * head + * while (...) { + * tail + * head + * } + * tail + * + * It may look a bit weird, but this setup allows to hide instruction + * latencies better and also utilize dual-issue capability more + * efficiently (make pairs of load-store and ALU instructions). + * + * So what we need now is a '*_tail_head' macro, which will be used + * in the core main loop. A trivial straightforward implementation + * of this macro would look like this: + * + * pixman_composite_over_8888_0565_process_pixblock_tail + * vst1.16 {d28, d29}, [DST_W, :128]! + * vld1.16 {d4, d5}, [DST_R, :128]! + * vld4.32 {d0, d1, d2, d3}, [SRC]! + * pixman_composite_over_8888_0565_process_pixblock_head + * cache_preload 8, 8 + * + * Now it also got some VLD/VST instructions. We simply can't move from + * processing one block of pixels to the other one with just arithmetics. + * The previously processed data needs to be written to memory and new + * data needs to be fetched. Fortunately, this main loop does not deal + * with partial leading/trailing pixels and can load/store a full block + * of pixels in a bulk. Additionally, destination buffer is already + * 16 bytes aligned here (which is good for performance). + * + * New things here are DST_R, DST_W, SRC and MASK identifiers. These + * are the aliases for ARM registers which are used as pointers for + * accessing data. We maintain separate pointers for reading and writing + * destination buffer (DST_R and DST_W). + * + * Another new thing is 'cache_preload' macro. It is used for prefetching + * data into CPU L2 cache and improve performance when dealing with large + * images which are far larger than cache size. It uses one argument + * (actually two, but they need to be the same here) - number of pixels + * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some + * details about this macro. Moreover, if good performance is needed + * the code from this macro needs to be copied into '*_tail_head' macro + * and mixed with the rest of code for optimal instructions scheduling. + * We are actually doing it below. + * + * Now after all the explanations, here is the optimized code. + * Different instruction streams (originaling from '*_head', '*_tail' + * and 'cache_preload' macro) use different indentation levels for + * better readability. Actually taking the code from one of these + * indentation levels and ignoring a few VLD/VST instructions would + * result in exactly the code from '*_head', '*_tail' or 'cache_preload' + * macro! + */ + +#if 1 + +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head + vqadd.u8 d16, d2, d20 + vld1.16 {d4, d5}, [DST_R, :128]! + vqadd.u8 q9, q0, q11 + vshrn.u16 d6, q2, #8 + vld4.8 {d0, d1, d2, d3}, [SRC]! + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vshll.u8 q14, d16, #8 + PF add PF_X, PF_X, #8 + vshll.u8 q8, d19, #8 + PF tst PF_CTL, #0xF + vsri.u8 d6, d6, #5 + PF addne PF_X, PF_X, #8 + vmvn.8 d3, d3 + PF subne PF_CTL, PF_CTL, #1 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + vmull.u8 q10, d3, d6 + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vsri.u16 q14, q8, #5 + PF cmp PF_X, ORIG_W + vshll.u8 q9, d18, #8 + vrshr.u16 q13, q10, #8 + PF subge PF_X, PF_X, ORIG_W + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + PF subges PF_CTL, PF_CTL, #0x10 + vsri.u16 q14, q9, #11 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vraddhn.u16 d22, q12, q15 + vst1.16 {d28, d29}, [DST_W, :128]! +.endm + +#else + +/* If we did not care much about the performance, we would just use this... */ +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head + pixman_composite_over_8888_0565_process_pixblock_tail + vst1.16 {d28, d29}, [DST_W, :128]! + vld1.16 {d4, d5}, [DST_R, :128]! + vld4.32 {d0, d1, d2, d3}, [SRC]! + pixman_composite_over_8888_0565_process_pixblock_head + cache_preload 8, 8 +.endm + +#endif + +/* + * And now the final part. We are using 'generate_composite_function' macro + * to put all the stuff together. We are specifying the name of the function + * which we want to get, number of bits per pixel for the source, mask and + * destination (0 if unused, like mask in this case). Next come some bit + * flags: + * FLAG_DST_READWRITE - tells that the destination buffer is both read + * and written, for write-only buffer we would use + * FLAG_DST_WRITEONLY flag instead + * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data + * and separate color channels for 32bpp format. + * The next things are: + * - the number of pixels processed per iteration (8 in this case, because + * that's the maximum what can fit into four 64-bit NEON registers). + * - prefetch distance, measured in pixel blocks. In this case it is 5 times + * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal + * prefetch distance can be selected by running some benchmarks. + * + * After that we specify some macros, these are 'default_init', + * 'default_cleanup' here which are empty (but it is possible to have custom + * init/cleanup macros to be able to save/restore some extra NEON registers + * like d8-d15 or do anything else) followed by + * 'pixman_composite_over_8888_0565_process_pixblock_head', + * 'pixman_composite_over_8888_0565_process_pixblock_tail' and + * 'pixman_composite_over_8888_0565_process_pixblock_tail_head' + * which we got implemented above. + * + * The last part is the NEON registers allocation scheme. + */ +generate_composite_function \ + pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_0565_process_pixblock_head, \ + pixman_composite_over_8888_0565_process_pixblock_tail, \ + pixman_composite_over_8888_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_n_0565_process_pixblock_head + /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format + and put data into d6 - red, d7 - green, d30 - blue */ + vshrn.u16 d6, q2, #8 + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vsri.u8 d6, d6, #5 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + /* now do alpha blending, storing results in 8-bit planar format + into d16 - red, d19 - green, d18 - blue */ + vmull.u8 q10, d3, d6 + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + vrshr.u16 q13, q10, #8 + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + vraddhn.u16 d22, q12, q15 +.endm + +.macro pixman_composite_over_n_0565_process_pixblock_tail + /* ... continue alpha blending */ + vqadd.u8 d16, d2, d20 + vqadd.u8 q9, q0, q11 + /* convert the result to r5g6b5 and store it into {d28, d29} */ + vshll.u8 q14, d16, #8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_0565_process_pixblock_tail_head + pixman_composite_over_n_0565_process_pixblock_tail + vld1.16 {d4, d5}, [DST_R, :128]! + vst1.16 {d28, d29}, [DST_W, :128]! + pixman_composite_over_n_0565_process_pixblock_head +.endm + +.macro pixman_composite_over_n_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] + vmvn.8 d3, d3 /* invert source alpha */ +.endm + +generate_composite_function \ + pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_0565_init, \ + default_cleanup, \ + pixman_composite_over_n_0565_process_pixblock_head, \ + pixman_composite_over_n_0565_process_pixblock_tail, \ + pixman_composite_over_n_0565_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_0565_process_pixblock_head + vshll.u8 q8, d1, #8 + vshll.u8 q14, d2, #8 + vshll.u8 q9, d0, #8 +.endm + +.macro pixman_composite_src_8888_0565_process_pixblock_tail + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +.macro pixman_composite_src_8888_0565_process_pixblock_tail_head + vsri.u16 q14, q8, #5 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vld4.8 {d0, d1, d2, d3}, [SRC]! + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vsri.u16 q14, q9, #11 + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vshll.u8 q8, d1, #8 + vst1.16 {d28, d29}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vshll.u8 q14, d2, #8 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vshll.u8 q9, d0, #8 +.endm + +generate_composite_function \ + pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_0565_process_pixblock_head, \ + pixman_composite_src_8888_0565_process_pixblock_tail, \ + pixman_composite_src_8888_0565_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_src_0565_8888_process_pixblock_head + vshrn.u16 d30, q0, #8 + vshrn.u16 d29, q0, #3 + vsli.u16 q0, q0, #5 + vmov.u8 d31, #255 + vsri.u8 d30, d30, #5 + vsri.u8 d29, d29, #6 + vshrn.u16 d28, q0, #2 +.endm + +.macro pixman_composite_src_0565_8888_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_src_0565_8888_process_pixblock_tail_head + pixman_composite_src_0565_8888_process_pixblock_tail + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld1.16 {d0, d1}, [SRC]! + pixman_composite_src_0565_8888_process_pixblock_head + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_8888_process_pixblock_head, \ + pixman_composite_src_0565_8888_process_pixblock_tail, \ + pixman_composite_src_0565_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8000_8000_process_pixblock_head + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_8000_8000_process_pixblock_tail +.endm + +.macro pixman_composite_add_8000_8000_process_pixblock_tail_head + vld1.8 {d0, d1, d2, d3}, [SRC]! + PF add PF_X, PF_X, #32 + PF tst PF_CTL, #0xF + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + PF addne PF_X, PF_X, #32 + PF subne PF_CTL, PF_CTL, #1 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q15, q1, q3 +.endm + +generate_composite_function \ + pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8000_8000_process_pixblock_head, \ + pixman_composite_add_8000_8000_process_pixblock_tail, \ + pixman_composite_add_8000_8000_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_process_pixblock_tail_head + vld1.8 {d0, d1, d2, d3}, [SRC]! + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + vqadd.u8 q14, q0, q2 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vqadd.u8 q15, q1, q3 +.endm + +generate_composite_function \ + pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8000_8000_process_pixblock_head, \ + pixman_composite_add_8000_8000_process_pixblock_tail, \ + pixman_composite_add_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8000_8000_process_pixblock_head, \ + pixman_composite_add_8000_8000_process_pixblock_tail, \ + pixman_composite_add_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_8888_8888_process_pixblock_head + vmvn.8 d24, d3 /* get inverted alpha */ + /* do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +.macro pixman_composite_over_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + vld4.8 {d0, d1, d2, d3}, [SRC]! + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8888_process_pixblock_tail_head + pixman_composite_over_8888_8888_process_pixblock_tail + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + pixman_composite_over_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d3[0]}, [DUMMY] + vdup.8 d0, d3[0] + vdup.8 d1, d3[1] + vdup.8 d2, d3[2] + vdup.8 d3, d3[3] +.endm + +generate_composite_function \ + pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8888_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + PF cmp PF_X, ORIG_W + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! + vmvn.8 d22, d3 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF subge PF_X, PF_X, ORIG_W + vmull.u8 q8, d22, d4 + PF subges PF_CTL, PF_CTL, #0x10 + vmull.u8 q9, d22, d5 + vmull.u8 q10, d22, d6 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vmull.u8 q11, d22, d7 +.endm + +.macro pixman_composite_over_reverse_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d7[0]}, [DUMMY] + vdup.8 d4, d7[0] + vdup.8 d5, d7[1] + vdup.8 d6, d7[2] + vdup.8 d7, d7[3] +.endm + +generate_composite_function \ + pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_reverse_n_8888_init, \ + default_cleanup, \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 4, /* src_basereg */ \ + 24 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_0565_process_pixblock_head + /* in */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d24, d9 + vmull.u8 q6, d24, d10 + vmull.u8 q7, d24, d11 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + + vshrn.u16 d6, q2, #8 + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vsri.u8 d6, d6, #5 + vmvn.8 d3, d3 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + /* now do alpha blending */ + vmull.u8 q10, d3, d6 + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + vrshr.u16 q13, q10, #8 + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + vraddhn.u16 d22, q12, q15 +.endm + +.macro pixman_composite_over_n_8_0565_process_pixblock_tail + vqadd.u8 d16, d2, d20 + vqadd.u8 q9, q0, q11 + /* convert to r5g6b5 */ + vshll.u8 q14, d16, #8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head + pixman_composite_over_n_8_0565_process_pixblock_tail + vst1.16 {d28, d29}, [DST_W, :128]! + vld1.16 {d4, d5}, [DST_R, :128]! + vld1.8 {d24}, [MASK]! + cache_preload 8, 8 + pixman_composite_over_n_8_0565_process_pixblock_head +.endm + +/* + * This function needs a special initialization of solid mask. + * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET + * offset, split into color components and replicated in d8-d11 + * registers. Additionally, this function needs all the NEON registers, + * so it has to save d8-d15 registers which are callee saved according + * to ABI. These registers are restored from 'cleanup' macro. All the + * other NEON registers are caller saved, so can be clobbered freely + * without introducing any problems. + */ +.macro pixman_composite_over_n_8_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8_0565_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ + FLAG_DST_READWRITE, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_0565_init, \ + pixman_composite_over_n_8_0565_cleanup, \ + pixman_composite_over_n_8_0565_process_pixblock_head, \ + pixman_composite_over_n_8_0565_process_pixblock_tail, \ + pixman_composite_over_n_8_0565_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_src_0565_0565_process_pixblock_head +.endm + +.macro pixman_composite_src_0565_0565_process_pixblock_tail +.endm + +.macro pixman_composite_src_0565_0565_process_pixblock_tail_head + vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! + vld1.16 {d0, d1, d2, d3}, [SRC]! + cache_preload 16, 16 +.endm + +generate_composite_function \ + pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 16, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0565_0565_process_pixblock_head, \ + pixman_composite_src_0565_0565_process_pixblock_tail, \ + pixman_composite_src_0565_0565_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8_process_pixblock_tail_head + vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #8 + vsli.u64 d0, d0, #16 + vsli.u64 d0, d0, #32 + vmov d1, d0 + vmov q1, q0 +.endm + +.macro pixman_composite_src_n_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ + FLAG_DST_WRITEONLY, \ + 32, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_8_init, \ + pixman_composite_src_n_8_cleanup, \ + pixman_composite_src_n_8_process_pixblock_head, \ + pixman_composite_src_n_8_process_pixblock_tail, \ + pixman_composite_src_n_8_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_0565_process_pixblock_head +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_0565_process_pixblock_tail_head + vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_0565_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #16 + vsli.u64 d0, d0, #32 + vmov d1, d0 + vmov q1, q0 +.endm + +.macro pixman_composite_src_n_0565_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 16, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_0565_init, \ + pixman_composite_src_n_0565_cleanup, \ + pixman_composite_src_n_0565_process_pixblock_head, \ + pixman_composite_src_n_0565_process_pixblock_tail, \ + pixman_composite_src_n_0565_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_n_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_n_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! +.endm + +.macro pixman_composite_src_n_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vld1.32 {d0[0]}, [DUMMY] + vsli.u64 d0, d0, #32 + vmov d1, d0 + vmov q1, q0 +.endm + +.macro pixman_composite_src_n_8888_cleanup +.endm + +generate_composite_function \ + pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 0, /* prefetch distance */ \ + pixman_composite_src_n_8888_init, \ + pixman_composite_src_n_8888_cleanup, \ + pixman_composite_src_n_8888_process_pixblock_head, \ + pixman_composite_src_n_8888_process_pixblock_tail, \ + pixman_composite_src_n_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_8888_8888_process_pixblock_head +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_8888_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! + vld1.32 {d0, d1, d2, d3}, [SRC]! + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_8888_8888_process_pixblock_head, \ + pixman_composite_src_8888_8888_process_pixblock_tail, \ + pixman_composite_src_8888_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_x888_8888_process_pixblock_head + vorr q0, q0, q2 + vorr q1, q1, q2 +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail +.endm + +.macro pixman_composite_src_x888_8888_process_pixblock_tail_head + vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! + vld1.32 {d0, d1, d2, d3}, [SRC]! + vorr q0, q0, q2 + vorr q1, q1, q2 + cache_preload 8, 8 +.endm + +.macro pixman_composite_src_x888_8888_init + vmov.u8 q2, #0xFF + vshl.u32 q2, q2, #24 +.endm + +generate_composite_function \ + pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + pixman_composite_src_x888_8888_init, \ + default_cleanup, \ + pixman_composite_src_x888_8888_process_pixblock_head, \ + pixman_composite_src_x888_8888_process_pixblock_tail, \ + pixman_composite_src_x888_8888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_over_n_8_8888_process_pixblock_head + /* expecting deinterleaved source data in {d8, d9, d10, d11} */ + /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ + /* and destination data in {d4, d5, d6, d7} */ + /* mask is in d24 (d25, d26, d27 are unused) */ + + /* in */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d24, d9 + vmull.u8 q6, d24, d10 + vmull.u8 q7, d24, d11 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vmvn.8 d24, d3 /* get inverted alpha */ + /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ + /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_over_n_8_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head + pixman_composite_over_n_8_8888_process_pixblock_tail + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vld1.8 {d24}, [MASK]! + cache_preload 8, 8 + pixman_composite_over_n_8_8888_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8888_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8_8888_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_8888_init, \ + pixman_composite_over_n_8_8888_cleanup, \ + pixman_composite_over_n_8_8888_process_pixblock_head, \ + pixman_composite_over_n_8_8888_process_pixblock_tail, \ + pixman_composite_over_n_8_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head + /* + * 'combine_mask_ca' replacement + * + * input: solid src (n) in {d8, d9, d10, d11} + * dest in {d4, d5, d6, d7 } + * mask in {d24, d25, d26, d27} + * output: updated src in {d0, d1, d2, d3 } + * updated mask in {d24, d25, d26, d3 } + */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d25, d9 + vmull.u8 q6, d26, d10 + vmull.u8 q7, d27, d11 + vmull.u8 q9, d11, d25 + vmull.u8 q12, d11, d24 + vmull.u8 q13, d11, d26 + vrshr.u16 q8, q0, #8 + vrshr.u16 q10, q1, #8 + vrshr.u16 q11, q6, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q10 + vraddhn.u16 d2, q6, q11 + vrshr.u16 q11, q12, #8 + vrshr.u16 q8, q9, #8 + vrshr.u16 q6, q13, #8 + vrshr.u16 q10, q7, #8 + vraddhn.u16 d24, q12, q11 + vraddhn.u16 d25, q9, q8 + vraddhn.u16 d26, q13, q6 + vraddhn.u16 d3, q7, q10 + /* + * 'combine_over_ca' replacement + * + * output: updated dest in {d28, d29, d30, d31} + */ + vmvn.8 d24, d24 + vmvn.8 d25, d25 + vmull.u8 q8, d24, d4 + vmull.u8 q9, d25, d5 + vmvn.8 d26, d26 + vmvn.8 d27, d3 + vmull.u8 q10, d26, d6 + vmull.u8 q11, d27, d7 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail + /* ... continue 'combine_over_ca' replacement */ + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q6, q10, #8 + vrshr.u16 q7, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q6, q10 + vraddhn.u16 d31, q7, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vrshr.u16 q6, q10, #8 + vrshr.u16 q7, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q6, q10 + vraddhn.u16 d31, q7, q11 + vld4.8 {d24, d25, d26, d27}, [MASK]! + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 + cache_preload 8, 8 + pixman_composite_over_n_8888_8888_ca_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +.macro pixman_composite_over_n_8888_8888_ca_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d8, d11[0] + vdup.8 d9, d11[1] + vdup.8 d10, d11[2] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_over_n_8888_8888_ca_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8888_8888_ca_init, \ + pixman_composite_over_n_8888_8888_ca_cleanup, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ + pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_n_8_8_process_pixblock_head + /* expecting source data in {d8, d9, d10, d11} */ + /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ + /* and destination data in {d4, d5, d6, d7} */ + /* mask is in d24, d25, d26, d27 */ + vmull.u8 q0, d24, d11 + vmull.u8 q1, d25, d11 + vmull.u8 q6, d26, d11 + vmull.u8 q7, d27, d11 + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_n_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_n_8_8_process_pixblock_tail_head + pixman_composite_add_n_8_8_process_pixblock_tail + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + vld1.8 {d24, d25, d26, d27}, [MASK]! + cache_preload 32, 32 + pixman_composite_add_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_n_8_8_init + add DUMMY, sp, #ARGS_STACK_OFFSET + vpush {d8-d15} + vld1.32 {d11[0]}, [DUMMY] + vdup.8 d11, d11[3] +.endm + +.macro pixman_composite_add_n_8_8_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_n_8_8_init, \ + pixman_composite_add_n_8_8_cleanup, \ + pixman_composite_add_n_8_8_process_pixblock_head, \ + pixman_composite_add_n_8_8_process_pixblock_tail, \ + pixman_composite_add_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8_8_8_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* mask in {d24, d25, d26, d27} */ + vmull.u8 q8, d24, d0 + vmull.u8 q9, d25, d1 + vmull.u8 q10, d26, d2 + vmull.u8 q11, d27, d3 + vrshr.u16 q0, q8, #8 + vrshr.u16 q1, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q12, q10 + vraddhn.u16 d3, q13, q11 + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_8_8_8_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_8_8_8_process_pixblock_tail_head + pixman_composite_add_8_8_8_process_pixblock_tail + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! + vld1.8 {d24, d25, d26, d27}, [MASK]! + vld1.8 {d0, d1, d2, d3}, [SRC]! + cache_preload 32, 32 + pixman_composite_add_8_8_8_process_pixblock_head +.endm + +.macro pixman_composite_add_8_8_8_init +.endm + +.macro pixman_composite_add_8_8_8_cleanup +.endm + +generate_composite_function \ + pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_add_8_8_8_init, \ + pixman_composite_add_8_8_8_cleanup, \ + pixman_composite_add_8_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_8_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* mask in {d24, d25, d26, d27} */ + vmull.u8 q8, d27, d0 + vmull.u8 q9, d27, d1 + vmull.u8 q10, d27, d2 + vmull.u8 q11, d27, d3 + vrshr.u16 q0, q8, #8 + vrshr.u16 q1, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d0, q0, q8 + vraddhn.u16 d1, q1, q9 + vraddhn.u16 d2, q12, q10 + vraddhn.u16 d3, q13, q11 + vqadd.u8 q14, q0, q2 + vqadd.u8 q15, q1, q3 +.endm + +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + pixman_composite_add_8888_8888_8888_process_pixblock_tail + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + vld4.8 {d24, d25, d26, d27}, [MASK]! + vld4.8 {d0, d1, d2, d3}, [SRC]! + cache_preload 8, 8 + pixman_composite_add_8888_8888_8888_process_pixblock_head +.endm + +generate_composite_function \ + pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +generate_composite_function_single_scanline \ + pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init, \ + default_cleanup, \ + pixman_composite_add_8888_8888_8888_process_pixblock_head, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ + pixman_composite_add_8888_8888_8888_process_pixblock_tail_head + +/******************************************************************************/ + +.macro pixman_composite_over_8888_n_8888_process_pixblock_head + /* expecting source data in {d0, d1, d2, d3} */ + /* destination data in {d4, d5, d6, d7} */ + /* solid mask is in d15 */ + + /* 'in' */ + vmull.u8 q8, d15, d3 + vmull.u8 q6, d15, d2 + vmull.u8 q5, d15, d1 + vmull.u8 q4, d15, d0 + vrshr.u16 q13, q8, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q11, q5, #8 + vrshr.u16 q10, q4, #8 + vraddhn.u16 d3, q8, q13 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d1, q5, q11 + vraddhn.u16 d0, q4, q10 + vmvn.8 d24, d3 /* get inverted alpha */ + /* now do alpha blending */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d24, d5 + vmull.u8 q10, d24, d6 + vmull.u8 q11, d24, d7 +.endm + +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 +.endm + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + vld4.8 {d0, d1, d2, d3}, [SRC]! + cache_preload 8, 8 + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +.macro pixman_composite_over_8888_n_8888_init + add DUMMY, sp, #48 + vpush {d8-d15} + vld1.32 {d15[0]}, [DUMMY] + vdup.8 d15, d15[3] +.endm + +.macro pixman_composite_over_8888_n_8888_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_8888_n_8888_init, \ + pixman_composite_over_8888_n_8888_cleanup, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail_head + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + vld4.8 {d0, d1, d2, d3}, [SRC]! + cache_preload 8, 8 + vld4.8 {d12, d13, d14, d15}, [MASK]! + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +.macro pixman_composite_over_8888_8888_8888_init + vpush {d8-d15} +.endm + +.macro pixman_composite_over_8888_8888_8888_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_8888_8888_8888_init, \ + pixman_composite_over_8888_8888_8888_cleanup, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + +generate_composite_function_single_scanline \ + pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + pixman_composite_over_8888_8888_8888_init, \ + pixman_composite_over_8888_8888_8888_cleanup, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + +/******************************************************************************/ + +/* TODO: expand macros and do better instructions scheduling */ +.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head + vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! + pixman_composite_over_8888_n_8888_process_pixblock_tail + vld4.8 {d0, d1, d2, d3}, [SRC]! + cache_preload 8, 8 + vld1.8 {d15}, [MASK]! + pixman_composite_over_8888_n_8888_process_pixblock_head + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! +.endm + +.macro pixman_composite_over_8888_8_8888_init + vpush {d8-d15} +.endm + +.macro pixman_composite_over_8888_8_8888_cleanup + vpop {d8-d15} +.endm + +generate_composite_function \ + pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_8888_8_8888_init, \ + pixman_composite_over_8888_8_8888_cleanup, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ + pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 15 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_0888_process_pixblock_head +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail +.endm + +.macro pixman_composite_src_0888_0888_process_pixblock_tail_head + vst3.8 {d0, d1, d2}, [DST_W]! + vld3.8 {d0, d1, d2}, [SRC]! + cache_preload 8, 8 +.endm + +generate_composite_function \ + pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0888_0888_process_pixblock_head, \ + pixman_composite_src_0888_0888_process_pixblock_tail, \ + pixman_composite_src_0888_0888_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_head + vswp d0, d2 +.endm + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail +.endm + +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head + vst4.8 {d0, d1, d2, d3}, [DST_W]! + vld3.8 {d0, d1, d2}, [SRC]! + vswp d0, d2 + cache_preload 8, 8 +.endm + +.macro pixman_composite_src_0888_8888_rev_init + veor d3, d3, d3 +.endm + +generate_composite_function \ + pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + pixman_composite_src_0888_8888_rev_init, \ + default_cleanup, \ + pixman_composite_src_0888_8888_rev_process_pixblock_head, \ + pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ + pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ + 0, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_head + vshll.u8 q8, d1, #8 + vshll.u8 q9, d2, #8 +.endm + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail + vshll.u8 q14, d0, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 +.endm + +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head + vshll.u8 q14, d0, #8 + vld3.8 {d0, d1, d2}, [SRC]! + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 + vshll.u8 q8, d1, #8 + vst1.16 {d28, d29}, [DST_W, :128]! + vshll.u8 q9, d2, #8 +.endm + +generate_composite_function \ + pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ + FLAG_DST_WRITEONLY, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_0888_0565_rev_process_pixblock_head, \ + pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ + pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ + +/******************************************************************************/ + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_head + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 +.endm + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + vraddhn.u16 d30, q11, q8 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d28, q13, q10 +.endm + +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head + vrshr.u16 q11, q8, #8 + vswp d3, d31 + vrshr.u16 q12, q9, #8 + vrshr.u16 q13, q10, #8 + vld4.8 {d0, d1, d2, d3}, [SRC]! + vraddhn.u16 d30, q11, q8 + PF add PF_X, PF_X, #8 + PF tst PF_CTL, #0xF + PF addne PF_X, PF_X, #8 + PF subne PF_CTL, PF_CTL, #1 + vraddhn.u16 d29, q12, q9 + vraddhn.u16 d28, q13, q10 + vmull.u8 q8, d3, d0 + vmull.u8 q9, d3, d1 + vmull.u8 q10, d3, d2 + vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! + PF cmp PF_X, ORIG_W + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +.endm + +generate_composite_function \ + pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + pixman_composite_src_pixbuf_8888_process_pixblock_head, \ + pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ + pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 0, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 0 /* mask_basereg */ diff --git a/src/3rdparty/pixman/pixman-arm-neon-asm.h b/src/3rdparty/pixman/pixman-arm-neon-asm.h new file mode 100644 index 0000000..56c3fae --- /dev/null +++ b/src/3rdparty/pixman/pixman-arm-neon-asm.h @@ -0,0 +1,906 @@ +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * This file contains a macro ('generate_composite_function') which can + * construct 2D image processing functions, based on a common template. + * Any combinations of source, destination and mask images with 8bpp, + * 16bpp, 24bpp, 32bpp color formats are supported. + * + * This macro takes care of: + * - handling of leading and trailing unaligned pixels + * - doing most of the work related to L2 cache preload + * - encourages the use of software pipelining for better instructions + * scheduling + * + * The user of this macro has to provide some configuration parameters + * (bit depths for the images, prefetch distance, etc.) and a set of + * macros, which should implement basic code chunks responsible for + * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage + * examples. + * + * TODO: + * - try overlapped pixel method (from Ian Rickards) when processing + * exactly two blocks of pixels + * - maybe add an option to do reverse scanline processing + */ + +/* + * Bit flags for 'generate_composite_function' macro which are used + * to tune generated functions behavior. + */ +.set FLAG_DST_WRITEONLY, 0 +.set FLAG_DST_READWRITE, 1 +.set FLAG_DEINTERLEAVE_32BPP, 2 + +/* + * Offset in stack where mask and source pointer/stride can be accessed + * from 'init' macro. This is useful for doing special handling for solid mask. + */ +.set ARGS_STACK_OFFSET, 40 + +/* + * Constants for selecting preferable prefetch type. + */ +.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ +.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ +.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ + +/* + * Definitions of supplementary pixld/pixst macros (for partial load/store of + * pixel data). + */ + +.macro pixldst1 op, elem_size, reg1, mem_operand, abits +.if abits > 0 + op&.&elem_size {d®1}, [&mem_operand&, :&abits&]! +.else + op&.&elem_size {d®1}, [&mem_operand&]! +.endif +.endm + +.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits +.if abits > 0 + op&.&elem_size {d®1, d®2}, [&mem_operand&, :&abits&]! +.else + op&.&elem_size {d®1, d®2}, [&mem_operand&]! +.endif +.endm + +.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits +.if abits > 0 + op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&, :&abits&]! +.else + op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&]! +.endif +.endm + +.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits + op&.&elem_size {d®1[idx]}, [&mem_operand&]! +.endm + +.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand + op&.&elem_size {d®1, d®2, d®3}, [&mem_operand&]! +.endm + +.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand + op&.&elem_size {d®1[idx], d®2[idx], d®3[idx]}, [&mem_operand&]! +.endm + +.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits +.if numbytes == 32 + pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif numbytes == 16 + pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits +.elseif numbytes == 8 + pixldst1 op, elem_size, %(basereg+1), mem_operand, abits +.elseif numbytes == 4 + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) + pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits + .elseif elem_size == 16 + pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits + pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits + .else + pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits + pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits + pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits + pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits + .endif +.elseif numbytes == 2 + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) + pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits + .else + pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits + pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits + .endif +.elseif numbytes == 1 + pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits +.else + .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.else + pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits +.endif +.endif +.endm + +.macro pixst numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.else + pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits +.endif +.endif +.endm + +.macro pixld_a numpix, bpp, basereg, mem_operand +.if (bpp * numpix) <= 128 + pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.else + pixld numpix, bpp, basereg, mem_operand, 128 +.endif +.endm + +.macro pixst_a numpix, bpp, basereg, mem_operand +.if (bpp * numpix) <= 128 + pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.else + pixst numpix, bpp, basereg, mem_operand, 128 +.endif +.endm + +.macro vuzp8 reg1, reg2 + vuzp.8 d®1, d®2 +.endm + +.macro vzip8 reg1, reg2 + vzip.8 d®1, d®2 +.endm + +/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ +.macro pixdeinterleave bpp, basereg +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vuzp8 %(basereg+0), %(basereg+1) + vuzp8 %(basereg+2), %(basereg+3) + vuzp8 %(basereg+1), %(basereg+3) + vuzp8 %(basereg+0), %(basereg+2) +.endif +.endm + +/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ +.macro pixinterleave bpp, basereg +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vzip8 %(basereg+0), %(basereg+2) + vzip8 %(basereg+1), %(basereg+3) + vzip8 %(basereg+2), %(basereg+3) + vzip8 %(basereg+0), %(basereg+1) +.endif +.endm + +/* + * This is a macro for implementing cache preload. The main idea is that + * cache preload logic is mostly independent from the rest of pixels + * processing code. It starts at the top left pixel and moves forward + * across pixels and can jump across scanlines. Prefetch distance is + * handled in an 'incremental' way: it starts from 0 and advances to the + * optimal distance over time. After reaching optimal prefetch distance, + * it is kept constant. There are some checks which prevent prefetching + * unneeded pixel lines below the image (but it still can prefetch a bit + * more data on the right side of the image - not a big issue and may + * be actually helpful when rendering text glyphs). Additional trick is + * the use of LDR instruction for prefetch instead of PLD when moving to + * the next line, the point is that we have a high chance of getting TLB + * miss in this case, and PLD would be useless. + * + * This sounds like it may introduce a noticeable overhead (when working with + * fully cached data). But in reality, due to having a separate pipeline and + * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can + * execute simultaneously with NEON and be completely shadowed by it. Thus + * we get no performance overhead at all (*). This looks like a very nice + * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, + * but still can implement some rather advanced prefetch logic in sofware + * for almost zero cost! + * + * (*) The overhead of the prefetcher is visible when running some trivial + * pixels processing like simple copy. Anyway, having prefetch is a must + * when working with the graphics data. + */ +.macro PF a, x:vararg +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) + a x +.endif +.endm + +.macro cache_preload std_increment, boost_increment +.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) +.if regs_shortage + PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ +.endif +.if std_increment != 0 + PF add PF_X, PF_X, #std_increment +.endif + PF tst PF_CTL, #0xF + PF addne PF_X, PF_X, #boost_increment + PF subne PF_CTL, PF_CTL, #1 + PF cmp PF_X, ORIG_W +.if src_bpp_shift >= 0 + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +.endif +.if dst_r_bpp != 0 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +.endif +.if mask_bpp_shift >= 0 + PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] +.endif + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 +.if src_bpp_shift >= 0 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +.endif +.if dst_r_bpp != 0 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! +.endif +.if mask_bpp_shift >= 0 + PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! +.endif +.endif +.endm + +.macro cache_preload_simple +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) +.if src_bpp > 0 + pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] +.endif +.if dst_r_bpp > 0 + pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] +.endif +.if mask_bpp > 0 + pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] +.endif +.endif +.endm + +/* + * Macro which is used to process leading pixels until destination + * pointer is properly aligned (at 16 bytes boundary). When destination + * buffer uses 16bpp format, this is unnecessary, or even pointless. + */ +.macro ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head +.if dst_w_bpp != 24 + tst DST_R, #0xF + beq 2f + +.irp lowbit, 1, 2, 4, 8, 16 +local skip1 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_R, #lowbit + beq 1f +.endif + pixld (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC + pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK +.if dst_r_bpp > 0 + pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R +.else + add DST_R, DST_R, #lowbit +.endif + PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) + sub W, W, #(lowbit * 8 / dst_w_bpp) +1: +.endif +.endr + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + + process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + process_pixblock_tail + + pixinterleave dst_w_bpp, dst_w_basereg +.irp lowbit, 1, 2, 4, 8, 16 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_W, #lowbit + beq 1f +.endif + pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W +1: +.endif +.endr +.endif +2: +.endm + +/* + * Special code for processing up to (pixblock_size - 1) remaining + * trailing pixels. As SIMD processing performs operation on + * pixblock_size pixels, anything smaller than this has to be loaded + * and stored in a special way. Loading and storing of pixel data is + * performed in such a way that we fill some 'slots' in the NEON + * registers (some slots naturally are unused), then perform compositing + * operation as usual. In the end, the data is taken from these 'slots' + * and saved to memory. + * + * cache_preload_flag - allows to suppress prefetch if + * set to 0 + * dst_aligned_flag - selects whether destination buffer + * is aligned + */ +.macro process_trailing_pixels cache_preload_flag, \ + dst_aligned_flag, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + tst W, #(pixblock_size - 1) + beq 2f +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size + tst W, #chunk_size + beq 1f + pixld chunk_size, src_bpp, src_basereg, SRC + pixld chunk_size, mask_bpp, mask_basereg, MASK +.if dst_aligned_flag != 0 + pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.else + pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.endif +.if cache_preload_flag != 0 + PF add PF_X, PF_X, #chunk_size +.endif +1: +.endif +.endr + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + + process_pixblock_head +.if cache_preload_flag != 0 + cache_preload 0, pixblock_size + cache_preload_simple +.endif + process_pixblock_tail + pixinterleave dst_w_bpp, dst_w_basereg +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size + tst W, #chunk_size + beq 1f +.if dst_aligned_flag != 0 + pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.else + pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.endif +1: +.endif +.endr +2: +.endm + +/* + * Macro, which performs all the needed operations to switch to the next + * scanline and start the next loop iteration unless all the scanlines + * are already processed. + */ +.macro advance_to_next_scanline start_of_loop_label +.if regs_shortage + ldrd W, [sp] /* load W and H (width and height) from stack */ +.else + mov W, ORIG_W +.endif + add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift +.if src_bpp != 0 + add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift +.endif +.if mask_bpp != 0 + add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift +.endif +.if (dst_w_bpp != 24) + sub DST_W, DST_W, W, lsl #dst_bpp_shift +.endif +.if (src_bpp != 24) && (src_bpp != 0) + sub SRC, SRC, W, lsl #src_bpp_shift +.endif +.if (mask_bpp != 24) && (mask_bpp != 0) + sub MASK, MASK, W, lsl #mask_bpp_shift +.endif + subs H, H, #1 + mov DST_R, DST_W +.if regs_shortage + str H, [sp, #4] /* save updated height to stack */ +.endif + bge start_of_loop_label +.endm + +/* + * Registers are allocated in the following way by default: + * d0, d1, d2, d3 - reserved for loading source pixel data + * d4, d5, d6, d7 - reserved for loading destination pixel data + * d24, d25, d26, d27 - reserved for loading mask pixel data + * d28, d29, d30, d31 - final destination pixel data for writeback to memory + */ +.macro generate_composite_function fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags, \ + pixblock_size_, \ + prefetch_distance, \ + init, \ + cleanup, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + + .func fname + .global fname + /* For ELF format also set function visibility to hidden */ +#ifdef __ELF__ + .hidden fname + .type fname, %function +#endif +fname: + push {r4-r12, lr} /* save all registers */ + +/* + * Select prefetch type for this function. If prefetch distance is + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch + * has to be used instead of ADVANCED. + */ + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT +.if prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE +.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ + ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE +.endif + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set pixblock_size, pixblock_size_ + .set dst_w_basereg, dst_w_basereg_ + .set dst_r_basereg, dst_r_basereg_ + .set src_basereg, src_basereg_ + .set mask_basereg, mask_basereg_ + +/* + * Assign symbolic names to registers + */ + W .req r0 /* width (is updated during processing) */ + H .req r1 /* height (is updated during processing) */ + DST_W .req r2 /* destination buffer pointer for writes */ + DST_STRIDE .req r3 /* destination image stride */ + SRC .req r4 /* source buffer pointer */ + SRC_STRIDE .req r5 /* source image stride */ + DST_R .req r6 /* destination buffer pointer for reads */ + + MASK .req r7 /* mask pointer */ + MASK_STRIDE .req r8 /* mask stride */ + + PF_CTL .req r9 /* combined lines counter and prefetch */ + /* distance increment counter */ + PF_X .req r10 /* pixel index in a scanline for current */ + /* pretetch position */ + PF_SRC .req r11 /* pointer to source scanline start */ + /* for prefetch purposes */ + PF_DST .req r12 /* pointer to destination scanline start */ + /* for prefetch purposes */ + PF_MASK .req r14 /* pointer to mask scanline start */ + /* for prefetch purposes */ +/* + * Check whether we have enough registers for all the local variables. + * If we don't have enough registers, original width and height are + * kept on top of stack (and 'regs_shortage' variable is set to indicate + * this for the rest of code). Even if there are enough registers, the + * allocation scheme may be a bit different depending on whether source + * or mask is not used. + */ +.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) + ORIG_W .req r10 /* saved original width */ + DUMMY .req r12 /* temporary register */ + .set regs_shortage, 0 +.elseif mask_bpp == 0 + ORIG_W .req r7 /* saved original width */ + DUMMY .req r8 /* temporary register */ + .set regs_shortage, 0 +.elseif src_bpp == 0 + ORIG_W .req r4 /* saved original width */ + DUMMY .req r5 /* temporary register */ + .set regs_shortage, 0 +.else + ORIG_W .req r1 /* saved original width */ + DUMMY .req r1 /* temporary register */ + .set regs_shortage, 1 +.endif + + .set mask_bpp_shift, -1 +.if src_bpp == 32 + .set src_bpp_shift, 2 +.elseif src_bpp == 24 + .set src_bpp_shift, 0 +.elseif src_bpp == 16 + .set src_bpp_shift, 1 +.elseif src_bpp == 8 + .set src_bpp_shift, 0 +.elseif src_bpp == 0 + .set src_bpp_shift, -1 +.else + .error "requested src bpp (src_bpp) is not supported" +.endif +.if mask_bpp == 32 + .set mask_bpp_shift, 2 +.elseif mask_bpp == 24 + .set mask_bpp_shift, 0 +.elseif mask_bpp == 8 + .set mask_bpp_shift, 0 +.elseif mask_bpp == 0 + .set mask_bpp_shift, -1 +.else + .error "requested mask bpp (mask_bpp) is not supported" +.endif +.if dst_w_bpp == 32 + .set dst_bpp_shift, 2 +.elseif dst_w_bpp == 24 + .set dst_bpp_shift, 0 +.elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 +.elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 +.else + .error "requested dst bpp (dst_w_bpp) is not supported" +.endif + +.if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp +.else + .set dst_r_bpp, 0 +.endif +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 +.else + .set DEINTERLEAVE_32BPP_ENABLED, 0 +.endif + +.if prefetch_distance < 0 || prefetch_distance > 15 + .error "invalid prefetch distance (prefetch_distance)" +.endif + +.if src_bpp > 0 + ldr SRC, [sp, #40] +.endif +.if mask_bpp > 0 + ldr MASK, [sp, #48] +.endif + PF mov PF_X, #0 +.if src_bpp > 0 + ldr SRC_STRIDE, [sp, #44] +.endif +.if mask_bpp > 0 + ldr MASK_STRIDE, [sp, #52] +.endif + mov DST_R, DST_W + +.if src_bpp == 24 + sub SRC_STRIDE, SRC_STRIDE, W + sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 +.endif +.if mask_bpp == 24 + sub MASK_STRIDE, MASK_STRIDE, W + sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 +.endif +.if dst_w_bpp == 24 + sub DST_STRIDE, DST_STRIDE, W + sub DST_STRIDE, DST_STRIDE, W, lsl #1 +.endif + +/* + * Setup advanced prefetcher initial state + */ + PF mov PF_SRC, SRC + PF mov PF_DST, DST_R + PF mov PF_MASK, MASK + /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ + PF mov PF_CTL, H, lsl #4 + PF add PF_CTL, #(prefetch_distance - 0x10) + + init +.if regs_shortage + push {r0, r1} +.endif + subs H, H, #1 +.if regs_shortage + str H, [sp, #4] /* save updated height to stack */ +.else + mov ORIG_W, W +.endif + blt 9f + cmp W, #(pixblock_size * 2) + blt 8f +/* + * This is the start of the pipelined loop, which if optimized for + * long scanlines + */ +0: + ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + pixld pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + PF add PF_X, PF_X, #pixblock_size + process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + subs W, W, #(pixblock_size * 2) + blt 2f +1: + process_pixblock_tail_head + cache_preload_simple + subs W, W, #pixblock_size + bge 1b +2: + process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 1, 1, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + advance_to_next_scanline 0b + +.if regs_shortage + pop {r0, r1} +.endif + cleanup + pop {r4-r12, pc} /* exit */ +/* + * This is the start of the loop, designed to process images with small width + * (less than pixblock_size * 2 pixels). In this case neither pipelining + * nor prefetch are used. + */ +8: + /* Process exactly pixblock_size pixels if needed */ + tst W, #pixblock_size + beq 1f + pixld pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + pixld pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + process_pixblock_head + process_pixblock_tail + pixst pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W +1: + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 0, 0, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + advance_to_next_scanline 8b +9: +.if regs_shortage + pop {r0, r1} +.endif + cleanup + pop {r4-r12, pc} /* exit */ + + .unreq SRC + .unreq MASK + .unreq DST_R + .unreq DST_W + .unreq ORIG_W + .unreq W + .unreq H + .unreq SRC_STRIDE + .unreq DST_STRIDE + .unreq MASK_STRIDE + .unreq PF_CTL + .unreq PF_X + .unreq PF_SRC + .unreq PF_DST + .unreq PF_MASK + .unreq DUMMY + .endfunc +.endm + +/* + * A simplified variant of function generation template for a single + * scanline processing (for implementing pixman combine functions) + */ +.macro generate_composite_function_single_scanline fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags, \ + pixblock_size_, \ + init, \ + cleanup, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + + .func fname + .global fname + /* For ELF format also set function visibility to hidden */ +#ifdef __ELF__ + .hidden fname + .type fname, %function +#endif +fname: + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set pixblock_size, pixblock_size_ + .set dst_w_basereg, dst_w_basereg_ + .set dst_r_basereg, dst_r_basereg_ + .set src_basereg, src_basereg_ + .set mask_basereg, mask_basereg_ +/* + * Assign symbolic names to registers + */ + W .req r0 /* width (is updated during processing) */ + DST_W .req r1 /* destination buffer pointer for writes */ + SRC .req r2 /* source buffer pointer */ + DST_R .req ip /* destination buffer pointer for reads */ + MASK .req r3 /* mask pointer */ + +.if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp +.else + .set dst_r_bpp, 0 +.endif +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 +.else + .set DEINTERLEAVE_32BPP_ENABLED, 0 +.endif + + init + mov DST_R, DST_W + + cmp W, #pixblock_size + blt 8f + + ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + subs W, W, #pixblock_size + blt 7f + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + pixld pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + process_pixblock_head + subs W, W, #pixblock_size + blt 2f +1: + process_pixblock_tail_head + subs W, W, #pixblock_size + bge 1b +2: + process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W +7: + /* Process the remaining trailing pixels in the scanline (dst aligned) */ + process_trailing_pixels 0, 1, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + cleanup + bx lr /* exit */ +8: + /* Process the remaining trailing pixels in the scanline (dst unaligned) */ + process_trailing_pixels 0, 0, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + cleanup + bx lr /* exit */ + + .unreq SRC + .unreq MASK + .unreq DST_R + .unreq DST_W + .unreq W + .endfunc +.endm + +.macro default_init +.endm + +.macro default_cleanup +.endm diff --git a/src/gui/image/qimage.cpp b/src/gui/image/qimage.cpp index 94307de..233c58d 100644 --- a/src/gui/image/qimage.cpp +++ b/src/gui/image/qimage.cpp @@ -2988,19 +2988,19 @@ static void convert_Indexed8_to_X32(QImageData *dest, const QImageData *src, Qt: colorTable.resize(256); for (int i=0; i<256; ++i) colorTable[i] = qRgb(i, i, i); - } int w = src->width; const uchar *src_data = src->data; uchar *dest_data = dest->data; + int tableSize = colorTable.size() - 1; for (int y = 0; y < src->height; y++) { uint *p = (uint *)dest_data; const uchar *b = src_data; uint *end = p + w; while (p < end) - *p++ = colorTable.at(*b++); + *p++ = colorTable.at(qMin<int>(tableSize, *b++)); src_data += src->bytes_per_line; dest_data += dest->bytes_per_line; diff --git a/src/gui/image/qpixmap_raster.cpp b/src/gui/image/qpixmap_raster.cpp index 0b1c18d..b183d0d 100644 --- a/src/gui/image/qpixmap_raster.cpp +++ b/src/gui/image/qpixmap_raster.cpp @@ -182,6 +182,7 @@ void QRasterPixmapData::fromImage(const QImage &sourceImage, QImage::Format opaqueFormat = QNativeImage::systemFormat(); QImage::Format alphaFormat = QImage::Format_ARGB32_Premultiplied; +#ifndef QT_HAVE_NEON switch (opaqueFormat) { case QImage::Format_RGB16: alphaFormat = QImage::Format_ARGB8565_Premultiplied; @@ -189,6 +190,7 @@ void QRasterPixmapData::fromImage(const QImage &sourceImage, default: // We don't care about the others... break; } +#endif if (!sourceImage.hasAlphaChannel() || ((flags & Qt::NoOpaqueDetection) == 0 @@ -238,6 +240,7 @@ void QRasterPixmapData::fill(const QColor &color) if (alpha != 255) { if (!image.hasAlphaChannel()) { QImage::Format toFormat; +#ifndef QT_HAVE_NEON if (image.format() == QImage::Format_RGB16) toFormat = QImage::Format_ARGB8565_Premultiplied; else if (image.format() == QImage::Format_RGB666) @@ -247,6 +250,7 @@ void QRasterPixmapData::fill(const QColor &color) else if (image.format() == QImage::Format_RGB444) toFormat = QImage::Format_ARGB4444_Premultiplied; else +#endif toFormat = QImage::Format_ARGB32_Premultiplied; image = QImage(image.width(), image.height(), toFormat); } diff --git a/src/gui/itemviews/qitemdelegate.cpp b/src/gui/itemviews/qitemdelegate.cpp index cba213b..d5f6fd2 100644 --- a/src/gui/itemviews/qitemdelegate.cpp +++ b/src/gui/itemviews/qitemdelegate.cpp @@ -69,6 +69,7 @@ #include <qdebug.h> #include <qlocale.h> #include <qdialog.h> +#include <qmath.h> #include <limits.h> @@ -1148,7 +1149,8 @@ QRect QItemDelegate::textRectangle(QPainter * /*painter*/, const QRect &rect, d->textLayout.setTextOption(d->textOption); d->textLayout.setFont(font); d->textLayout.setText(QItemDelegatePrivate::replaceNewLine(text)); - const QSize size = d->doTextLayout(rect.width()).toSize(); + QSizeF fpSize = d->doTextLayout(rect.width()); + const QSize size = QSize(qCeil(fpSize.width()), qCeil(fpSize.height())); // ###: textRectangle should take style option as argument const int textMargin = QApplication::style()->pixelMetric(QStyle::PM_FocusFrameHMargin) + 1; return QRect(0, 0, size.width() + 2 * textMargin, size.height()); diff --git a/src/gui/painting/painting.pri b/src/gui/painting/painting.pri index a6cc9c7..ed8ee76 100644 --- a/src/gui/painting/painting.pri +++ b/src/gui/painting/painting.pri @@ -91,6 +91,8 @@ SOURCES += \ HEADERS += \ painting/qpaintengine_raster_p.h \ + painting/qdrawhelper_p.h \ + painting/qblendfunctions_p.h \ painting/qrasterdefs_p.h \ painting/qgrayraster_p.h @@ -379,11 +381,23 @@ symbian { QMAKE_CXXFLAGS.ARMCC *= -O3 } -neon { +neon:*-g++* { DEFINES += QT_HAVE_NEON HEADERS += painting/qdrawhelper_neon_p.h SOURCES += painting/qdrawhelper_neon.cpp QMAKE_CXXFLAGS *= -mfpu=neon + + DRAWHELPER_NEON_ASM_FILES = ../3rdparty/pixman/pixman-arm-neon-asm.S painting/qdrawhelper_neon_asm.S + + neon_compiler.commands = $$QMAKE_CXX -c + neon_compiler.commands += $(CXXFLAGS) $(INCPATH) ${QMAKE_FILE_IN} -o ${QMAKE_FILE_OUT} + neon_compiler.dependency_type = TYPE_C + neon_compiler.output = ${QMAKE_VAR_OBJECTS_DIR}${QMAKE_FILE_BASE}$${first(QMAKE_EXT_OBJ)} + neon_compiler.input = DRAWHELPER_NEON_ASM_FILES + neon_compiler.variable_out = OBJECTS + neon_compiler.name = compiling[neon] ${QMAKE_FILE_IN} + silent:neon_compiler.commands = @echo compiling[neon] ${QMAKE_FILE_IN} && $$neon_compiler.commands + QMAKE_EXTRA_COMPILERS += neon_compiler } contains(QT_CONFIG, zlib) { diff --git a/src/gui/painting/qblendfunctions.cpp b/src/gui/painting/qblendfunctions.cpp index dc33896..24908ce 100644 --- a/src/gui/painting/qblendfunctions.cpp +++ b/src/gui/painting/qblendfunctions.cpp @@ -40,7 +40,7 @@ ****************************************************************************/ #include <qmath.h> -#include "qdrawhelper_p.h" +#include "qblendfunctions_p.h" QT_BEGIN_NAMESPACE @@ -88,6 +88,8 @@ static inline quint16 convert_argb32_to_rgb16(quint32 spix) struct Blend_RGB16_on_RGB16_NoAlpha { inline void write(quint16 *dst, quint16 src) { *dst = src; } + + inline void flush(void *) {} }; struct Blend_RGB16_on_RGB16_ConstAlpha { @@ -100,6 +102,8 @@ struct Blend_RGB16_on_RGB16_ConstAlpha { *dst = BYTE_MUL_RGB16(src, m_alpha) + BYTE_MUL_RGB16(*dst, m_ialpha); } + inline void flush(void *) {} + quint32 m_alpha; quint32 m_ialpha; }; @@ -114,6 +118,8 @@ struct Blend_ARGB24_on_RGB16_SourceAlpha { *dst = s; } } + + inline void flush(void *) {} }; struct Blend_ARGB24_on_RGB16_SourceAndConstAlpha { @@ -132,6 +138,8 @@ struct Blend_ARGB24_on_RGB16_SourceAndConstAlpha { } } + inline void flush(void *) {} + quint32 m_alpha; }; @@ -145,6 +153,8 @@ struct Blend_ARGB32_on_RGB16_SourceAlpha { *dst = s; } } + + inline void flush(void *) {} }; struct Blend_ARGB32_on_RGB16_SourceAndConstAlpha { @@ -163,99 +173,11 @@ struct Blend_ARGB32_on_RGB16_SourceAndConstAlpha { } } + inline void flush(void *) {} + quint32 m_alpha; }; -template <typename SRC, typename T> -void qt_scale_image_16bit(uchar *destPixels, int dbpl, - const uchar *srcPixels, int sbpl, - const QRectF &targetRect, - const QRectF &srcRect, - const QRect &clip, - T blender) -{ - qreal sx = targetRect.width() / (qreal) srcRect.width(); - qreal sy = targetRect.height() / (qreal) srcRect.height(); - - int ix = 0x00010000 / sx; - int iy = 0x00010000 / sy; - -// qDebug() << "scale:" << endl -// << " - target" << targetRect << endl -// << " - source" << srcRect << endl -// << " - clip" << clip << endl -// << " - sx=" << sx << " sy=" << sy << " ix=" << ix << " iy=" << iy; - - int cx1 = clip.x(); - int cx2 = clip.x() + clip.width(); - int cy1 = clip.top(); - int cy2 = clip.y() + clip.height(); - - int tx1 = qRound(targetRect.left()); - int tx2 = qRound(targetRect.right()); - int ty1 = qRound(targetRect.top()); - int ty2 = qRound(targetRect.bottom()); - - if (tx2 < tx1) - qSwap(tx2, tx1); - - if (ty2 < ty1) - qSwap(ty2, ty1); - - if (tx1 < cx1) - tx1 = cx1; - - if (tx2 >= cx2) - tx2 = cx2; - - if (tx1 >= tx2) - return; - - if (ty1 < cy1) - ty1 = cy1; - - if (ty2 >= cy2) - ty2 = cy2; - - if (ty1 >= ty2) - return; - - int h = ty2 - ty1; - int w = tx2 - tx1; - - - quint32 basex; - quint32 srcy; - - if (sx < 0) { - int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1; - basex = quint32(srcRect.right() * 65536) + dstx; - } else { - int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1; - basex = quint32(srcRect.left() * 65536) + dstx; - } - if (sy < 0) { - int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1; - srcy = quint32(srcRect.bottom() * 65536) + dsty; - } else { - int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1; - srcy = quint32(srcRect.top() * 65536) + dsty; - } - - quint16 *dst = ((quint16 *) (destPixels + ty1 * dbpl)) + tx1; - - while (h--) { - const SRC *src = (const SRC *) (srcPixels + (srcy >> 16) * sbpl); - int srcx = basex; - for (int x=0; x<w; ++x) { - blender.write(&dst[x], src[srcx >> 16]); - srcx += ix; - } - dst = (quint16 *)(((uchar *) dst) + dbpl); - srcy += iy; - } -} - void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, const QRectF &targetRect, @@ -447,10 +369,10 @@ static void qt_blend_argb24_on_rgb16(uchar *destPixels, int dbpl, -static void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl, - const uchar *srcPixels, int sbpl, - int w, int h, - int const_alpha) +void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) { quint16 *dst = (quint16 *) destPixels; const quint32 *src = (const quint32 *) srcPixels; @@ -643,6 +565,8 @@ void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl, struct Blend_RGB32_on_RGB32_NoAlpha { inline void write(quint32 *dst, quint32 src) { *dst = src; } + + inline void flush(void *) {} }; struct Blend_RGB32_on_RGB32_ConstAlpha { @@ -655,6 +579,8 @@ struct Blend_RGB32_on_RGB32_ConstAlpha { *dst = BYTE_MUL(src, m_alpha) + BYTE_MUL(*dst, m_ialpha); } + inline void flush(void *) {} + quint32 m_alpha; quint32 m_ialpha; }; @@ -663,6 +589,8 @@ struct Blend_ARGB32_on_ARGB32_SourceAlpha { inline void write(quint32 *dst, quint32 src) { *dst = src + BYTE_MUL(*dst, qAlpha(~src)); } + + inline void flush(void *) {} }; struct Blend_ARGB32_on_ARGB32_SourceAndConstAlpha { @@ -676,98 +604,12 @@ struct Blend_ARGB32_on_ARGB32_SourceAndConstAlpha { *dst = src + BYTE_MUL(*dst, qAlpha(~src)); } + inline void flush(void *) {} + quint32 m_alpha; quint32 m_ialpha; }; -template <typename T> void qt_scale_image_32bit(uchar *destPixels, int dbpl, - const uchar *srcPixels, int sbpl, - const QRectF &targetRect, - const QRectF &srcRect, - const QRect &clip, - T blender) -{ - qreal sx = targetRect.width() / (qreal) srcRect.width(); - qreal sy = targetRect.height() / (qreal) srcRect.height(); - - int ix = 0x00010000 / sx; - int iy = 0x00010000 / sy; - -// qDebug() << "scale:" << endl -// << " - target" << targetRect << endl -// << " - source" << srcRect << endl -// << " - clip" << clip << endl -// << " - sx=" << sx << " sy=" << sy << " ix=" << ix << " iy=" << iy; - - int cx1 = clip.x(); - int cx2 = clip.x() + clip.width(); - int cy1 = clip.top(); - int cy2 = clip.y() + clip.height(); - - int tx1 = qRound(targetRect.left()); - int tx2 = qRound(targetRect.right()); - int ty1 = qRound(targetRect.top()); - int ty2 = qRound(targetRect.bottom()); - - if (tx2 < tx1) - qSwap(tx2, tx1); - - if (ty2 < ty1) - qSwap(ty2, ty1); - - if (tx1 < cx1) - tx1 = cx1; - - if (tx2 >= cx2) - tx2 = cx2; - - if (tx1 >= tx2) - return; - - if (ty1 < cy1) - ty1 = cy1; - - if (ty2 >= cy2) - ty2 = cy2; - - if (ty1 >= ty2) - return; - - int h = ty2 - ty1; - int w = tx2 - tx1; - - quint32 basex; - quint32 srcy; - - if (sx < 0) { - int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1; - basex = quint32(srcRect.right() * 65536) + dstx; - } else { - int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1; - basex = quint32(srcRect.left() * 65536) + dstx; - } - if (sy < 0) { - int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1; - srcy = quint32(srcRect.bottom() * 65536) + dsty; - } else { - int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1; - srcy = quint32(srcRect.top() * 65536) + dsty; - } - - quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1; - - while (h--) { - const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl); - int srcx = basex; - for (int x=0; x<w; ++x) { - blender.write(&dst[x], src[srcx >> 16]); - srcx += ix; - } - dst = (quint32 *)(((uchar *) dst) + dbpl); - srcy += iy; - } -} - void qt_scale_image_rgb32_on_rgb32(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, const QRectF &targetRect, @@ -818,244 +660,6 @@ void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl, } } -struct QTransformImageVertex -{ - qreal x, y, u, v; // destination coordinates (x, y) and source coordinates (u, v) -}; - -template <class SrcT, class DestT, class Blender> -void qt_transform_image_rasterize(DestT *destPixels, int dbpl, - const SrcT *srcPixels, int sbpl, - const QTransformImageVertex &topLeft, const QTransformImageVertex &bottomLeft, - const QTransformImageVertex &topRight, const QTransformImageVertex &bottomRight, - const QRect &sourceRect, - const QRect &clip, - qreal topY, qreal bottomY, - int dudx, int dvdx, int dudy, int dvdy, int u0, int v0, - Blender blender) -{ - int fromY = qMax(qRound(topY), clip.top()); - int toY = qMin(qRound(bottomY), clip.top() + clip.height()); - if (fromY >= toY) - return; - - qreal leftSlope = (bottomLeft.x - topLeft.x) / (bottomLeft.y - topLeft.y); - qreal rightSlope = (bottomRight.x - topRight.x) / (bottomRight.y - topRight.y); - int dx_l = int(leftSlope * 0x10000); - int dx_r = int(rightSlope * 0x10000); - int x_l = int((topLeft.x + (0.5 + fromY - topLeft.y) * leftSlope + 0.5) * 0x10000); - int x_r = int((topRight.x + (0.5 + fromY - topRight.y) * rightSlope + 0.5) * 0x10000); - - int fromX, toX, x1, x2, u, v, i, ii; - DestT *line; - for (int y = fromY; y < toY; ++y) { - line = reinterpret_cast<DestT *>(reinterpret_cast<uchar *>(destPixels) + y * dbpl); - - fromX = qMax(x_l >> 16, clip.left()); - toX = qMin(x_r >> 16, clip.left() + clip.width()); - if (fromX < toX) { - // Because of rounding, we can get source coordinates outside the source image. - // Clamp these coordinates to the source rect to avoid segmentation fault and - // garbage on the screen. - - // Find the first pixel on the current scan line where the source coordinates are within the source rect. - x1 = fromX; - u = x1 * dudx + y * dudy + u0; - v = x1 * dvdx + y * dvdy + v0; - for (; x1 < toX; ++x1) { - int uu = u >> 16; - int vv = v >> 16; - if (uu >= sourceRect.left() && uu < sourceRect.left() + sourceRect.width() - && vv >= sourceRect.top() && vv < sourceRect.top() + sourceRect.height()) { - break; - } - u += dudx; - v += dvdx; - } - - // Find the last pixel on the current scan line where the source coordinates are within the source rect. - x2 = toX; - u = (x2 - 1) * dudx + y * dudy + u0; - v = (x2 - 1) * dvdx + y * dvdy + v0; - for (; x2 > x1; --x2) { - int uu = u >> 16; - int vv = v >> 16; - if (uu >= sourceRect.left() && uu < sourceRect.left() + sourceRect.width() - && vv >= sourceRect.top() && vv < sourceRect.top() + sourceRect.height()) { - break; - } - u -= dudx; - v -= dvdx; - } - - // Set up values at the beginning of the scan line. - u = fromX * dudx + y * dudy + u0; - v = fromX * dvdx + y * dvdy + v0; - line += fromX; - - // Beginning of the scan line, with per-pixel checks. - i = x1 - fromX; - while (i) { - int uu = qBound(sourceRect.left(), u >> 16, sourceRect.left() + sourceRect.width() - 1); - int vv = qBound(sourceRect.top(), v >> 16, sourceRect.top() + sourceRect.height() - 1); - blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + vv * sbpl)[uu]); - u += dudx; - v += dvdx; - ++line; - --i; - } - - // Middle of the scan line, without checks. - // Manual loop unrolling. - i = x2 - x1; - ii = i >> 3; - while (ii) { - blender.write(&line[0], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; - blender.write(&line[1], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; - blender.write(&line[2], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; - blender.write(&line[3], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; - blender.write(&line[4], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; - blender.write(&line[5], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; - blender.write(&line[6], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; - blender.write(&line[7], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; - line += 8; - --ii; - } - switch (i & 7) { - case 7: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; - case 6: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; - case 5: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; - case 4: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; - case 3: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; - case 2: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; - case 1: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; - } - - // End of the scan line, with per-pixel checks. - i = toX - x2; - while (i) { - int uu = qBound(sourceRect.left(), u >> 16, sourceRect.left() + sourceRect.width() - 1); - int vv = qBound(sourceRect.top(), v >> 16, sourceRect.top() + sourceRect.height() - 1); - blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + vv * sbpl)[uu]); - u += dudx; - v += dvdx; - ++line; - --i; - } - } - x_l += dx_l; - x_r += dx_r; - } -} - -template <class SrcT, class DestT, class Blender> -void qt_transform_image(DestT *destPixels, int dbpl, - const SrcT *srcPixels, int sbpl, - const QRectF &targetRect, - const QRectF &sourceRect, - const QRect &clip, - const QTransform &targetRectTransform, - Blender blender) -{ - enum Corner - { - TopLeft, - TopRight, - BottomRight, - BottomLeft - }; - - // map source rectangle to destination. - QTransformImageVertex v[4]; - v[TopLeft].u = v[BottomLeft].u = sourceRect.left(); - v[TopLeft].v = v[TopRight].v = sourceRect.top(); - v[TopRight].u = v[BottomRight].u = sourceRect.right(); - v[BottomLeft].v = v[BottomRight].v = sourceRect.bottom(); - targetRectTransform.map(targetRect.left(), targetRect.top(), &v[TopLeft].x, &v[TopLeft].y); - targetRectTransform.map(targetRect.right(), targetRect.top(), &v[TopRight].x, &v[TopRight].y); - targetRectTransform.map(targetRect.left(), targetRect.bottom(), &v[BottomLeft].x, &v[BottomLeft].y); - targetRectTransform.map(targetRect.right(), targetRect.bottom(), &v[BottomRight].x, &v[BottomRight].y); - - // find topmost vertex. - int topmost = 0; - for (int i = 1; i < 4; ++i) { - if (v[i].y < v[topmost].y) - topmost = i; - } - // rearrange array such that topmost vertex is at index 0. - switch (topmost) { - case 1: - { - QTransformImageVertex t = v[0]; - for (int i = 0; i < 3; ++i) - v[i] = v[i+1]; - v[3] = t; - } - break; - case 2: - qSwap(v[0], v[2]); - qSwap(v[1], v[3]); - break; - case 3: - { - QTransformImageVertex t = v[3]; - for (int i = 3; i > 0; --i) - v[i] = v[i-1]; - v[0] = t; - } - break; - } - - // if necessary, swap vertex 1 and 3 such that 1 is to the left of 3. - qreal dx1 = v[1].x - v[0].x; - qreal dy1 = v[1].y - v[0].y; - qreal dx2 = v[3].x - v[0].x; - qreal dy2 = v[3].y - v[0].y; - if (dx1 * dy2 - dx2 * dy1 > 0) - qSwap(v[1], v[3]); - - QTransformImageVertex u = {v[1].x - v[0].x, v[1].y - v[0].y, v[1].u - v[0].u, v[1].v - v[0].v}; - QTransformImageVertex w = {v[2].x - v[0].x, v[2].y - v[0].y, v[2].u - v[0].u, v[2].v - v[0].v}; - - qreal det = u.x * w.y - u.y * w.x; - if (det == 0) - return; - - qreal invDet = 1.0 / det; - qreal m11, m12, m21, m22, mdx, mdy; - - m11 = (u.u * w.y - u.y * w.u) * invDet; - m12 = (u.x * w.u - u.u * w.x) * invDet; - m21 = (u.v * w.y - u.y * w.v) * invDet; - m22 = (u.x * w.v - u.v * w.x) * invDet; - mdx = v[0].u - m11 * v[0].x - m12 * v[0].y; - mdy = v[0].v - m21 * v[0].x - m22 * v[0].y; - - int dudx = int(m11 * 0x10000); - int dvdx = int(m21 * 0x10000); - int dudy = int(m12 * 0x10000); - int dvdy = int(m22 * 0x10000); - int u0 = qCeil((0.5 * m11 + 0.5 * m12 + mdx) * 0x10000) - 1; - int v0 = qCeil((0.5 * m21 + 0.5 * m22 + mdy) * 0x10000) - 1; - - int x1 = qFloor(sourceRect.left()); - int y1 = qFloor(sourceRect.top()); - int x2 = qCeil(sourceRect.right()); - int y2 = qCeil(sourceRect.bottom()); - QRect sourceRectI(x1, y1, x2 - x1, y2 - y1); - - // rasterize trapezoids. - if (v[1].y < v[3].y) { - qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[0], v[3], sourceRectI, clip, v[0].y, v[1].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); - qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[0], v[3], sourceRectI, clip, v[1].y, v[3].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); - qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[3], v[2], sourceRectI, clip, v[3].y, v[2].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); - } else { - qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[0], v[3], sourceRectI, clip, v[0].y, v[3].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); - qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[3], v[2], sourceRectI, clip, v[3].y, v[1].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); - qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[3], v[2], sourceRectI, clip, v[1].y, v[2].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); - } -} - void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, const QRectF &targetRect, diff --git a/src/gui/painting/qblendfunctions_p.h b/src/gui/painting/qblendfunctions_p.h new file mode 100644 index 0000000..ad754b0 --- /dev/null +++ b/src/gui/painting/qblendfunctions_p.h @@ -0,0 +1,497 @@ +/**************************************************************************** +** +** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the QtGui module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. +** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef QBLENDFUNCTIONS_P_H +#define QBLENDFUNCTIONS_P_H + +#include <qmath.h> +#include "qdrawhelper_p.h" + +QT_BEGIN_NAMESPACE + +// +// W A R N I N G +// ------------- +// +// This file is not part of the Qt API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. +// + +template <typename SRC, typename T> +void qt_scale_image_16bit(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &srcRect, + const QRect &clip, + T blender) +{ + qreal sx = targetRect.width() / (qreal) srcRect.width(); + qreal sy = targetRect.height() / (qreal) srcRect.height(); + + int ix = 0x00010000 / sx; + int iy = 0x00010000 / sy; + +// qDebug() << "scale:" << endl +// << " - target" << targetRect << endl +// << " - source" << srcRect << endl +// << " - clip" << clip << endl +// << " - sx=" << sx << " sy=" << sy << " ix=" << ix << " iy=" << iy; + + int cx1 = clip.x(); + int cx2 = clip.x() + clip.width(); + int cy1 = clip.top(); + int cy2 = clip.y() + clip.height(); + + int tx1 = qRound(targetRect.left()); + int tx2 = qRound(targetRect.right()); + int ty1 = qRound(targetRect.top()); + int ty2 = qRound(targetRect.bottom()); + + if (tx2 < tx1) + qSwap(tx2, tx1); + + if (ty2 < ty1) + qSwap(ty2, ty1); + + if (tx1 < cx1) + tx1 = cx1; + + if (tx2 >= cx2) + tx2 = cx2; + + if (tx1 >= tx2) + return; + + if (ty1 < cy1) + ty1 = cy1; + + if (ty2 >= cy2) + ty2 = cy2; + + if (ty1 >= ty2) + return; + + int h = ty2 - ty1; + int w = tx2 - tx1; + + + quint32 basex; + quint32 srcy; + + if (sx < 0) { + int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1; + basex = quint32(srcRect.right() * 65536) + dstx; + } else { + int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1; + basex = quint32(srcRect.left() * 65536) + dstx; + } + if (sy < 0) { + int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1; + srcy = quint32(srcRect.bottom() * 65536) + dsty; + } else { + int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1; + srcy = quint32(srcRect.top() * 65536) + dsty; + } + + quint16 *dst = ((quint16 *) (destPixels + ty1 * dbpl)) + tx1; + + while (h--) { + const SRC *src = (const SRC *) (srcPixels + (srcy >> 16) * sbpl); + int srcx = basex; + int x = 0; + for (; x<w-7; x+=8) { + blender.write(&dst[x], src[srcx >> 16]); srcx += ix; + blender.write(&dst[x+1], src[srcx >> 16]); srcx += ix; + blender.write(&dst[x+2], src[srcx >> 16]); srcx += ix; + blender.write(&dst[x+3], src[srcx >> 16]); srcx += ix; + blender.write(&dst[x+4], src[srcx >> 16]); srcx += ix; + blender.write(&dst[x+5], src[srcx >> 16]); srcx += ix; + blender.write(&dst[x+6], src[srcx >> 16]); srcx += ix; + blender.write(&dst[x+7], src[srcx >> 16]); srcx += ix; + } + for (; x<w; ++x) { + blender.write(&dst[x], src[srcx >> 16]); + srcx += ix; + } + blender.flush(&dst[x]); + dst = (quint16 *)(((uchar *) dst) + dbpl); + srcy += iy; + } +} + +template <typename T> void qt_scale_image_32bit(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &srcRect, + const QRect &clip, + T blender) +{ + qreal sx = targetRect.width() / (qreal) srcRect.width(); + qreal sy = targetRect.height() / (qreal) srcRect.height(); + + int ix = 0x00010000 / sx; + int iy = 0x00010000 / sy; + +// qDebug() << "scale:" << endl +// << " - target" << targetRect << endl +// << " - source" << srcRect << endl +// << " - clip" << clip << endl +// << " - sx=" << sx << " sy=" << sy << " ix=" << ix << " iy=" << iy; + + int cx1 = clip.x(); + int cx2 = clip.x() + clip.width(); + int cy1 = clip.top(); + int cy2 = clip.y() + clip.height(); + + int tx1 = qRound(targetRect.left()); + int tx2 = qRound(targetRect.right()); + int ty1 = qRound(targetRect.top()); + int ty2 = qRound(targetRect.bottom()); + + if (tx2 < tx1) + qSwap(tx2, tx1); + + if (ty2 < ty1) + qSwap(ty2, ty1); + + if (tx1 < cx1) + tx1 = cx1; + + if (tx2 >= cx2) + tx2 = cx2; + + if (tx1 >= tx2) + return; + + if (ty1 < cy1) + ty1 = cy1; + + if (ty2 >= cy2) + ty2 = cy2; + + if (ty1 >= ty2) + return; + + int h = ty2 - ty1; + int w = tx2 - tx1; + + quint32 basex; + quint32 srcy; + + if (sx < 0) { + int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1; + basex = quint32(srcRect.right() * 65536) + dstx; + } else { + int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1; + basex = quint32(srcRect.left() * 65536) + dstx; + } + if (sy < 0) { + int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1; + srcy = quint32(srcRect.bottom() * 65536) + dsty; + } else { + int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1; + srcy = quint32(srcRect.top() * 65536) + dsty; + } + + quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1; + + while (h--) { + const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl); + int srcx = basex; + int x = 0; + for (; x<w; ++x) { + blender.write(&dst[x], src[srcx >> 16]); + srcx += ix; + } + blender.flush(&dst[x]); + dst = (quint32 *)(((uchar *) dst) + dbpl); + srcy += iy; + } +} + +struct QTransformImageVertex +{ + qreal x, y, u, v; // destination coordinates (x, y) and source coordinates (u, v) +}; + +template <class SrcT, class DestT, class Blender> +void qt_transform_image_rasterize(DestT *destPixels, int dbpl, + const SrcT *srcPixels, int sbpl, + const QTransformImageVertex &topLeft, const QTransformImageVertex &bottomLeft, + const QTransformImageVertex &topRight, const QTransformImageVertex &bottomRight, + const QRect &sourceRect, + const QRect &clip, + qreal topY, qreal bottomY, + int dudx, int dvdx, int dudy, int dvdy, int u0, int v0, + Blender blender) +{ + int fromY = qMax(qRound(topY), clip.top()); + int toY = qMin(qRound(bottomY), clip.top() + clip.height()); + if (fromY >= toY) + return; + + qreal leftSlope = (bottomLeft.x - topLeft.x) / (bottomLeft.y - topLeft.y); + qreal rightSlope = (bottomRight.x - topRight.x) / (bottomRight.y - topRight.y); + int dx_l = int(leftSlope * 0x10000); + int dx_r = int(rightSlope * 0x10000); + int x_l = int((topLeft.x + (0.5 + fromY - topLeft.y) * leftSlope + 0.5) * 0x10000); + int x_r = int((topRight.x + (0.5 + fromY - topRight.y) * rightSlope + 0.5) * 0x10000); + + int fromX, toX, x1, x2, u, v, i, ii; + DestT *line; + for (int y = fromY; y < toY; ++y) { + line = reinterpret_cast<DestT *>(reinterpret_cast<uchar *>(destPixels) + y * dbpl); + + fromX = qMax(x_l >> 16, clip.left()); + toX = qMin(x_r >> 16, clip.left() + clip.width()); + if (fromX < toX) { + // Because of rounding, we can get source coordinates outside the source image. + // Clamp these coordinates to the source rect to avoid segmentation fault and + // garbage on the screen. + + // Find the first pixel on the current scan line where the source coordinates are within the source rect. + x1 = fromX; + u = x1 * dudx + y * dudy + u0; + v = x1 * dvdx + y * dvdy + v0; + for (; x1 < toX; ++x1) { + int uu = u >> 16; + int vv = v >> 16; + if (uu >= sourceRect.left() && uu < sourceRect.left() + sourceRect.width() + && vv >= sourceRect.top() && vv < sourceRect.top() + sourceRect.height()) { + break; + } + u += dudx; + v += dvdx; + } + + // Find the last pixel on the current scan line where the source coordinates are within the source rect. + x2 = toX; + u = (x2 - 1) * dudx + y * dudy + u0; + v = (x2 - 1) * dvdx + y * dvdy + v0; + for (; x2 > x1; --x2) { + int uu = u >> 16; + int vv = v >> 16; + if (uu >= sourceRect.left() && uu < sourceRect.left() + sourceRect.width() + && vv >= sourceRect.top() && vv < sourceRect.top() + sourceRect.height()) { + break; + } + u -= dudx; + v -= dvdx; + } + + // Set up values at the beginning of the scan line. + u = fromX * dudx + y * dudy + u0; + v = fromX * dvdx + y * dvdy + v0; + line += fromX; + + // Beginning of the scan line, with per-pixel checks. + i = x1 - fromX; + while (i) { + int uu = qBound(sourceRect.left(), u >> 16, sourceRect.left() + sourceRect.width() - 1); + int vv = qBound(sourceRect.top(), v >> 16, sourceRect.top() + sourceRect.height() - 1); + blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + vv * sbpl)[uu]); + u += dudx; + v += dvdx; + ++line; + --i; + } + + // Middle of the scan line, without checks. + // Manual loop unrolling. + i = x2 - x1; + ii = i >> 3; + while (ii) { + blender.write(&line[0], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; + blender.write(&line[1], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; + blender.write(&line[2], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; + blender.write(&line[3], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; + blender.write(&line[4], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; + blender.write(&line[5], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; + blender.write(&line[6], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; + blender.write(&line[7], reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; + + line += 8; + + --ii; + } + switch (i & 7) { + case 7: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; + case 6: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; + case 5: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; + case 4: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; + case 3: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; + case 2: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; + case 1: blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + (v >> 16) * sbpl)[u >> 16]); u += dudx; v += dvdx; ++line; + } + + // End of the scan line, with per-pixel checks. + i = toX - x2; + while (i) { + int uu = qBound(sourceRect.left(), u >> 16, sourceRect.left() + sourceRect.width() - 1); + int vv = qBound(sourceRect.top(), v >> 16, sourceRect.top() + sourceRect.height() - 1); + blender.write(line, reinterpret_cast<const SrcT *>(reinterpret_cast<const uchar *>(srcPixels) + vv * sbpl)[uu]); + u += dudx; + v += dvdx; + ++line; + --i; + } + + blender.flush(line); + } + x_l += dx_l; + x_r += dx_r; + } +} + +template <class SrcT, class DestT, class Blender> +void qt_transform_image(DestT *destPixels, int dbpl, + const SrcT *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + const QTransform &targetRectTransform, + Blender blender) +{ + enum Corner + { + TopLeft, + TopRight, + BottomRight, + BottomLeft + }; + + // map source rectangle to destination. + QTransformImageVertex v[4]; + v[TopLeft].u = v[BottomLeft].u = sourceRect.left(); + v[TopLeft].v = v[TopRight].v = sourceRect.top(); + v[TopRight].u = v[BottomRight].u = sourceRect.right(); + v[BottomLeft].v = v[BottomRight].v = sourceRect.bottom(); + targetRectTransform.map(targetRect.left(), targetRect.top(), &v[TopLeft].x, &v[TopLeft].y); + targetRectTransform.map(targetRect.right(), targetRect.top(), &v[TopRight].x, &v[TopRight].y); + targetRectTransform.map(targetRect.left(), targetRect.bottom(), &v[BottomLeft].x, &v[BottomLeft].y); + targetRectTransform.map(targetRect.right(), targetRect.bottom(), &v[BottomRight].x, &v[BottomRight].y); + + // find topmost vertex. + int topmost = 0; + for (int i = 1; i < 4; ++i) { + if (v[i].y < v[topmost].y) + topmost = i; + } + // rearrange array such that topmost vertex is at index 0. + switch (topmost) { + case 1: + { + QTransformImageVertex t = v[0]; + for (int i = 0; i < 3; ++i) + v[i] = v[i+1]; + v[3] = t; + } + break; + case 2: + qSwap(v[0], v[2]); + qSwap(v[1], v[3]); + break; + case 3: + { + QTransformImageVertex t = v[3]; + for (int i = 3; i > 0; --i) + v[i] = v[i-1]; + v[0] = t; + } + break; + } + + // if necessary, swap vertex 1 and 3 such that 1 is to the left of 3. + qreal dx1 = v[1].x - v[0].x; + qreal dy1 = v[1].y - v[0].y; + qreal dx2 = v[3].x - v[0].x; + qreal dy2 = v[3].y - v[0].y; + if (dx1 * dy2 - dx2 * dy1 > 0) + qSwap(v[1], v[3]); + + QTransformImageVertex u = {v[1].x - v[0].x, v[1].y - v[0].y, v[1].u - v[0].u, v[1].v - v[0].v}; + QTransformImageVertex w = {v[2].x - v[0].x, v[2].y - v[0].y, v[2].u - v[0].u, v[2].v - v[0].v}; + + qreal det = u.x * w.y - u.y * w.x; + if (det == 0) + return; + + qreal invDet = 1.0 / det; + qreal m11, m12, m21, m22, mdx, mdy; + + m11 = (u.u * w.y - u.y * w.u) * invDet; + m12 = (u.x * w.u - u.u * w.x) * invDet; + m21 = (u.v * w.y - u.y * w.v) * invDet; + m22 = (u.x * w.v - u.v * w.x) * invDet; + mdx = v[0].u - m11 * v[0].x - m12 * v[0].y; + mdy = v[0].v - m21 * v[0].x - m22 * v[0].y; + + int dudx = int(m11 * 0x10000); + int dvdx = int(m21 * 0x10000); + int dudy = int(m12 * 0x10000); + int dvdy = int(m22 * 0x10000); + int u0 = qCeil((0.5 * m11 + 0.5 * m12 + mdx) * 0x10000) - 1; + int v0 = qCeil((0.5 * m21 + 0.5 * m22 + mdy) * 0x10000) - 1; + + int x1 = qFloor(sourceRect.left()); + int y1 = qFloor(sourceRect.top()); + int x2 = qCeil(sourceRect.right()); + int y2 = qCeil(sourceRect.bottom()); + QRect sourceRectI(x1, y1, x2 - x1, y2 - y1); + + // rasterize trapezoids. + if (v[1].y < v[3].y) { + qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[0], v[3], sourceRectI, clip, v[0].y, v[1].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); + qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[0], v[3], sourceRectI, clip, v[1].y, v[3].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); + qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[3], v[2], sourceRectI, clip, v[3].y, v[2].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); + } else { + qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[0], v[3], sourceRectI, clip, v[0].y, v[3].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); + qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[0], v[1], v[3], v[2], sourceRectI, clip, v[3].y, v[1].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); + qt_transform_image_rasterize(destPixels, dbpl, srcPixels, sbpl, v[1], v[2], v[3], v[2], sourceRectI, clip, v[1].y, v[2].y, dudx, dvdx, dudy, dvdy, u0, v0, blender); + } +} + +QT_END_NAMESPACE + +#endif // QBLENDFUNCTIONS_P_H diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index 1f75ec7..917b910 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -175,7 +175,7 @@ Q_STATIC_TEMPLATE_FUNCTION uint * QT_FASTCALL destFetch(uint *buffer, QRasterBuf # define SPANFUNC_POINTER_DESTFETCH(Arg) destFetch<Arg> -static const DestFetchProc destFetchProc[QImage::NImageFormats] = +static DestFetchProc destFetchProc[QImage::NImageFormats] = { 0, // Format_Invalid destFetchMono, // Format_Mono, @@ -323,7 +323,7 @@ Q_STATIC_TEMPLATE_FUNCTION void QT_FASTCALL destStore(QRasterBuffer *rasterBuffe # define SPANFUNC_POINTER_DESTSTORE(DEST) destStore<DEST> -static const DestStoreProc destStoreProc[QImage::NImageFormats] = +static DestStoreProc destStoreProc[QImage::NImageFormats] = { 0, // Format_Invalid destStoreMono, // Format_Mono, @@ -2827,7 +2827,7 @@ static void QT_FASTCALL rasterop_SourceAndNotDestination(uint *dest, } } -static const CompositionFunctionSolid functionForModeSolid_C[] = { +static CompositionFunctionSolid functionForModeSolid_C[] = { comp_func_solid_SourceOver, comp_func_solid_DestinationOver, comp_func_solid_Clear, @@ -2865,7 +2865,7 @@ static const CompositionFunctionSolid functionForModeSolid_C[] = { static const CompositionFunctionSolid *functionForModeSolid = functionForModeSolid_C; -static const CompositionFunction functionForMode_C[] = { +static CompositionFunction functionForMode_C[] = { comp_func_SourceOver, comp_func_DestinationOver, comp_func_Clear, @@ -7961,6 +7961,20 @@ void qInitDrawhelperAsm() qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon; qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_neon; qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_neon; + qBlendFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_rgb16_neon; + qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB16] = qt_blend_rgb16_on_argb32_neon; + + qScaleFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_rgb16_neon; + qScaleFunctions[QImage::Format_RGB16][QImage::Format_RGB16] = qt_scale_image_rgb16_on_rgb16_neon; + + qTransformFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_transform_image_argb32_on_rgb16_neon; + qTransformFunctions[QImage::Format_RGB16][QImage::Format_RGB16] = qt_transform_image_rgb16_on_rgb16_neon; + + qDrawHelper[QImage::Format_RGB16].alphamapBlit = qt_alphamapblit_quint16_neon; + + functionForMode_C[QPainter::CompositionMode_SourceOver] = qt_blend_argb32_on_argb32_scanline_neon; + destFetchProc[QImage::Format_RGB16] = qt_destFetchRGB16_neon; + destStoreProc[QImage::Format_RGB16] = qt_destStoreRGB16_neon; } #endif diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp index 77c5202..ee5f24a 100644 --- a/src/gui/painting/qdrawhelper_neon.cpp +++ b/src/gui/painting/qdrawhelper_neon.cpp @@ -40,10 +40,13 @@ ****************************************************************************/ #include <private/qdrawhelper_p.h> +#include <private/qblendfunctions_p.h> +#include <private/qmath_p.h> #ifdef QT_HAVE_NEON #include <private/qdrawhelper_neon_p.h> +#include <private/qpaintengine_raster_p.h> #include <arm_neon.h> QT_BEGIN_NAMESPACE @@ -87,60 +90,142 @@ static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, u return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half)); } -void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl, - const uchar *srcPixels, int sbpl, - int w, int h, - int const_alpha) +extern "C" void +pixman_composite_over_8888_0565_asm_neon (int32_t w, + int32_t h, + uint16_t *dst, + int32_t dst_stride, + uint32_t *src, + int32_t src_stride); + +extern "C" void +pixman_composite_over_8888_8888_asm_neon (int32_t w, + int32_t h, + uint32_t *dst, + int32_t dst_stride, + uint32_t *src, + int32_t src_stride); + +extern "C" void +pixman_composite_src_0565_8888_asm_neon (int32_t w, + int32_t h, + uint32_t *dst, + int32_t dst_stride, + uint16_t *src, + int32_t src_stride); + +extern "C" void +pixman_composite_over_n_8_0565_asm_neon (int32_t w, + int32_t h, + uint16_t *dst, + int32_t dst_stride, + uint32_t src, + int32_t unused, + uint8_t *mask, + int32_t mask_stride); + +extern "C" void +pixman_composite_scanline_over_asm_neon (int32_t w, + const uint32_t *dst, + const uint32_t *src); + +// qblendfunctions.cpp +void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); + +void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) { - const uint *src = (const uint *) srcPixels; - uint *dst = (uint *) destPixels; - uint16x8_t half = vdupq_n_u16(0x80); - uint16x8_t full = vdupq_n_u16(0xff); - if (const_alpha == 256) { - for (int y = 0; y < h; ++y) { - int x = 0; - for (; x < w-3; x += 4) { - uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]); - if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) { - // all opaque - vst1q_u32((uint32_t *)&dst[x], src32); - } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) { - uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]); + dbpl /= 4; + sbpl /= 2; - const uint8x16_t src8 = vreinterpretq_u8_u32(src32); - const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32); + quint32 *dst = (quint32 *) destPixels; + quint16 *src = (quint16 *) srcPixels; - const uint8x8_t src8_low = vget_low_u8(src8); - const uint8x8_t dst8_low = vget_low_u8(dst8); + if (const_alpha != 256) { + quint8 a = (255 * const_alpha) >> 8; + quint8 ia = 255 - a; + + while (h--) { + for (int x=0; x<w; ++x) + dst[x] = INTERPOLATE_PIXEL_255(qt_colorConvert(src[x], dst[x]), a, dst[x], ia); + dst += dbpl; + src += sbpl; + } + return; + } - const uint8x8_t src8_high = vget_high_u8(src8); - const uint8x8_t dst8_high = vget_high_u8(dst8); + pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl); +} - const uint16x8_t src16_low = vmovl_u8(src8_low); - const uint16x8_t dst16_low = vmovl_u8(dst8_low); +extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha); - const uint16x8_t src16_high = vmovl_u8(src8_high); - const uint16x8_t dst16_high = vmovl_u8(dst8_high); +void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) +{ + quint16 *dst = (quint16 *) destPixels; + quint32 *src = (quint32 *) srcPixels; - const uint16x8_t result16_low = qvsource_over_u16(src16_low, dst16_low, half, full); - const uint16x8_t result16_high = qvsource_over_u16(src16_high, dst16_high, half, full); + if (const_alpha != 256) { + for (int y=0; y<h; ++y) { + int i = 0; + for (; i < w-7; i += 8) + blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha); - const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low)); - const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high)); + if (i < w) { + int tail = w - i; - vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high)); + quint16 dstBuffer[8]; + quint32 srcBuffer[8]; + + for (int j = 0; j < tail; ++j) { + dstBuffer[j] = dst[i + j]; + srcBuffer[j] = src[i + j]; + } + + blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha); + + for (int j = 0; j < tail; ++j) { + dst[i + j] = dstBuffer[j]; + src[i + j] = srcBuffer[j]; } } - for (; x<w; ++x) { - uint s = src[x]; - if (s >= 0xff000000) - dst[x] = s; - else if (s != 0) - dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); - } - dst = (quint32 *)(((uchar *) dst) + dbpl); - src = (const quint32 *)(((const uchar *) src) + sbpl); + + dst = (quint16 *)(((uchar *) dst) + dbpl); + src = (quint32 *)(((uchar *) src) + sbpl); } + return; + } + + pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4); +} + +void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha) +{ + if (const_alpha == 255) { + pixman_composite_scanline_over_asm_neon(length, dest, src); + } else { + qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255); + } +} + +void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) +{ + const uint *src = (const uint *) srcPixels; + uint *dst = (uint *) destPixels; + uint16x8_t half = vdupq_n_u16(0x80); + uint16x8_t full = vdupq_n_u16(0xff); + if (const_alpha == 256) { + pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4); } else if (const_alpha != 0) { const_alpha = (const_alpha * 255) >> 8; uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha); @@ -254,6 +339,246 @@ void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl, } } +void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer, + int x, int y, quint32 color, + const uchar *bitmap, + int mapWidth, int mapHeight, int mapStride, + const QClipData *) +{ + quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x; + const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16); + + uchar *mask = const_cast<uchar *>(bitmap); + + pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride); +} + +extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha); + +template <typename SRC, typename BlendFunc> +struct Blend_on_RGB16_SourceAndConstAlpha_Neon { + Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha) + : m_index(0) + , m_blender(blender) + , m_const_alpha(const_alpha) + { + } + + inline void write(quint16 *dst, quint32 src) + { + srcBuffer[m_index++] = src; + + if (m_index == 8) { + m_blender(dst - 7, srcBuffer, m_const_alpha); + m_index = 0; + } + } + + inline void flush(quint16 *dst) + { + if (m_index > 0) { + quint16 dstBuffer[8]; + for (int i = 0; i < m_index; ++i) + dstBuffer[i] = dst[i - m_index]; + + m_blender(dstBuffer, srcBuffer, m_const_alpha); + + for (int i = 0; i < m_index; ++i) + dst[i - m_index] = dstBuffer[i]; + + m_index = 0; + } + } + + SRC srcBuffer[8]; + + int m_index; + BlendFunc m_blender; + int m_const_alpha; +}; + +template <typename SRC, typename BlendFunc> +Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc> +Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha) +{ + return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha); +} + +void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + int const_alpha) +{ + if (const_alpha == 0) + return; + + qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, + Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha)); +} + +void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + int const_alpha); + +void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + int const_alpha) +{ + if (const_alpha == 0) + return; + + if (const_alpha == 256) { + qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, const_alpha); + return; + } + + qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, + Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha)); +} + +extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + const QTransform &targetRectTransform, + int const_alpha); + +void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + const QTransform &targetRectTransform, + int const_alpha) +{ + if (const_alpha == 0) + return; + + if (const_alpha == 256) { + qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha); + return; + } + + qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl, + reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform, + Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha)); +} + +void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + const QTransform &targetRectTransform, + int const_alpha) +{ + if (const_alpha == 0) + return; + + qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl, + reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform, + Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha)); +} + +static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src) +{ + asm volatile ( + "vld1.16 { d0, d1 }, [%[SRC]]\n\t" + + /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format + and put data into d4 - red, d3 - green, d2 - blue */ + "vshrn.u16 d4, q0, #8\n\t" + "vshrn.u16 d3, q0, #3\n\t" + "vsli.u16 q0, q0, #5\n\t" + "vsri.u8 d4, d4, #5\n\t" + "vsri.u8 d3, d3, #6\n\t" + "vshrn.u16 d2, q0, #2\n\t" + + /* fill d5 - alpha with 0xff */ + "mov r2, #255\n\t" + "vdup.8 d5, r2\n\t" + + "vst4.8 { d2, d3, d4, d5 }, [%[DST]]" + : : [DST]"r" (dst), [SRC]"r" (src) + : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5" + ); +} + +uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length) +{ + const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x; + + int i = 0; + for (; i < length - 7; i += 8) + convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]); + + if (i < length) { + quint16 srcBuffer[8]; + quint32 dstBuffer[8]; + + int tail = length - i; + for (int j = 0; j < tail; ++j) + srcBuffer[j] = data[i + j]; + + convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer); + + for (int j = 0; j < tail; ++j) + buffer[i + j] = dstBuffer[j]; + } + + return buffer; +} + +static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src) +{ + asm volatile ( + "vld4.8 { d0, d1, d2, d3 }, [%[SRC]]\n\t" + + /* convert to r5g6b5 and store it into {d28, d29} */ + "vshll.u8 q14, d2, #8\n\t" + "vshll.u8 q8, d1, #8\n\t" + "vshll.u8 q9, d0, #8\n\t" + "vsri.u16 q14, q8, #5\n\t" + "vsri.u16 q14, q9, #11\n\t" + + "vst1.16 { d28, d29 }, [%[DST]]" + : : [DST]"r" (dst), [SRC]"r" (src) + : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29" + ); +} + +void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length) +{ + quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x; + + int i = 0; + for (; i < length - 7; i += 8) + convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]); + + if (i < length) { + quint32 srcBuffer[8]; + quint16 dstBuffer[8]; + + int tail = length - i; + for (int j = 0; j < tail; ++j) + srcBuffer[j] = buffer[i + j]; + + convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer); + + for (int j = 0; j < tail; ++j) + data[i + j] = dstBuffer[j]; + } +} + QT_END_NAMESPACE #endif // QT_HAVE_NEON diff --git a/src/gui/painting/qdrawhelper_neon_asm.S b/src/gui/painting/qdrawhelper_neon_asm.S new file mode 100644 index 0000000..9992817 --- /dev/null +++ b/src/gui/painting/qdrawhelper_neon_asm.S @@ -0,0 +1,192 @@ +/**************************************************************************** +** +** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the QtGui module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. +** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +/* Prevent the stack from becoming executable for no reason... */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +.text +.fpu neon +.arch armv7a +.altmacro + +/* void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha) */ + + .func blend_8_pixels_argb32_on_rgb16_neon + .global blend_8_pixels_argb32_on_rgb16_neon + /* For ELF format also set function visibility to hidden */ +#ifdef __ELF__ + .hidden blend_8_pixels_argb32_on_rgb16_neon + .type blend_8_pixels_argb32_on_rgb16_neon, %function +#endif +blend_8_pixels_argb32_on_rgb16_neon: + vld4.8 { d0, d1, d2, d3 }, [r1] + vld1.16 { d4, d5 }, [r0] + + cmp r2, #256 + beq .blend_32_inner + + vdup.8 d6, r2 + + /* multiply by const_alpha */ + vmull.u8 q8, d6, d0 + vmull.u8 q9, d6, d1 + vmull.u8 q10, d6, d2 + vmull.u8 q11, d6, d3 + + vshrn.u16 d0, q8, #8 + vshrn.u16 d1, q9, #8 + vshrn.u16 d2, q10, #8 + vshrn.u16 d3, q11, #8 + +.blend_32_inner: + /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format + and put data into d6 - red, d7 - green, d30 - blue */ + vshrn.u16 d6, q2, #8 + vshrn.u16 d7, q2, #3 + vsli.u16 q2, q2, #5 + vsri.u8 d6, d6, #5 + vmvn.8 d3, d3 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q2, #2 + + pld [r0, #128] + + /* now do alpha blending, storing results in 8-bit planar format + into d16 - red, d19 - green, d18 - blue */ + vmull.u8 q10, d3, d6 + vmull.u8 q11, d3, d7 + vmull.u8 q12, d3, d30 + vrshr.u16 q13, q10, #8 + vrshr.u16 q3, q11, #8 + vrshr.u16 q15, q12, #8 + vraddhn.u16 d20, q10, q13 + vraddhn.u16 d23, q11, q3 + vraddhn.u16 d22, q12, q15 + vqadd.u8 d16, d2, d20 + vqadd.u8 q9, q0, q11 + /* convert the result to r5g6b5 and store it into {d28, d29} */ + vshll.u8 q14, d16, #8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 + + vst1.16 { d28, d29 }, [r0] + + bx lr + + .endfunc + +/* void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha) */ + + .func blend_8_pixels_rgb16_on_rgb16_neon + .global blend_8_pixels_rgb16_on_rgb16_neon + /* For ELF format also set function visibility to hidden */ +#ifdef __ELF__ + .hidden blend_8_pixels_rgb16_on_rgb16_neon + .type blend_8_pixels_rgb16_on_rgb16_neon, %function +#endif +blend_8_pixels_rgb16_on_rgb16_neon: + vld1.16 { d0, d1 }, [r0] + vld1.16 { d2, d3 }, [r1] + + rsb r3, r2, #256 + vdup.8 d4, r2 + vdup.8 d5, r3 + + /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format + and put data into d6 - red, d7 - green, d30 - blue */ + vshrn.u16 d6, q0, #8 + vshrn.u16 d7, q0, #3 + vsli.u16 q0, q0, #5 + vsri.u8 d6, d6, #5 + vsri.u8 d7, d7, #6 + vshrn.u16 d30, q0, #2 + + /* same from {d2, d3} into {d26, d27, d28} */ + vshrn.u16 d26, q1, #8 + vshrn.u16 d27, q1, #3 + vsli.u16 q1, q1, #5 + vsri.u8 d26, d26, #5 + vsri.u8 d27, d27, #6 + vshrn.u16 d28, q1, #2 + + /* multiply dst by inv const_alpha */ + vmull.u8 q10, d5, d6 + vmull.u8 q11, d5, d7 + vmull.u8 q12, d5, d30 + + vshrn.u16 d6, q10, #8 + vshrn.u16 d7, q11, #8 + vshrn.u16 d30, q12, #8 + + /* multiply src by const_alpha */ + vmull.u8 q10, d4, d26 + vmull.u8 q11, d4, d27 + vmull.u8 q12, d4, d28 + + vshrn.u16 d26, q10, #8 + vshrn.u16 d27, q11, #8 + vshrn.u16 d28, q12, #8 + + /* preload dst + 128 */ + pld [r0, #128] + + /* add components, storing results in 8-bit planar format + into d16 - red, d19 - green, d18 - blue */ + vadd.u8 d16, d26, d6 + vadd.u8 d19, d27, d7 + vadd.u8 d18, d28, d30 + + /* convert the result to r5g6b5 and store it into {d28, d29} */ + vshll.u8 q14, d16, #8 + vshll.u8 q8, d19, #8 + vshll.u8 q9, d18, #8 + vsri.u16 q14, q8, #5 + vsri.u16 q14, q9, #11 + + vst1.16 { d28, d29 }, [r0] + + bx lr + + .endfunc diff --git a/src/gui/painting/qdrawhelper_neon_p.h b/src/gui/painting/qdrawhelper_neon_p.h index 1994441..d6a4509 100644 --- a/src/gui/painting/qdrawhelper_neon_p.h +++ b/src/gui/painting/qdrawhelper_neon_p.h @@ -69,6 +69,64 @@ void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl, int w, int h, int const_alpha); +void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); + +void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, + const uint *src, + int length, + uint const_alpha); + +void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha); + +void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer, + int x, int y, quint32 color, + const uchar *bitmap, + int mapWidth, int mapHeight, int mapStride, + const QClipData *clip); + +void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + int const_alpha); + +void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + int const_alpha); + +void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + const QTransform &targetRectTransform, + int const_alpha); + +void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + const QRectF &targetRect, + const QRectF &sourceRect, + const QRect &clip, + const QTransform &targetRectTransform, + int const_alpha); + +uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, + QRasterBuffer *rasterBuffer, + int x, int y, int length); + +void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, + int x, int y, const uint *buffer, int length); + #endif // QT_HAVE_NEON QT_END_NAMESPACE diff --git a/src/gui/painting/qpaintbuffer.cpp b/src/gui/painting/qpaintbuffer.cpp index 39b76c8..ca2077f 100644 --- a/src/gui/painting/qpaintbuffer.cpp +++ b/src/gui/painting/qpaintbuffer.cpp @@ -269,24 +269,304 @@ void QPaintBuffer::draw(QPainter *painter, int frame) const printf("\n"); #endif - if (painter && !painter->isActive()) - return; + processCommands(painter, frameStartIndex(frame), frameEndIndex(frame)); + +#ifdef QPAINTBUFFER_DEBUG_DRAW + qDebug() << "QPaintBuffer::draw() -------------------------------- DONE!"; +#endif +} + +int QPaintBuffer::frameStartIndex(int frame) const +{ + return (frame == 0) ? 0 : d_ptr->frames.at(frame - 1); +} + +int QPaintBuffer::frameEndIndex(int frame) const +{ + return (frame == d_ptr->frames.size()) ? d_ptr->commands.size() : d_ptr->frames.at(frame); +} + +int QPaintBuffer::processCommands(QPainter *painter, int begin, int end) const +{ + if (!painter || !painter->isActive()) + return 0; QPaintEngineEx *xengine = painter->paintEngine()->isExtended() ? (QPaintEngineEx *) painter->paintEngine() : 0; if (xengine) { QPaintEngineExReplayer player; - player.draw(*this, painter, frame); + player.processCommands(*this, painter, begin, end); } else { QPainterReplayer player; - player.draw(*this, painter, frame); + player.processCommands(*this, painter, begin, end); } -#ifdef QPAINTBUFFER_DEBUG_DRAW - qDebug() << "QPaintBuffer::draw() -------------------------------- DONE!"; -#endif + int depth = 0; + for (int i = begin; i < end; ++i) { + const QPaintBufferCommand &cmd = d_ptr->commands.at(i); + if (cmd.id == QPaintBufferPrivate::Cmd_Save) + ++depth; + else if (cmd.id == QPaintBufferPrivate::Cmd_Restore) + --depth; + } + return depth; } +QString QPaintBuffer::commandDescription(int command) const +{ + QString desc; + QDebug debug(&desc); + + const QPaintBufferCommand &cmd = d_ptr->commands.at(command); + + switch (cmd.id) { + case QPaintBufferPrivate::Cmd_Save: { + debug << "Cmd_Save"; + break; } + + case QPaintBufferPrivate::Cmd_Restore: { + debug << "Cmd_Restore"; + break; } + + case QPaintBufferPrivate::Cmd_SetBrush: { + QBrush brush = qVariantValue<QBrush>(d_ptr->variants.at(cmd.offset)); + debug << "Cmd_SetBrush: " << brush; + break; } + + case QPaintBufferPrivate::Cmd_SetBrushOrigin: { + debug << "Cmd_SetBrushOrigin: " << d_ptr->variants.at(cmd.offset).toPointF(); + break; } + + case QPaintBufferPrivate::Cmd_SetCompositionMode: { + QPainter::CompositionMode mode = (QPainter::CompositionMode) cmd.extra; + debug << "ExCmd_SetCompositionMode, mode: " << mode; + break; } + + case QPaintBufferPrivate::Cmd_SetOpacity: { + debug << "ExCmd_SetOpacity: " << d_ptr->variants.at(cmd.offset).toDouble(); + break; } + + case QPaintBufferPrivate::Cmd_DrawVectorPath: { + debug << "ExCmd_DrawVectorPath: size: " << cmd.size +// << ", hints:" << d->ints[cmd.offset2+cmd.size] + << "pts/elms:" << cmd.offset << cmd.offset2; + break; } + + case QPaintBufferPrivate::Cmd_StrokeVectorPath: { + QPen pen = qVariantValue<QPen>(d_ptr->variants.at(cmd.extra)); + debug << "ExCmd_StrokeVectorPath: size: " << cmd.size +// << ", hints:" << d->ints[cmd.offset2+cmd.size] + << "pts/elms:" << cmd.offset << cmd.offset2 << pen; + break; } + + case QPaintBufferPrivate::Cmd_FillVectorPath: { + QBrush brush = qVariantValue<QBrush>(d_ptr->variants.at(cmd.extra)); + debug << "ExCmd_FillVectorPath: size: " << cmd.size +// << ", hints:" << d->ints[cmd.offset2+cmd.size] + << "pts/elms:" << cmd.offset << cmd.offset2 << brush; + break; } + + case QPaintBufferPrivate::Cmd_FillRectBrush: { + QBrush brush = qVariantValue<QBrush>(d_ptr->variants.at(cmd.extra)); + QRectF *rect = (QRectF *)(d_ptr->floats.constData() + cmd.offset); + debug << "ExCmd_FillRectBrush, offset: " << cmd.offset << " rect: " << *rect << " brush: " << brush; + break; } + + case QPaintBufferPrivate::Cmd_FillRectColor: { + QColor color = qVariantValue<QColor>(d_ptr->variants.at(cmd.extra)); + QRectF *rect = (QRectF *)(d_ptr->floats.constData() + cmd.offset); + debug << "ExCmd_FillRectBrush, offset: " << cmd.offset << " rect: " << *rect << " color: " << color; + break; } + + case QPaintBufferPrivate::Cmd_DrawPolygonF: { + debug << "ExCmd_DrawPolygonF, offset: " << cmd.offset << " size: " << cmd.size + << " mode: " << cmd.extra + << d_ptr->floats.at(cmd.offset) + << d_ptr->floats.at(cmd.offset+1); + break; } + + case QPaintBufferPrivate::Cmd_DrawPolygonI: { + debug << "ExCmd_DrawPolygonI, offset: " << cmd.offset << " size: " << cmd.size + << " mode: " << cmd.extra + << d_ptr->ints.at(cmd.offset) + << d_ptr->ints.at(cmd.offset+1); + break; } + + case QPaintBufferPrivate::Cmd_DrawEllipseF: { + debug << "ExCmd_DrawEllipseF, offset: " << cmd.offset; + break; } + + case QPaintBufferPrivate::Cmd_DrawLineF: { + debug << "ExCmd_DrawLineF, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_DrawLineI: { + debug << "ExCmd_DrawLineI, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_DrawPointsF: { + debug << "ExCmd_DrawPointsF, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_DrawPointsI: { + debug << "ExCmd_DrawPointsI, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_DrawPolylineF: { + debug << "ExCmd_DrawPolylineF, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_DrawPolylineI: { + debug << "ExCmd_DrawPolylineI, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_DrawRectF: { + debug << "ExCmd_DrawRectF, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_DrawRectI: { + debug << "ExCmd_DrawRectI, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_SetClipEnabled: { + bool clipEnabled = d_ptr->variants.at(cmd.offset).toBool(); + debug << "ExCmd_SetClipEnabled:" << clipEnabled; + break; } + + case QPaintBufferPrivate::Cmd_ClipVectorPath: { + QVectorPathCmd path(d_ptr, cmd); + debug << "ExCmd_ClipVectorPath:" << path().elementCount(); + break; } + + case QPaintBufferPrivate::Cmd_ClipRect: { + QRect rect(QPoint(d_ptr->ints.at(cmd.offset), d_ptr->ints.at(cmd.offset + 1)), + QPoint(d_ptr->ints.at(cmd.offset + 2), d_ptr->ints.at(cmd.offset + 3))); + debug << "ExCmd_ClipRect:" << rect << cmd.extra; + break; } + + case QPaintBufferPrivate::Cmd_ClipRegion: { + QRegion region(d_ptr->variants.at(cmd.offset).value<QRegion>()); + debug << "ExCmd_ClipRegion:" << region.boundingRect() << cmd.extra; + break; } + + case QPaintBufferPrivate::Cmd_SetPen: { + QPen pen = qVariantValue<QPen>(d_ptr->variants.at(cmd.offset)); + debug << "Cmd_SetPen: " << pen; + break; } + + case QPaintBufferPrivate::Cmd_SetTransform: { + QTransform xform = qVariantValue<QTransform>(d_ptr->variants.at(cmd.offset)); + debug << "Cmd_SetTransform, offset: " << cmd.offset << xform; + break; } + + case QPaintBufferPrivate::Cmd_SetRenderHints: { + debug << "Cmd_SetRenderHints, hints: " << cmd.extra; + break; } + + case QPaintBufferPrivate::Cmd_SetBackgroundMode: { + debug << "Cmd_SetBackgroundMode: " << cmd.extra; + break; } + + case QPaintBufferPrivate::Cmd_DrawConvexPolygonF: { + debug << "Cmd_DrawConvexPolygonF, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_DrawConvexPolygonI: { + debug << "Cmd_DrawConvexPolygonI, offset: " << cmd.offset << " size: " << cmd.size; + break; } + + case QPaintBufferPrivate::Cmd_DrawEllipseI: { + debug << "Cmd_DrawEllipseI, offset: " << cmd.offset; + break; } + + case QPaintBufferPrivate::Cmd_DrawPixmapRect: { + QPixmap pm(d_ptr->variants.at(cmd.offset).value<QPixmap>()); + QRectF r(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1), + d_ptr->floats.at(cmd.extra+2), d_ptr->floats.at(cmd.extra+3)); + + QRectF sr(d_ptr->floats.at(cmd.extra+4), d_ptr->floats.at(cmd.extra+5), + d_ptr->floats.at(cmd.extra+6), d_ptr->floats.at(cmd.extra+7)); + debug << "Cmd_DrawPixmapRect:" << r << sr << pm.size(); + break; } + + case QPaintBufferPrivate::Cmd_DrawPixmapPos: { + QPixmap pm(d_ptr->variants.at(cmd.offset).value<QPixmap>()); + QPointF pos(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1)); + debug << "Cmd_DrawPixmapPos:" << pos << pm.size(); + break; } + + case QPaintBufferPrivate::Cmd_DrawTiledPixmap: { + QPixmap pm(d_ptr->variants.at(cmd.offset).value<QPixmap>()); + QRectF r(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1), + d_ptr->floats.at(cmd.extra+2), d_ptr->floats.at(cmd.extra+3)); + + QPointF offset(d_ptr->floats.at(cmd.extra+4), d_ptr->floats.at(cmd.extra+5)); + debug << "Cmd_DrawTiledPixmap:" << r << offset << pm.size(); + break; } + + case QPaintBufferPrivate::Cmd_DrawImageRect: { + QImage image(d_ptr->variants.at(cmd.offset).value<QImage>()); + QRectF r(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1), + d_ptr->floats.at(cmd.extra+2), d_ptr->floats.at(cmd.extra+3)); + QRectF sr(d_ptr->floats.at(cmd.extra+4), d_ptr->floats.at(cmd.extra+5), + d_ptr->floats.at(cmd.extra+6), d_ptr->floats.at(cmd.extra+7)); + debug << "Cmd_DrawImageRect:" << r << sr << image.size(); + break; } + + case QPaintBufferPrivate::Cmd_DrawImagePos: { + QImage image(d_ptr->variants.at(cmd.offset).value<QImage>()); + QPointF pos(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1)); + debug << "Cmd_DrawImagePos:" << pos << image.size(); + break; } + + case QPaintBufferPrivate::Cmd_DrawText: { + QPointF pos(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1)); + QList<QVariant> variants(d_ptr->variants.at(cmd.offset).value<QList<QVariant> >()); + + QFont font(variants.at(0).value<QFont>()); + QString text(variants.at(1).value<QString>()); + + debug << "Cmd_DrawText:" << pos << text << font.family(); + break; } + + case QPaintBufferPrivate::Cmd_DrawTextItem: { + QPointF pos(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1)); + QTextItemIntCopy *tiCopy = reinterpret_cast<QTextItemIntCopy *>(qVariantValue<void *>(d_ptr->variants.at(cmd.offset))); + QTextItemInt &ti = (*tiCopy)(); + QString text(ti.text()); + + QFont font(ti.font()); + font.setUnderline(false); + font.setStrikeOut(false); + font.setOverline(false); + + const QTextItemInt &si = static_cast<const QTextItemInt &>(ti); + qreal justificationWidth = 0; + if (si.justified) + justificationWidth = si.width.toReal(); + + debug << "Cmd_DrawTextItem:" << pos << " " << text; + break; } + case QPaintBufferPrivate::Cmd_SystemStateChanged: { + QRegion systemClip(d_ptr->variants.at(cmd.offset).value<QRegion>()); + + debug << "Cmd_SystemStateChanged:" << systemClip; + break; } + case QPaintBufferPrivate::Cmd_Translate: { + QPointF delta(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1)); + debug << "Cmd_Translate:" << delta; + break; } + case QPaintBufferPrivate::Cmd_DrawStaticText: { + QPointF delta(d_ptr->floats.at(cmd.extra), d_ptr->floats.at(cmd.extra+1)); + QVariantList variants(d_ptr->variants.at(cmd.offset).value<QVariantList>()); + + QStaticText text(variants.at(0).value<QStaticText>()); + debug << "Cmd_DrawStaticText:" << text.text(); + break; } + } + + return desc; +} QRectF QPaintBuffer::boundingRect() const { @@ -1110,15 +1390,12 @@ void QPainterReplayer::setupTransform(QPainter *_painter) painter->setTransform(m_world_matrix); } -void QPainterReplayer::draw(const QPaintBuffer &buffer, QPainter *_painter, int frame) +void QPainterReplayer::processCommands(const QPaintBuffer &buffer, QPainter *p, int begin, int end) { d = buffer.d_ptr; - setupTransform(_painter); - - int frameStart = (frame == 0) ? 0 : d->frames.at(frame-1); - int frameEnd = (frame == d->frames.size()) ? d->commands.size() : d->frames.at(frame); + painter = p; - for (int cmdIndex=frameStart; cmdIndex<frameEnd; ++cmdIndex) { + for (int cmdIndex = begin; cmdIndex < end; ++cmdIndex) { const QPaintBufferCommand &cmd = d->commands.at(cmdIndex); process(cmd); } diff --git a/src/gui/painting/qpaintbuffer_p.h b/src/gui/painting/qpaintbuffer_p.h index 0fde290..4576947 100644 --- a/src/gui/painting/qpaintbuffer_p.h +++ b/src/gui/painting/qpaintbuffer_p.h @@ -78,6 +78,12 @@ public: int numFrames() const; void draw(QPainter *painter, int frame = 0) const; + + int frameStartIndex(int frame) const; + int frameEndIndex(int frame) const; + int processCommands(QPainter *painter, int begin, int end) const; + QString commandDescription(int command) const; + void setBoundingRect(const QRectF &rect); QRectF boundingRect() const; @@ -317,7 +323,7 @@ public: void setupTransform(QPainter *painter); virtual void process(const QPaintBufferCommand &cmd); - void draw(const QPaintBuffer &buffer, QPainter *painter, int frame); + void processCommands(const QPaintBuffer &buffer, QPainter *painter, int begin, int end); protected: QPaintBufferPrivate *d; diff --git a/src/gui/painting/qtransform.cpp b/src/gui/painting/qtransform.cpp index 4f42a58..988d678 100644 --- a/src/gui/painting/qtransform.cpp +++ b/src/gui/painting/qtransform.cpp @@ -1037,8 +1037,18 @@ QDataStream & operator>>(QDataStream &s, QTransform &t) #ifndef QT_NO_DEBUG_STREAM QDebug operator<<(QDebug dbg, const QTransform &m) { - dbg.nospace() << "QTransform(" - << "11=" << m.m11() + static const char *typeStr[] = + { + "TxNone", + "TxTranslate", + "TxScale", + "TxRotate", + "TxShear", + "TxProject" + }; + + dbg.nospace() << "QTransform(type=" << typeStr[m.type()] << ',' + << " 11=" << m.m11() << " 12=" << m.m12() << " 13=" << m.m13() << " 21=" << m.m21() @@ -1048,6 +1058,7 @@ QDebug operator<<(QDebug dbg, const QTransform &m) << " 32=" << m.m32() << " 33=" << m.m33() << ')'; + return dbg.space(); } #endif diff --git a/src/plugins/bearer/corewlan/qcorewlanengine.h b/src/plugins/bearer/corewlan/qcorewlanengine.h index 5e93193..11f5d96 100644 --- a/src/plugins/bearer/corewlan/qcorewlanengine.h +++ b/src/plugins/bearer/corewlan/qcorewlanengine.h @@ -102,7 +102,7 @@ protected: void getUserConfigurations(); QString getNetworkNameFromSsid(const QString &ssid); QString getSsidFromNetworkName(const QString &name); - QStringList foundNetwork(const QString &id, const QString &ssid, const QNetworkConfiguration::StateFlags state, const QString &interfaceName); + QStringList foundNetwork(const QString &id, const QString &ssid, const QNetworkConfiguration::StateFlags state, const QString &interfaceName, const QNetworkConfiguration::Purpose purpose); }; QT_END_NAMESPACE diff --git a/src/plugins/bearer/corewlan/qcorewlanengine.mm b/src/plugins/bearer/corewlan/qcorewlanengine.mm index a366d00..b59ccee 100644 --- a/src/plugins/bearer/corewlan/qcorewlanengine.mm +++ b/src/plugins/bearer/corewlan/qcorewlanengine.mm @@ -271,7 +271,7 @@ void QCoreWlanEngine::connectToId(const QString &id) SecKeychainAttributeList attributeList = {3,attributes}; SecKeychainSearchRef searchRef; - OSErr result = SecKeychainSearchCreateFromAttributes(NULL, kSecGenericPasswordItemClass, &attributeList, &searchRef); + SecKeychainSearchCreateFromAttributes(NULL, kSecGenericPasswordItemClass, &attributeList, &searchRef); NSString *password = @""; SecKeychainItemRef searchItem; @@ -429,7 +429,14 @@ QStringList QCoreWlanEngine::scanForSsids(const QString &interfaceName) state = QNetworkConfiguration::Undefined; } } - found.append(foundNetwork(id, networkSsid, state, interfaceName)); + QNetworkConfiguration::Purpose purpose = QNetworkConfiguration::UnknownPurpose; + if([[apNetwork securityMode] intValue] == kCWSecurityModeOpen) { + purpose = QNetworkConfiguration::PublicPurpose; + } else { + purpose = QNetworkConfiguration::PrivatePurpose; + } + + found.append(foundNetwork(id, networkSsid, state, interfaceName, purpose)); } //end row } //end error @@ -470,13 +477,13 @@ QStringList QCoreWlanEngine::scanForSsids(const QString &interfaceName) state = QNetworkConfiguration::Defined; } - found.append(foundNetwork(id, networkName, state, interfaceName)); + found.append(foundNetwork(id, networkName, state, interfaceName, QNetworkConfiguration::UnknownPurpose)); } } return found; } -QStringList QCoreWlanEngine::foundNetwork(const QString &id, const QString &name, const QNetworkConfiguration::StateFlags state, const QString &interfaceName) +QStringList QCoreWlanEngine::foundNetwork(const QString &id, const QString &name, const QNetworkConfiguration::StateFlags state, const QString &interfaceName, const QNetworkConfiguration::Purpose purpose) { QStringList found; QMutexLocker locker(&mutex); @@ -507,6 +514,10 @@ QStringList QCoreWlanEngine::foundNetwork(const QString &id, const QString &name changed = true; } + if (ptr->purpose != purpose) { + ptr->purpose = purpose; + changed = true; + } ptr->mutex.unlock(); if (changed) { @@ -524,6 +535,7 @@ QStringList QCoreWlanEngine::foundNetwork(const QString &id, const QString &name ptr->state = state; ptr->type = QNetworkConfiguration::InternetAccessPoint; ptr->bearer = QLatin1String("WLAN"); + ptr->purpose = purpose; accessPointConfigurations.insert(ptr->id, ptr); configurationInterface.insert(ptr->id, interfaceName); diff --git a/src/plugins/bearer/networkmanager/qnetworkmanagerengine.cpp b/src/plugins/bearer/networkmanager/qnetworkmanagerengine.cpp index 28ee38e..72d6838 100644 --- a/src/plugins/bearer/networkmanager/qnetworkmanagerengine.cpp +++ b/src/plugins/bearer/networkmanager/qnetworkmanagerengine.cpp @@ -602,7 +602,11 @@ void QNetworkManagerEngine::newAccessPoint(const QString &path, const QDBusObjec ptr->isValid = true; ptr->id = QString::number(qHash(objectPath.path())); ptr->type = QNetworkConfiguration::InternetAccessPoint; - ptr->purpose = QNetworkConfiguration::PublicPurpose; + if(accessPoint->flags() == NM_802_11_AP_FLAGS_PRIVACY) { + ptr->purpose = QNetworkConfiguration::PrivatePurpose; + } else { + ptr->purpose = QNetworkConfiguration::PublicPurpose; + } ptr->state = QNetworkConfiguration::Undefined; ptr->bearer = QLatin1String("WLAN"); @@ -718,6 +722,7 @@ QNetworkConfigurationPrivate *QNetworkManagerEngine::parseConnection(const QStri if (connectionType == QLatin1String("802-3-ethernet")) { cpPriv->bearer = QLatin1String("Ethernet"); + cpPriv->purpose = QNetworkConfiguration::PublicPurpose; foreach (const QDBusObjectPath &devicePath, interface->getDevices()) { QNetworkManagerInterfaceDevice device(devicePath.path()); @@ -734,7 +739,12 @@ QNetworkConfigurationPrivate *QNetworkManagerEngine::parseConnection(const QStri cpPriv->bearer = QLatin1String("WLAN"); const QString connectionSsid = map.value("802-11-wireless").value("ssid").toString(); - + const QString connectionSecurity = map.value("802-11-wireless").value("security").toString(); + if(!connectionSecurity.isEmpty()) { + cpPriv->purpose = QNetworkConfiguration::PrivatePurpose; + } else { + cpPriv->purpose = QNetworkConfiguration::PublicPurpose; + } for (int i = 0; i < accessPoints.count(); ++i) { if (connectionSsid == accessPoints.at(i)->ssid()) { cpPriv->state |= QNetworkConfiguration::Discovered; diff --git a/src/plugins/mediaservices/directshow/mediaplayer/directshowmediatype.cpp b/src/plugins/mediaservices/directshow/mediaplayer/directshowmediatype.cpp index cf6d45b..f8f519d 100644 --- a/src/plugins/mediaservices/directshow/mediaplayer/directshowmediatype.cpp +++ b/src/plugins/mediaservices/directshow/mediaplayer/directshowmediatype.cpp @@ -130,9 +130,17 @@ QVideoSurfaceFormat DirectShowMediaType::formatFromType(const AM_MEDIA_TYPE &typ if (header->AvgTimePerFrame > 0) format.setFrameRate(10000 /header->AvgTimePerFrame); - format.setScanLineDirection(header->bmiHeader.biHeight < 0 - ? QVideoSurfaceFormat::TopToBottom - : QVideoSurfaceFormat::BottomToTop); + switch (qt_typeLookup[i].pixelFormat) { + case QVideoFrame::Format_RGB32: + case QVideoFrame::Format_BGR24: + case QVideoFrame::Format_RGB565: + case QVideoFrame::Format_RGB555: + if (header->bmiHeader.biHeight >= 0) + format.setScanLineDirection(QVideoSurfaceFormat::BottomToTop); + break; + default: + break; + } return format; } else if (IsEqualGUID(type.formattype, FORMAT_VideoInfo2)) { @@ -145,9 +153,17 @@ QVideoSurfaceFormat DirectShowMediaType::formatFromType(const AM_MEDIA_TYPE &typ if (header->AvgTimePerFrame > 0) format.setFrameRate(10000 / header->AvgTimePerFrame); - format.setScanLineDirection(header->bmiHeader.biHeight < 0 - ? QVideoSurfaceFormat::TopToBottom - : QVideoSurfaceFormat::BottomToTop); + switch (qt_typeLookup[i].pixelFormat) { + case QVideoFrame::Format_RGB32: + case QVideoFrame::Format_BGR24: + case QVideoFrame::Format_RGB565: + case QVideoFrame::Format_RGB555: + if (header->bmiHeader.biHeight >= 0) + format.setScanLineDirection(QVideoSurfaceFormat::BottomToTop); + break; + default: + break; + } return format; } diff --git a/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.cpp b/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.cpp index d54d188..a5f143f 100644 --- a/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.cpp +++ b/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.cpp @@ -191,6 +191,8 @@ void DirectShowPlayerService::load(const QMediaContent &media, QIODevice *stream m_atEnd = false; m_metaDataControl->updateGraph(0, 0); + QCoreApplication::postEvent(this, new QEvent(QEvent::Type(VideoOutputChange))); + if (m_resources.isEmpty() && !stream) { m_pendingTasks = 0; m_graphStatus = NoMedia; @@ -464,6 +466,8 @@ void DirectShowPlayerService::doRender(QMutexLocker *locker) QCoreApplication::postEvent(this, new QEvent(QEvent::Type(Error))); } + QCoreApplication::postEvent(this, new QEvent(QEvent::Type(VideoOutputChange))); + m_executedTasks |= Render; } } @@ -1144,6 +1148,9 @@ void DirectShowPlayerService::customEvent(QEvent *event) QMutexLocker locker(&m_mutex); m_playerControl->updatePosition(m_position); + } else if (event->type() == QEvent::Type(VideoOutputChange)) { + if (m_videoWindowControl) + m_videoWindowControl->updateNativeSize(); } else { QMediaService::customEvent(event); } diff --git a/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.h b/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.h index 23515d0..d3ef809 100644 --- a/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.h +++ b/src/plugins/mediaservices/directshow/mediaplayer/directshowplayerservice.h @@ -164,7 +164,8 @@ private: DurationChange, StatusChange, EndOfMedia, - PositionChange + PositionChange, + VideoOutputChange }; enum GraphStatus diff --git a/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.cpp b/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.cpp index 4b9aeb8..e25dd99 100644 --- a/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.cpp +++ b/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.cpp @@ -51,6 +51,7 @@ Vmr9VideoWindowControl::Vmr9VideoWindowControl(QObject *parent) , m_filter(com_new<IBaseFilter>(CLSID_VideoMixingRenderer9, IID_IBaseFilter)) , m_windowId(0) , m_dirtyValues(0) + , m_aspectRatioMode(Qt::KeepAspectRatio) , m_brightness(0) , m_contrast(0) , m_hue(0) @@ -90,33 +91,30 @@ void Vmr9VideoWindowControl::setWinId(WId id) QRect Vmr9VideoWindowControl::displayRect() const { - QRect rect; - - if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>( - m_filter, IID_IVMRWindowlessControl9)) { - RECT sourceRect; - RECT displayRect; - - if (control->GetVideoPosition(&sourceRect, &displayRect) == S_OK) { - rect = QRect( - displayRect.left, - displayRect.bottom, - displayRect.right - displayRect.left, - displayRect.bottom - displayRect.top); - } - control->Release(); - } - return rect; + return m_displayRect; } void Vmr9VideoWindowControl::setDisplayRect(const QRect &rect) { + m_displayRect = rect; + if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>( m_filter, IID_IVMRWindowlessControl9)) { RECT sourceRect = { 0, 0, 0, 0 }; RECT displayRect = { rect.left(), rect.top(), rect.right(), rect.bottom() }; control->GetNativeVideoSize(&sourceRect.right, &sourceRect.bottom, 0, 0); + + if (m_aspectRatioMode == Qt::KeepAspectRatioByExpanding) { + QSize clippedSize = rect.size(); + clippedSize.scale(sourceRect.right, sourceRect.bottom, Qt::KeepAspectRatio); + + sourceRect.left = (sourceRect.right - clippedSize.width()) / 2; + sourceRect.top = (sourceRect.bottom - clippedSize.height()) / 2; + sourceRect.right = sourceRect.left + clippedSize.width(); + sourceRect.bottom = sourceRect.top + clippedSize.height(); + } + control->SetVideoPosition(&sourceRect, &displayRect); control->Release(); } @@ -134,7 +132,6 @@ void Vmr9VideoWindowControl::setFullScreen(bool fullScreen) void Vmr9VideoWindowControl::repaint() { - if (QWidget *widget = QWidget::find(m_windowId)) { HDC dc = widget->getDC(); if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>( @@ -164,21 +161,13 @@ QSize Vmr9VideoWindowControl::nativeSize() const Qt::AspectRatioMode Vmr9VideoWindowControl::aspectRatioMode() const { - Qt::AspectRatioMode mode = Qt::KeepAspectRatio; - - if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>( - m_filter, IID_IVMRWindowlessControl9)) { - DWORD arMode; - - if (control->GetAspectRatioMode(&arMode) == S_OK && arMode == VMR9ARMode_None) - mode = Qt::IgnoreAspectRatio; - control->Release(); - } - return mode; + return m_aspectRatioMode; } void Vmr9VideoWindowControl::setAspectRatioMode(Qt::AspectRatioMode mode) { + m_aspectRatioMode = mode; + if (IVMRWindowlessControl9 *control = com_cast<IVMRWindowlessControl9>( m_filter, IID_IVMRWindowlessControl9)) { switch (mode) { @@ -188,10 +177,15 @@ void Vmr9VideoWindowControl::setAspectRatioMode(Qt::AspectRatioMode mode) case Qt::KeepAspectRatio: control->SetAspectRatioMode(VMR9ARMode_LetterBox); break; + case Qt::KeepAspectRatioByExpanding: + control->SetAspectRatioMode(VMR9ARMode_LetterBox); + break; default: break; } control->Release(); + + setDisplayRect(m_displayRect); } } @@ -259,6 +253,13 @@ void Vmr9VideoWindowControl::setSaturation(int saturation) emit saturationChanged(saturation); } +void Vmr9VideoWindowControl::updateNativeSize() +{ + setDisplayRect(m_displayRect); + + emit nativeSizeChanged(); +} + void Vmr9VideoWindowControl::setProcAmpValues() { if (IVMRMixerControl9 *control = com_cast<IVMRMixerControl9>(m_filter, IID_IVMRMixerControl9)) { diff --git a/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.h b/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.h index bf4fb42..beac433 100644 --- a/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.h +++ b/src/plugins/mediaservices/directshow/mediaplayer/vmr9videowindowcontrol.h @@ -90,6 +90,8 @@ public: int saturation() const; void setSaturation(int saturation); + void updateNativeSize(); + private: void setProcAmpValues(); float scaleProcAmpValue( @@ -98,6 +100,8 @@ private: IBaseFilter *m_filter; WId m_windowId; DWORD m_dirtyValues; + Qt::AspectRatioMode m_aspectRatioMode; + QRect m_displayRect; int m_brightness; int m_contrast; int m_hue; diff --git a/src/plugins/mediaservices/gstreamer/qgstreamervideooverlay.cpp b/src/plugins/mediaservices/gstreamer/qgstreamervideooverlay.cpp index 427d514..f381f7f 100644 --- a/src/plugins/mediaservices/gstreamer/qgstreamervideooverlay.cpp +++ b/src/plugins/mediaservices/gstreamer/qgstreamervideooverlay.cpp @@ -192,21 +192,36 @@ void QGstreamerVideoOverlay::surfaceFormatChanged() void QGstreamerVideoOverlay::setScaledDisplayRect() { + QRect formatViewport = m_surface->surfaceFormat().viewport(); + switch (m_aspectRatioMode) { case Qt::KeepAspectRatio: { - QSize size = m_surface->surfaceFormat().viewport().size(); - + QSize size = m_surface->surfaceFormat().sizeHint(); size.scale(m_displayRect.size(), Qt::KeepAspectRatio); QRect rect(QPoint(0, 0), size); rect.moveCenter(m_displayRect.center()); m_surface->setDisplayRect(rect); + m_surface->setViewport(formatViewport); } break; case Qt::IgnoreAspectRatio: m_surface->setDisplayRect(m_displayRect); + m_surface->setViewport(formatViewport); + break; + case Qt::KeepAspectRatioByExpanding: + { + QSize size = m_displayRect.size(); + size.scale(m_surface->surfaceFormat().sizeHint(), Qt::KeepAspectRatio); + + QRect viewport(QPoint(0, 0), size); + viewport.moveCenter(formatViewport.center()); + + m_surface->setDisplayRect(m_displayRect); + m_surface->setViewport(viewport); + } break; }; } diff --git a/src/plugins/mediaservices/gstreamer/qx11videosurface.cpp b/src/plugins/mediaservices/gstreamer/qx11videosurface.cpp index cbd5a76..70b8527 100644 --- a/src/plugins/mediaservices/gstreamer/qx11videosurface.cpp +++ b/src/plugins/mediaservices/gstreamer/qx11videosurface.cpp @@ -213,6 +213,16 @@ void QX11VideoSurface::setDisplayRect(const QRect &rect) m_displayRect = rect; } +QRect QX11VideoSurface::viewport() const +{ + return m_viewport; +} + +void QX11VideoSurface::setViewport(const QRect &rect) +{ + m_viewport = rect; +} + int QX11VideoSurface::brightness() const { return getAttribute("XV_BRIGHTNESS", m_brightnessRange.first, m_brightnessRange.second); diff --git a/src/plugins/mediaservices/gstreamer/qx11videosurface.h b/src/plugins/mediaservices/gstreamer/qx11videosurface.h index 1be963e..10f79a6 100644 --- a/src/plugins/mediaservices/gstreamer/qx11videosurface.h +++ b/src/plugins/mediaservices/gstreamer/qx11videosurface.h @@ -67,6 +67,9 @@ public: QRect displayRect() const; void setDisplayRect(const QRect &rect); + QRect viewport() const; + void setViewport(const QRect &rect); + int brightness() const; void setBrightness(int brightness); diff --git a/tools/qttracereplay/main.cpp b/tools/qttracereplay/main.cpp index be7906b..101d512 100644 --- a/tools/qttracereplay/main.cpp +++ b/tools/qttracereplay/main.cpp @@ -49,7 +49,7 @@ class ReplayWidget : public QWidget { Q_OBJECT public: - ReplayWidget(const QString &filename, int from, int to, bool single); + ReplayWidget(const QString &filename, int from, int to, bool single, int frame); void paintEvent(QPaintEvent *event); void resizeEvent(QResizeEvent *event); @@ -66,27 +66,96 @@ public: QTime timer; QList<uint> visibleUpdates; - QList<uint> iterationTimes; + + QVector<uint> iterationTimes; QString filename; int from; int to; bool single; + + int frame; + int currentCommand; }; void ReplayWidget::updateRect() { - if (!visibleUpdates.isEmpty()) + if (frame >= 0 && !updates.isEmpty()) + update(updates.at(frame)); + else if (!visibleUpdates.isEmpty()) update(updates.at(visibleUpdates.at(currentFrame))); } +const int singleFrameRepeatsPerCommand = 100; +const int singleFrameIterations = 4; + void ReplayWidget::paintEvent(QPaintEvent *) { QPainter p(this); + QTimer::singleShot(0, this, SLOT(updateRect())); + // p.setClipRegion(frames.at(currentFrame).updateRegion); + if (frame >= 0) { + int start = buffer.frameStartIndex(frame); + int end = buffer.frameEndIndex(frame); + + iterationTimes.resize(end - start); + + int saveRestoreStackDepth = buffer.processCommands(&p, start, start + currentCommand); + + for (int i = 0; i < saveRestoreStackDepth; ++i) + p.restore(); + + const int repeats = currentIteration >= 3 ? singleFrameRepeatsPerCommand : 1; + + ++currentFrame; + if (currentFrame == repeats) { + currentFrame = 0; + if (currentIteration >= 3) { + iterationTimes[currentCommand - 1] = qMin(iterationTimes[currentCommand - 1], uint(timer.elapsed())); + timer.restart(); + } + + if (currentIteration >= singleFrameIterations + 3) { + printf(" # | ms | description\n"); + printf("------+---------+------------------------------------------------------------\n"); + + qSort(iterationTimes); + + int sum = 0; + for (int i = 0; i < iterationTimes.size(); ++i) { + int delta = iterationTimes.at(i); + if (i > 0) + delta -= iterationTimes.at(i-1); + sum += delta; + qreal deltaF = delta / qreal(repeats); + printf("%.5d | %.5f | %s\n", i, deltaF, qPrintable(buffer.commandDescription(start + i))); + } + printf("Total | %.5f | Total frame time\n", sum / qreal(repeats)); + deleteLater(); + return; + } + + if (start + currentCommand >= end) { + currentCommand = 1; + ++currentIteration; + if (currentIteration == 3) { + timer.start(); + iterationTimes.fill(uint(-1)); + } + if (currentIteration >= 3 && currentIteration < singleFrameIterations + 3) + printf("Profiling iteration %d of %d\n", currentIteration - 2, singleFrameIterations); + } else { + ++currentCommand; + } + } + + return; + } + buffer.draw(&p, visibleUpdates.at(currentFrame)); ++currentFrame; @@ -138,11 +207,9 @@ void ReplayWidget::paintEvent(QPaintEvent *) } } } - - QTimer::singleShot(0, this, SLOT(updateRect())); } -void ReplayWidget::resizeEvent(QResizeEvent *event) +void ReplayWidget::resizeEvent(QResizeEvent *) { visibleUpdates.clear(); @@ -162,13 +229,15 @@ void ReplayWidget::resizeEvent(QResizeEvent *event) } -ReplayWidget::ReplayWidget(const QString &filename_, int from_, int to_, bool single_) +ReplayWidget::ReplayWidget(const QString &filename_, int from_, int to_, bool single_, int frame_) : currentFrame(0) , currentIteration(0) , filename(filename_) , from(from_) , to(to_) , single(single_) + , frame(frame_) + , currentCommand(1) { setWindowTitle(filename); QFile file(filename); @@ -216,7 +285,8 @@ int main(int argc, char **argv) printf("Usage:\n > %s [OPTIONS] [traceFile]\n", argv[0]); printf("OPTIONS\n" " --range=from-to to specify a frame range.\n" - " --singlerun to do only one run (without statistics)\n"); + " --singlerun to do only one run (without statistics)\n" + " --instrumentframe=frame to instrument a single frame\n"); return 1; } @@ -228,6 +298,8 @@ int main(int argc, char **argv) bool single = false; + int frame = -1; + int from = 0; int to = -1; for (int i = 1; i < app.arguments().size() - 1; ++i) { @@ -253,13 +325,22 @@ int main(int argc, char **argv) } } else if (arg == QLatin1String("--singlerun")) { single = true; + } else if (arg.startsWith(QLatin1String("--instrumentframe="))) { + QString rest = arg.mid(18); + bool ok = false; + int frameCandidate = rest.toInt(&ok); + if (ok) { + frame = frameCandidate; + } else { + printf("ERROR: malformed syntax in argument %s\n", qPrintable(arg)); + } } else { printf("Unrecognized argument: %s\n", qPrintable(arg)); return 1; } } - ReplayWidget *widget = new ReplayWidget(app.arguments().last(), from, to, single); + ReplayWidget *widget = new ReplayWidget(app.arguments().last(), from, to, single, frame); if (!widget->updates.isEmpty()) { widget->show(); |