summaryrefslogtreecommitdiffstats
path: root/src/gui
diff options
context:
space:
mode:
authorOlivier Goffart <olivier.goffart@nokia.com>2010-09-03 15:13:15 (GMT)
committerOlivier Goffart <olivier.goffart@nokia.com>2010-09-08 12:00:15 (GMT)
commit320ab7a9cfc484025f703dfdd39349a5f94fbe69 (patch)
tree64fce61a27709a270cbdc8d0721ef96241696f1e /src/gui
parent0be5e5e86522616543975331c1f979147d4c8cdb (diff)
downloadQt-320ab7a9cfc484025f703dfdd39349a5f94fbe69.zip
Qt-320ab7a9cfc484025f703dfdd39349a5f94fbe69.tar.gz
Qt-320ab7a9cfc484025f703dfdd39349a5f94fbe69.tar.bz2
qdrawhelper: use SSE2 for interpolation in fetchTransformedBilinear
when scale down, with no rotations Process the pixel 4 by 4 and do the interpolation using SSE2 Reviewed-by: Samuel
Diffstat (limited to 'src/gui')
-rw-r--r--src/gui/painting/qdrawhelper.cpp77
1 files changed, 77 insertions, 0 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 6d921da..a302a83 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -675,6 +675,43 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i
return (((tlrb + trrb + blrb + brrb) >> 8) & 0x00ff00ff) | ((tlag + trag + blag + brag) & 0xff00ff00);
}
+#if defined(QT_ALWAYS_HAVE_SSE2)
+#define interpolate_4_pixels_16_sse2(tl, tr, bl, br, distx, disty, colorMask, v_256, b) \
+{ \
+ const __m128i dxdy = _mm_mullo_epi16 (distx, disty); \
+ const __m128i distx_ = _mm_slli_epi16(distx, 4); \
+ const __m128i disty_ = _mm_slli_epi16(disty, 4); \
+ const __m128i idxidy = _mm_add_epi16(dxdy, _mm_sub_epi16(v_256, _mm_add_epi16(distx_, disty_))); \
+ const __m128i dxidy = _mm_sub_epi16(distx_, dxdy); \
+ const __m128i idxdy = _mm_sub_epi16(disty_, dxdy); \
+ \
+ __m128i tlAG = _mm_srli_epi16(tl, 8); \
+ __m128i tlRB = _mm_and_si128(tl, colorMask); \
+ __m128i trAG = _mm_srli_epi16(tr, 8); \
+ __m128i trRB = _mm_and_si128(tr, colorMask); \
+ __m128i blAG = _mm_srli_epi16(bl, 8); \
+ __m128i blRB = _mm_and_si128(bl, colorMask); \
+ __m128i brAG = _mm_srli_epi16(br, 8); \
+ __m128i brRB = _mm_and_si128(br, colorMask); \
+ \
+ tlAG = _mm_mullo_epi16(tlAG, idxidy); \
+ tlRB = _mm_mullo_epi16(tlRB, idxidy); \
+ trAG = _mm_mullo_epi16(trAG, dxidy); \
+ trRB = _mm_mullo_epi16(trRB, dxidy); \
+ blAG = _mm_mullo_epi16(blAG, idxdy); \
+ blRB = _mm_mullo_epi16(blRB, idxdy); \
+ brAG = _mm_mullo_epi16(brAG, dxdy); \
+ brRB = _mm_mullo_epi16(brRB, dxdy); \
+ \
+ /* Add the values, and shift to only keep 8 significant bits per colors */ \
+ __m128i rAG =_mm_add_epi16(_mm_add_epi16(tlAG, trAG), _mm_add_epi16(blAG, brAG)); \
+ __m128i rRB =_mm_add_epi16(_mm_add_epi16(tlRB, trRB), _mm_add_epi16(blRB, brRB)); \
+ rAG = _mm_andnot_si128(colorMask, rAG); \
+ rRB = _mm_srli_epi16(rRB, 8); \
+ _mm_storeu_si128((__m128i*)(b), _mm_or_si128(rAG, rRB)); \
+}
+#endif
+
template<TextureBlendType blendType>
Q_STATIC_TEMPLATE_FUNCTION inline void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2)
@@ -882,6 +919,46 @@ const uint * QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator *
const uchar *s1 = data->texture.scanLine(y1);
const uchar *s2 = data->texture.scanLine(y2);
int disty = (fy & 0x0000ffff) >> 12;
+
+#if defined(QT_ALWAYS_HAVE_SSE2)
+ const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+ //const __m128i distShuffleMask = _mm_set_epi8(13, 12, 13, 12, 9, 8, 9, 8, 5, 4, 5, 4, 1, 0, 1, 0);
+ const __m128i v_256 = _mm_set1_epi16(256);
+ const __m128i v_disty = _mm_set1_epi16(disty);
+ __m128i v_fdx = _mm_set1_epi32(fdx*4);
+
+ union Vect_buffer { __m128i vect; quint32 i[4]; };
+ Vect_buffer v_fx;
+
+ for (int i = 0; i < 4; i++) {
+ v_fx.i[i] = fx;
+ fx += fdx;
+ }
+
+ while (b < end-3) {
+
+ Vect_buffer tl, tr, bl, br;
+
+ for (int i = 0; i < 4; i++) {
+ int x1 = v_fx.i[i] >> 16;
+ int x2;
+ fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
+ tl.i[i] = fetch(s1, x1, data->texture.colorTable);
+ tr.i[i] = fetch(s1, x2, data->texture.colorTable);
+ bl.i[i] = fetch(s2, x1, data->texture.colorTable);
+ br.i[i] = fetch(s2, x2, data->texture.colorTable);
+ }
+ __m128i v_distx = _mm_srli_epi16(v_fx.vect, 12); //distx = (fx & 0x0000ffff) >> 12;
+ //v_distx = _mm_shuffle_epi8(v_disty, distShuffleMask); //distx |= distx << 16;
+ v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+ v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+
+ interpolate_4_pixels_16_sse2(tl.vect, tr.vect, bl.vect, br.vect, v_distx, v_disty, colorMask, v_256, b);
+ b+=4;
+ v_fx.vect = _mm_add_epi32(v_fx.vect, v_fdx);
+ }
+ fx = v_fx.i[0];
+#endif
while (b < end) {
int x1 = (fx >> 16);
int x2;