Implement the general blending of ARGB32_pm with SSSE3

SSSE3 provides two tools to improve the blending speed over SSE2: palignr and byte permutation. The alignment is enforced on src and dst with palignr so that every memory access is aligned. The extraction of the alpha mask is done with a byte permutation in order to save two instructions per cycle. On Atom, this patch gives between 0% (aligned src) and 10% improvement (src unaligned by 4 and 12 bytes). On Core 2, this patch consistently gives 8% to 10% improvement for every misalignment. Reviewed-by: Samuel Rødal
author: Benjamin Poulain <benjamin.poulain@nokia.com> 2010-08-13 18:57:07 (GMT)
committer: Benjamin Poulain <benjamin.poulain@nokia.com> 2010-08-16 14:03:13 (GMT)
commit: 0fb9e0fff4097bf0b84ff217526b0a9c33b69414 (patch)
tree: ada241b9a5bc13147d9adc9292d58e714211e6dc /tests
parent: f7a501515fcf1dafecb88a40e18721ea14fd0a13 (diff)
download: Qt-0fb9e0fff4097bf0b84ff217526b0a9c33b69414.zip
Qt-0fb9e0fff4097bf0b84ff217526b0a9c33b69414.tar.gz
Qt-0fb9e0fff4097bf0b84ff217526b0a9c33b69414.tar.bz2
1 files changed, 43 insertions, 0 deletions
diff --git a/tests/benchmarks/gui/image/blendbench/main.cpp b/tests/benchmarks/gui/image/blendbench/main.cpp
index f53654b..d420d6c 100644
--- a/tests/benchmarks/gui/image/blendbench/main.cpp
+++ b/tests/benchmarks/gui/image/blendbench/main.cpp
@@ -106,6 +106,9 @@ private slots:
 
     void blendBenchAlpha_data();
     void blendBenchAlpha();
+
+    void unalignedBlendArgb32_data();
+    void unalignedBlendArgb32();
 };
 
 void BlendBench::blendBench_data()
@@ -179,6 +182,46 @@ void BlendBench::blendBenchAlpha()
     }
 }
 
+void BlendBench::unalignedBlendArgb32_data()
+{
+    // Blending speed can depend on how the pixel data is aligned relative
+    // to a 16-byte boundary: some SIMD instruction sets access memory
+    // significantly faster when it is aligned on 16 bytes.
+
+    // The single column holds the misalignment, expressed in 32-bit words.
+    QTest::addColumn<int>("offset");
+    static const char *const rowNames[] = { "aligned on 16 bytes", "unaligned by 4 bytes",
+                                            "unaligned by 8 bytes", "unaligned by 12 bytes" };
+    for (int words = 0; words < 4; ++words)
+        QTest::newRow(rowNames[words]) << words;
+}
+
+void BlendBench::unalignedBlendArgb32()
+{
+    // Benchmarks drawing a source image whose pixels start "offset" 32-bit words past a 16-byte boundary.
+    const int dimension = 1024;
+    // We use dst aligned by design. We don't want to test all the combinations of alignment for src and dst.
+    // Moreover, it makes sense for us to align dst in the implementation because it is accessed more often.
+    uchar *dstMemory = static_cast<uchar*>(qMallocAligned((dimension * dimension * sizeof(quint32)), 16));
+    // srcMemory stays the pointer returned by qMallocAligned() so that it can be passed to qFreeAligned().
+    uchar *srcMemory = static_cast<uchar*>(qMallocAligned((dimension * dimension * sizeof(quint32)) + 16, 16));
+    QFETCH(int, offset);
+    uchar *srcPixels = srcMemory + (offset * sizeof(quint32)); // the 16 spare bytes cover offsets up to 3 words
+    { // the images and the painter are destroyed before the buffers they use are released
+        QImage destination(dstMemory, dimension, dimension, QImage::Format_ARGB32_Premultiplied);
+        destination.fill(0x12345678); // avoid special cases of alpha
+        QImage src(srcPixels, dimension, dimension, QImage::Format_ARGB32_Premultiplied);
+        src.fill(0x87654321);
+
+        QPainter painter(&destination);
+        QBENCHMARK {
+            painter.drawImage(QPoint(), src);
+        }
+    }
+    qFreeAligned(srcMemory);
+    qFreeAligned(dstMemory);
+}
+
 QTEST_MAIN(BlendBench)
 
 #include "main.moc"
author	Benjamin Poulain <benjamin.poulain@nokia.com>	2010-08-13 18:57:07 (GMT)
committer	Benjamin Poulain <benjamin.poulain@nokia.com>	2010-08-16 14:03:13 (GMT)
commit	0fb9e0fff4097bf0b84ff217526b0a9c33b69414 (patch)
tree	ada241b9a5bc13147d9adc9292d58e714211e6dc /tests
parent	f7a501515fcf1dafecb88a40e18721ea14fd0a13 (diff)
download	Qt-0fb9e0fff4097bf0b84ff217526b0a9c33b69414.zip Qt-0fb9e0fff4097bf0b84ff217526b0a9c33b69414.tar.gz Qt-0fb9e0fff4097bf0b84ff217526b0a9c33b69414.tar.bz2