diff options
author | Benjamin Poulain <benjamin.poulain@nokia.com> | 2010-08-13 18:57:07 (GMT) |
---|---|---|
committer | Benjamin Poulain <benjamin.poulain@nokia.com> | 2010-08-16 14:03:13 (GMT) |
commit | 0fb9e0fff4097bf0b84ff217526b0a9c33b69414 (patch) | |
tree | ada241b9a5bc13147d9adc9292d58e714211e6dc /tests/benchmarks | |
parent | f7a501515fcf1dafecb88a40e18721ea14fd0a13 (diff) | |
download | Qt-0fb9e0fff4097bf0b84ff217526b0a9c33b69414.zip Qt-0fb9e0fff4097bf0b84ff217526b0a9c33b69414.tar.gz Qt-0fb9e0fff4097bf0b84ff217526b0a9c33b69414.tar.bz2 |
Implement the general blending of ARGB32_pm with SSSE3
SSSE3 provides two tools to improve the blending speed over SSE2:
-palignr
-byte permutation
The alignment is enforced on src and dst with palignr so that memory
accesses are always aligned.
The extraction of the alpha mask is done with a byte permutation in
order to save two instructions per cycle.
On Atom, this patch gives between 0% (aligned src) and 10%
improvement (src unaligned by 4 and 12 bytes).
On Core 2, this patch consistently gives an 8% to 10% improvement
for every misalignment.
Reviewed-by: Samuel Rødal
Diffstat (limited to 'tests/benchmarks')
-rw-r--r-- | tests/benchmarks/gui/image/blendbench/main.cpp | 43 |
1 files changed, 43 insertions, 0 deletions
diff --git a/tests/benchmarks/gui/image/blendbench/main.cpp b/tests/benchmarks/gui/image/blendbench/main.cpp
index f53654b..d420d6c 100644
--- a/tests/benchmarks/gui/image/blendbench/main.cpp
+++ b/tests/benchmarks/gui/image/blendbench/main.cpp
@@ -106,6 +106,9 @@ private slots:
 
     void blendBenchAlpha_data();
     void blendBenchAlpha();
+
+    void unalignedBlendArgb32_data();
+    void unalignedBlendArgb32();
 };
 
 void BlendBench::blendBench_data()
@@ -179,6 +182,46 @@ void BlendBench::blendBenchAlpha()
     }
 }
 
+void BlendBench::unalignedBlendArgb32_data()
+{
+    // The performance of blending can depend on the alignment of the data
+    // on 16 bytes. Some SIMD instruction sets have significantly better
+    // memory access when the memory is aligned on a 16 bytes boundary.
+
+    // offset in 32 bits words
+    QTest::addColumn<int>("offset");
+    QTest::newRow("aligned on 16 bytes") << 0;
+    QTest::newRow("unaligned by 4 bytes") << 1;
+    QTest::newRow("unaligned by 8 bytes") << 2;
+    QTest::newRow("unaligned by 12 bytes") << 3;
+}
+
+void BlendBench::unalignedBlendArgb32()
+{
+    const int dimension = 1024;
+
+    // We use dst aligned by design. We don't want to test all the combinations of alignment for src and dst.
+    // Moreover, it makes sense for us to align dst in the implementation because it is accessed more often.
+    uchar *dstMemory = static_cast<uchar*>(qMallocAligned((dimension * dimension * sizeof(quint32)), 16));
+    QImage destination(dstMemory, dimension, dimension, QImage::Format_ARGB32_Premultiplied);
+    destination.fill(0x12345678); // avoid special cases of alpha
+
+    uchar *srcMemory = static_cast<uchar*>(qMallocAligned((dimension * dimension * sizeof(quint32)) + 16, 16));
+    QFETCH(int, offset);
+    uchar *imageSrcMemory = srcMemory + (offset * sizeof(quint32)); // srcMemory stays untouched: qFreeAligned() needs the pointer qMallocAligned() returned
+
+    QImage src(imageSrcMemory, dimension, dimension, QImage::Format_ARGB32_Premultiplied);
+    src.fill(0x87654321);
+
+    QPainter painter(&destination);
+    QBENCHMARK {
+        painter.drawImage(QPoint(), src);
+    }
+
+    qFreeAligned(srcMemory);
+    qFreeAligned(dstMemory);
+}
+
 QTEST_MAIN(BlendBench)
 
 #include "main.moc" |