Add optimized S32A_D565_Opaque_Dither blitter.

Adds optimization of Skia S32A_D565_Opaque_Dither blitter using ARM NEON instruction set. Improves performance in browser. Change-Id: Ie20fb3cf3ad9a60f01b79e3ed2ddba7651bcf013 Signed-off-by: Henrik Smiding <henrik.smiding@stericsson.com> Signed-off-by: Patrik Ryd <patrik.ryd@stericsson.com>
author: Henrik Smiding <henrik.smiding@stericsson.com> 2013-04-26 15:52:13 +0200
committer: Evan McClain <aeroevan@gmail.com> 2013-05-26 09:30:24 -0400
commit: 506e364aaa1689ca5b8edc68178c0a134dd16781 (patch)
tree: 9e6804224fc1e65f701222228290526d119e9ba6 /src
parent: 416f34f7ddf0a948d092fd11933565b1acee3fe9 (diff)
download: external_skia-506e364aaa1689ca5b8edc68178c0a134dd16781.zip
external_skia-506e364aaa1689ca5b8edc68178c0a134dd16781.tar.gz
external_skia-506e364aaa1689ca5b8edc68178c0a134dd16781.tar.bz2
1 files changed, 129 insertions, 1 deletions
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index 9ca2b77..8e56e25 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -1464,7 +1464,135 @@ static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
 
 ///////////////////////////////////////////////////////////////////////////////
 
-#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(ENABLE_OPTIMIZED_S32A_BLITTERS)
+
+/* This function was broken out to keep GCC from storing all registers on the stack
+   even though they would not be used in the assembler code */
+static __attribute__ ((noinline)) void S32A_D565_Opaque_Dither_Handle8(uint16_t * SK_RESTRICT dst,
+                                                                       const SkPMColor* SK_RESTRICT src,
+                                                                       int count, U8CPU alpha, int x, int y) {
+    DITHER_565_SCAN(y);
+    do {
+        SkPMColor c = *src++;
+        SkPMColorAssert(c);
+        if (c) {
+            unsigned a = SkGetPackedA32(c);
+
+            // dither and alpha are just temporary variables to work-around
+            // an ICE in debug.
+            unsigned dither = DITHER_VALUE(x);
+            unsigned alpha = SkAlpha255To256(a);
+            int d = SkAlphaMul(dither, alpha);
+
+            unsigned sr = SkGetPackedR32(c);
+            unsigned sg = SkGetPackedG32(c);
+            unsigned sb = SkGetPackedB32(c);
+            sr = SkDITHER_R32_FOR_565(sr, d);
+            sg = SkDITHER_G32_FOR_565(sg, d);
+            sb = SkDITHER_B32_FOR_565(sb, d);
+
+            uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+            uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+            dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+            // now src and dst expanded are in g:11 r:10 x:1 b:10
+            *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+        }
+        dst += 1;
+        DITHER_INC_X(x);
+    } while (--count != 0);
+}
+
+
+static void S32A_D565_Opaque_Dither_neon(uint16_t * SK_RESTRICT dst,
+                                         const SkPMColor* SK_RESTRICT src,
+                                         int count, U8CPU alpha, int x, int y) {
+    SkASSERT(255 == alpha);
+
+    if (count >= 8) {
+        asm volatile (
+                    "pld            [%[src]]                        \n\t"   // Preload source
+                    "pld            [%[dst]]                        \n\t"   // Preload destination pixels
+                    "and            %[y], %[y], #0x03               \n\t"   // Mask y by 3
+                    "vmov.i8        d31, #0x01                      \n\t"   // Set up alpha constant
+                    "add            %[y], %[y], lsl #1              \n\t"   // and multiply with 12 to get the row offset
+                    "and            %[x], %[x], #0x03               \n\t"   // Mask x by 3
+                    "vmov.i16       q12, #256                       \n\t"   // Set up alpha constant
+                    "add            %[y], %[matrix], %[y], lsl #2   \n\t"   //
+                    "add            r7, %[x], %[y]                  \n\t"   //
+                    "vld1.8         {d26}, [r7]                     \n\t"   // Load dither values
+                    "add            %[x], %[count]                  \n\t"   //
+                    "vmov.i16       q11, #0x3F                      \n\t"   // Set up green mask constant
+                    "and            %[x], %[x], #0x03               \n\t"   // Mask x by 3
+                    "vmovl.u8       q13, d26                        \n\t"   // Expand dither to 16-bit
+                    "add            r7, %[x], %[y]                  \n\t"   //
+                    "vmov.i16       q10, #0x1F                      \n\t"   // Set up blue mask constant
+                    "vld1.8         {d28}, [r7]                     \n\t"   // Load iteration 2+ dither values
+                    "ands           r7, %[count], #7                \n\t"   // Calculate first iteration increment
+                    "moveq          r7, #8                          \n\t"   // Do full iteration?
+                    "vmovl.u8       q14, d28                        \n\t"   // Expand dither to 16-bit
+                    "vld4.8         {d0-d3}, [%[src]]               \n\t"   // Load eight source pixels
+                    "vld1.16        {q3}, [%[dst]]                  \n\t"   // Load destination 565 pixels
+                    "add            %[src], r7, lsl #2              \n\t"   // Increment source pointer
+                    "add            %[dst], r7, lsl #1              \n\t"   // Increment destination buffer pointer
+                    "subs           %[count], r7                    \n\t"   // Decrement loop counter
+                    "sub            r7, %[dst], r7, lsl #1          \n\t"   // Save original destination pointer
+                    "b              2f                              \n\t"
+                    "1:                                             \n\t"
+                    "vld4.8         {d0-d3}, [%[src]]!              \n\t"   // Load eight source pixels
+                    "vld1.16        {q3}, [%[dst]]!                 \n\t"   // Load destination 565 pixels
+                    "vst1.16        {q2}, [r7]                      \n\t"   // Write result to memory
+                    "sub            r7, %[dst], #8*2                \n\t"   // Calculate next loop's destination pointer
+                    "subs           %[count], #8                    \n\t"   // Decrement loop counter
+                    "2:                                             \n\t"
+                    "pld            [%[src]]                        \n\t"   // Preload destination pixels
+                    "pld            [%[dst]]                        \n\t"   // Preload destination pixels
+                    "vaddl.u8       q2, d3, d31                     \n\t"   // Add 1 to alpha to get 0-256
+                    "vshr.u8        d16, d0, #5                     \n\t"   // Calculate source red subpixel
+                    "vmul.u16       q2, q2, q13                     \n\t"   // Multiply alpha with dither value
+                    "vsub.i8        d0, d16                         \n\t"   // red = (red - (red >> 5) + dither)
+                    "vshrn.i16      d30, q2, #8                     \n\t"   // Shift and narrow result to 0-7
+                    "vadd.i8        d0, d30                         \n\t"   //
+                    "vshr.u8        d16, d2, #5                     \n\t"   // Calculate source blue subpixel
+                    "vsub.i8        d2, d16                         \n\t"   // blue = (blue - (blue >> 5) + dither)
+                    "vshr.u8        d16, d1, #6                     \n\t"   // Calculate source green subpixel
+                    "vadd.i8        d2, d30                         \n\t"   //
+                    "vsub.i8        d1, d16                         \n\t"   // green = (green - (green >> 6) + (dither >> 1))
+                    "vshr.u8        d30, #1                         \n\t"   //
+                    "vadd.i8        d1, d30                         \n\t"   //
+                    "vsubw.u8       q2, q12, d3                     \n\t"   // Calculate inverse alpha 256-1
+                    "vshr.u16       q8, q3, #5                      \n\t"   // Extract destination green pixel
+                    "vshr.u16       q9, q3, #11                     \n\t"   // Extract destination red pixel
+                    "vand           q8, q11                         \n\t"   // Shift green
+                    "vand           q3, q10                         \n\t"   // Extract destination blue pixel
+                    "vshr.u16       q2, #3                          \n\t"   // Shift alpha
+                    "vshll.u8       q1, d2, #2                      \n\t"   // Calculate destination blue pixel
+                    "vmla.i16       q1, q3, q2                      \n\t"   // ...and add to source pixel
+                    "vshll.u8       q3, d1, #3                      \n\t"   // Calculate destination green pixel
+                    "vmov.u8        q13, q14                        \n\t"   // Set dither matrix to iteration 2+ values
+                    "vmla.i16       q3, q8, q2                      \n\t"   // ...and add to source pixel
+                    "vshll.u8       q8, d0, #2                      \n\t"   // Calculate destination red pixel
+                    "vmla.i16       q8, q9, q2                      \n\t"   // ...and add to source pixel
+                    "vshr.u16       q1, #5                          \n\t"   // Pack blue pixel
+                    "vand           q2, q1, q10                     \n\t"   //
+                    "vshr.u16       q3, #5                          \n\t"   // Pack green pixel
+                    "vsli.16        q2, q3, #5                      \n\t"   // ...and insert
+                    "vshr.u16       q8, #5                          \n\t"   // Pack red pixel
+                    "vsli.16        q2, q8, #11                     \n\t"   // ...and insert
+                    "bne            1b                              \n\t"   // If inner loop counter != 0, loop
+                    "vst1.16        {q2}, [r7]                      \n\t"   // Write result to memory
+                    : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [x] "+r" (x), [y] "+r" (y)
+                    : [matrix] "r" (gDitherMatrix_Neon)
+                    : "cc", "memory", "r7", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+                    );
+    }
+    else {
+        S32A_D565_Opaque_Dither_Handle8(dst, src, count, alpha, x, y);
+    }
+}
+
+#define	S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
+
+#elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
 
 #undef	DEBUG_OPAQUE_DITHER
author	Henrik Smiding <henrik.smiding@stericsson.com>	2013-04-26 15:52:13 +0200
committer	Evan McClain <aeroevan@gmail.com>	2013-05-26 09:30:24 -0400
commit	506e364aaa1689ca5b8edc68178c0a134dd16781 (patch)
tree	9e6804224fc1e65f701222228290526d119e9ba6 /src
parent	416f34f7ddf0a948d092fd11933565b1acee3fe9 (diff)
download	external_skia-506e364aaa1689ca5b8edc68178c0a134dd16781.zip external_skia-506e364aaa1689ca5b8edc68178c0a134dd16781.tar.gz external_skia-506e364aaa1689ca5b8edc68178c0a134dd16781.tar.bz2