aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorHenrik Smiding <henrik.smiding@stericsson.com>2013-04-26 15:52:13 +0200
committerEvan McClain <aeroevan@gmail.com>2013-05-26 09:30:24 -0400
commit506e364aaa1689ca5b8edc68178c0a134dd16781 (patch)
tree9e6804224fc1e65f701222228290526d119e9ba6 /src
parent416f34f7ddf0a948d092fd11933565b1acee3fe9 (diff)
downloadexternal_skia-506e364aaa1689ca5b8edc68178c0a134dd16781.zip
external_skia-506e364aaa1689ca5b8edc68178c0a134dd16781.tar.gz
external_skia-506e364aaa1689ca5b8edc68178c0a134dd16781.tar.bz2
Add optimized S32A_D565_Opaque_Dither blitter.
Adds optimization of Skia S32A_D565_Opaque_Dither blitter using ARM NEON instruction set. Improves performance in browser. Change-Id: Ie20fb3cf3ad9a60f01b79e3ed2ddba7651bcf013 Signed-off-by: Henrik Smiding <henrik.smiding@stericsson.com> Signed-off-by: Patrik Ryd <patrik.ryd@stericsson.com>
Diffstat (limited to 'src')
-rw-r--r--src/opts/SkBlitRow_opts_arm.cpp130
1 files changed, 129 insertions, 1 deletions
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index 9ca2b77..8e56e25 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -1464,7 +1464,135 @@ static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
///////////////////////////////////////////////////////////////////////////////
-#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(ENABLE_OPTIMIZED_S32A_BLITTERS)
+
+/* This function was broken out to keep GCC from storing all registers on the stack
+ even though they would not be used in the assembler code */
+static __attribute__ ((noinline)) void S32A_D565_Opaque_Dither_Handle8(uint16_t * SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha, int x, int y) {
+ DITHER_565_SCAN(y);
+ do {
+ SkPMColor c = *src++;
+ SkPMColorAssert(c);
+ if (c) {
+ unsigned a = SkGetPackedA32(c);
+
+ // dither and alpha are just temporary variables to work-around
+ // an ICE in debug.
+ unsigned dither = DITHER_VALUE(x);
+ unsigned alpha = SkAlpha255To256(a);
+ int d = SkAlphaMul(dither, alpha);
+
+ unsigned sr = SkGetPackedR32(c);
+ unsigned sg = SkGetPackedG32(c);
+ unsigned sb = SkGetPackedB32(c);
+ sr = SkDITHER_R32_FOR_565(sr, d);
+ sg = SkDITHER_G32_FOR_565(sg, d);
+ sb = SkDITHER_B32_FOR_565(sb, d);
+
+ uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
+ uint32_t dst_expanded = SkExpand_rgb_16(*dst);
+ dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
+ // now src and dst expanded are in g:11 r:10 x:1 b:10
+ *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
+ }
+ dst += 1;
+ DITHER_INC_X(x);
+ } while (--count != 0);
+}
+
+
+static void S32A_D565_Opaque_Dither_neon(uint16_t * SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha, int x, int y) {
+ SkASSERT(255 == alpha);
+
+ if (count >= 8) {
+ asm volatile (
+ "pld [%[src]] \n\t" // Preload source
+ "pld [%[dst]] \n\t" // Preload destination pixels
+ "and %[y], %[y], #0x03 \n\t" // Mask y by 3
+ "vmov.i8 d31, #0x01 \n\t" // Set up alpha constant
+ "add %[y], %[y], lsl #1 \n\t" // and multiply with 12 to get the row offset
+ "and %[x], %[x], #0x03 \n\t" // Mask x by 3
+ "vmov.i16 q12, #256 \n\t" // Set up alpha constant
+ "add %[y], %[matrix], %[y], lsl #2 \n\t" //
+ "add r7, %[x], %[y] \n\t" //
+ "vld1.8 {d26}, [r7] \n\t" // Load dither values
+ "add %[x], %[count] \n\t" //
+ "vmov.i16 q11, #0x3F \n\t" // Set up green mask constant
+ "and %[x], %[x], #0x03 \n\t" // Mask x by 3
+ "vmovl.u8 q13, d26 \n\t" // Expand dither to 16-bit
+ "add r7, %[x], %[y] \n\t" //
+ "vmov.i16 q10, #0x1F \n\t" // Set up blue mask constant
+ "vld1.8 {d28}, [r7] \n\t" // Load iteration 2+ dither values
+ "ands r7, %[count], #7 \n\t" // Calculate first iteration increment
+ "moveq r7, #8 \n\t" // Do full iteration?
+ "vmovl.u8 q14, d28 \n\t" // Expand dither to 16-bit
+ "vld4.8 {d0-d3}, [%[src]] \n\t" // Load eight source pixels
+ "vld1.16 {q3}, [%[dst]] \n\t" // Load destination 565 pixels
+ "add %[src], r7, lsl #2 \n\t" // Increment source pointer
+ "add %[dst], r7, lsl #1 \n\t" // Increment destination buffer pointer
+ "subs %[count], r7 \n\t" // Decrement loop counter
+ "sub r7, %[dst], r7, lsl #1 \n\t" // Save original destination pointer
+ "b 2f \n\t"
+ "1: \n\t"
+ "vld4.8 {d0-d3}, [%[src]]! \n\t" // Load eight source pixels
+ "vld1.16 {q3}, [%[dst]]! \n\t" // Load destination 565 pixels
+ "vst1.16 {q2}, [r7] \n\t" // Write result to memory
+ "sub r7, %[dst], #8*2 \n\t" // Calculate next loop's destination pointer
+ "subs %[count], #8 \n\t" // Decrement loop counter
+ "2: \n\t"
+ "pld [%[src]] \n\t" // Preload destination pixels
+ "pld [%[dst]] \n\t" // Preload destination pixels
+ "vaddl.u8 q2, d3, d31 \n\t" // Add 1 to alpha to get 0-256
+ "vshr.u8 d16, d0, #5 \n\t" // Calculate source red subpixel
+ "vmul.u16 q2, q2, q13 \n\t" // Multiply alpha with dither value
+ "vsub.i8 d0, d16 \n\t" // red = (red - (red >> 5) + dither)
+ "vshrn.i16 d30, q2, #8 \n\t" // Shift and narrow result to 0-7
+ "vadd.i8 d0, d30 \n\t" //
+ "vshr.u8 d16, d2, #5 \n\t" // Calculate source blue subpixel
+ "vsub.i8 d2, d16 \n\t" // blue = (blue - (blue >> 5) + dither)
+ "vshr.u8 d16, d1, #6 \n\t" // Calculate source green subpixel
+ "vadd.i8 d2, d30 \n\t" //
+ "vsub.i8 d1, d16 \n\t" // green = (green - (green >> 6) + (dither >> 1))
+ "vshr.u8 d30, #1 \n\t" //
+ "vadd.i8 d1, d30 \n\t" //
+ "vsubw.u8 q2, q12, d3 \n\t" // Calculate inverse alpha 256-1
+ "vshr.u16 q8, q3, #5 \n\t" // Extract destination green pixel
+ "vshr.u16 q9, q3, #11 \n\t" // Extract destination red pixel
+ "vand q8, q11 \n\t" // Shift green
+ "vand q3, q10 \n\t" // Extract destination blue pixel
+ "vshr.u16 q2, #3 \n\t" // Shift alpha
+ "vshll.u8 q1, d2, #2 \n\t" // Calculate destination blue pixel
+ "vmla.i16 q1, q3, q2 \n\t" // ...and add to source pixel
+ "vshll.u8 q3, d1, #3 \n\t" // Calculate destination green pixel
+ "vmov.u8 q13, q14 \n\t" // Set dither matrix to iteration 2+ values
+ "vmla.i16 q3, q8, q2 \n\t" // ...and add to source pixel
+ "vshll.u8 q8, d0, #2 \n\t" // Calculate destination red pixel
+ "vmla.i16 q8, q9, q2 \n\t" // ...and add to source pixel
+ "vshr.u16 q1, #5 \n\t" // Pack blue pixel
+ "vand q2, q1, q10 \n\t" //
+ "vshr.u16 q3, #5 \n\t" // Pack green pixel
+ "vsli.16 q2, q3, #5 \n\t" // ...and insert
+ "vshr.u16 q8, #5 \n\t" // Pack red pixel
+ "vsli.16 q2, q8, #11 \n\t" // ...and insert
+ "bne 1b \n\t" // If inner loop counter != 0, loop
+ "vst1.16 {q2}, [r7] \n\t" // Write result to memory
+ : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [x] "+r" (x), [y] "+r" (y)
+ : [matrix] "r" (gDitherMatrix_Neon)
+ : "cc", "memory", "r7", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+ );
+ }
+ else {
+ S32A_D565_Opaque_Dither_Handle8(dst, src, count, alpha, x, y);
+ }
+}
+
+#define S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
+
+#elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
#undef DEBUG_OPAQUE_DITHER