aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Henrik Smiding <henrik.smiding@stericsson.com> 2012-08-11 23:08:19 +0200
committer Evan McClain <aeroevan@gmail.com> 2013-05-26 09:30:04 -0400
commit 49cd20ffc0969087a8b6c1406e7cb7e59738a7f5 (patch)
tree 1ad355c7ba07d50a7ea324d835bd46633a4149df
parent 1f089a2a9abd7fbb7f9cdf7a57a37a3639fb84c2 (diff)
downloadexternal_skia-49cd20ffc0969087a8b6c1406e7cb7e59738a7f5.zip
external_skia-49cd20ffc0969087a8b6c1406e7cb7e59738a7f5.tar.gz
external_skia-49cd20ffc0969087a8b6c1406e7cb7e59738a7f5.tar.bz2
Add optimization of Skia S32A_Opaque blitter
Adds optimization of Skia S32A_Opaque_BlitRow32 blitter using ARM NEON instruction set. Special cases for when alpha is zero or opaque. Improves performance platform wide.
Change-Id: I0ffeb23b128e61cfe581ad121f227631d2918686
Signed-off-by: Henrik Smiding <henrik.smiding@stericsson.com>
Signed-off-by: Patrik Ryd <patrik.ryd@stericsson.com>
-rw-r--r-- Android.mk 5
-rw-r--r-- src/opts/S32A_Opaque_BlitRow32_neon.S 243
-rw-r--r-- src/opts/SkBlitRow_opts_arm.cpp 11
3 files changed, 258 insertions, 1 deletions
diff --git a/Android.mk b/Android.mk
index 26361ba..38f18f9 100644
--- a/Android.mk
+++ b/Android.mk
@@ -51,6 +51,10 @@ ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_CFLAGS += -D__ARM_HAVE_NEON
endif
+# Enable Neon assembler optimized version of S32A_Opaque_BlitRow32 (src/opts/S32A_Opaque_BlitRow32_neon.S).
+# In SkBlitRow_opts_arm.cpp this define is tested before TEST_SRC_ALPHA, so it overrides the intrinsic blitter below.
+LOCAL_CFLAGS += -DENABLE_OPTIMIZED_S32A_BLITTERS
+
# special checks for alpha == 0 and alpha == 255 in S32A_Opaque_BlitRow32
# procedures (C and assembly) seriously improve skia performance
LOCAL_CFLAGS += -DTEST_SRC_ALPHA
@@ -264,6 +268,7 @@ ifeq ($(TARGET_ARCH),arm)
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_SRC_FILES += \
+ src/opts/S32A_Opaque_BlitRow32_neon.S \
src/opts/memset16_neon.S \
src/opts/memset32_neon.S
diff --git a/src/opts/S32A_Opaque_BlitRow32_neon.S b/src/opts/S32A_Opaque_BlitRow32_neon.S
new file mode 100644
index 0000000..5d97c77
--- /dev/null
+++ b/src/opts/S32A_Opaque_BlitRow32_neon.S
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) ST-Ericsson SA 2012
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Neon optimized version of S32A_Opaque_BlitRow32: dst = src + ((dst * (256 - src alpha)) >> 8).
+ * Special cases: groups whose source alpha is all 0xFF are copied, all-zero groups are skipped.
+ */
+/* Args: r0 = dst, r1 = src, r2 = count in pixels, r3 = alpha (never read; r3 is reused as a pointer). */
+#if defined(__ARM_HAVE_NEON) && defined(ENABLE_OPTIMIZED_S32A_BLITTERS)
+/* Clobbers r0-r3, flags, d0-d7 and d16-d31; r4-r6 are saved on entry and restored before every return. */
+ .text
+ .fpu neon
+ .align
+ /* The symbol is typed as a function below so the linker can interwork ARM/Thumb callers correctly */
+ .global S32A_Opaque_BlitRow32_neon
+ .func S32A_Opaque_BlitRow32_neon
+ .type S32A_Opaque_BlitRow32_neon, %function
+S32A_Opaque_BlitRow32_neon:
+ stmdb sp!, {r4-r6} // Save the scratch registers used for the alpha tests
+ cmp r2, #7 // The main loop requires at least 8 pixels
+ ble BlitSmall
+ /* Setup constants */
+ vld4.8 {d0-d3}, [r1]! // Load eight source RGBA pixels
+ vmov.i16 q15, #256 // Set up alpha constant
+ pld [r1, #0] // Pre-load next eight source pixels
+ subs r2, r2, #24 // r2 = count - 24 (8 already loaded + 16 consumed per loop iteration)
+ mov r3, r0 // Backup destination pointer (r3 = write pointer, r0 = read pointer)
+ bmi PostLoop // Do we have enough pixels to enter the main loop?
+ pld [r1, #32] // Pre-load next next eight source pixels
+ /* Main loop, blitting 16 pixels per iteration */
+Loop:
+ vmov r4, r5, d3 // Move alpha to ARM for test
+ pld [r1, #64] // Pre-load next eight source pixels
+ and r6, r4, r5 // Check if source alpha is opaque
+ cmp r6, #0xFFFFFFFF //
+ bne NotOpaque1 // If not opaque, skip code
+ vld4.8 {d20-d23}, [r1]! // Pre-load next eight source RGBA pixels
+ vst4.8 {d0-d3}, [r3]! // Since it is opaque, just write result to memory
+ add r0, r0, #32 // Advance destination pointer
+ b GoBack1 // Skip to next eight pixels
+NotOpaque1:
+ orrs r4, r5 // Check if source alpha is fully transparent
+ beq AllZero1 // If so, jump to special case handling
+
+ vld4.8 {d4-d7}, [r0]! // Pre-load next eight destination RGBA pixels
+ vsubw.u8 q14, q15, d3 // Calculate inverse alpha (scale)
+ vmovl.u8 q8, d4 // Expand destination red to 16-bit
+ vmovl.u8 q9, d5 // Expand destination green to 16-bit
+ vmovl.u8 q2, d6 // Expand destination blue to 16-bit
+ vmovl.u8 q3, d7 // Expand destination alpha to 16-bit
+ vmul.i16 q8, q8, q14 // Scale red
+ vmul.i16 q9, q9, q14 // Scale green
+ vld4.8 {d20-d23}, [r1]! // Pre-load next eight source RGBA pixels
+ vmul.i16 q3, q3, q14 // Scale alpha
+ vmul.i16 q2, q2, q14 // Scale blue
+ vshrn.i16 d7, q3, #8 // Shift and narrow alpha
+ vshrn.i16 d6, q2, #8 // Shift and narrow blue
+ vshrn.i16 d5, q9, #8 // Shift and narrow green
+ vshrn.i16 d4, q8, #8 // Shift and narrow red
+ vadd.i8 q3, q1 // Add source to results
+ vadd.i8 q2, q0 // Add source to results
+ vst4.8 {d4-d7}, [r3]! // Write result to memory
+GoBack1:
+ vmov r4, r5, d23 // Move alpha to ARM for test
+ pld [r1, #64] // Pre-load next eight source pixels
+ and r6, r4, r5 // Check if source alpha is opaque
+ cmp r6, #0xFFFFFFFF //
+ bne NotOpaque2 // If not opaque, skip code
+ vld4.8 {d0-d3}, [r1]! // Pre-load next eight source RGBA pixels
+ vst4.8 {d20-d23}, [r3]! // Since it is opaque, just write result to memory
+ subs r2, r2, #16 // Decrement loop counter
+ add r0, r0, #32 // Advance destination pointer
+ bpl Loop // Loop here, instead of jumping to GoBack2
+ b PostLoop
+NotOpaque2:
+ orrs r4, r5 // Check if source alpha is fully transparent
+ beq AllZero2 // If so, jump to special case handling
+
+ vld4.8 {d24-d27}, [r0]! // Pre-load next eight destination RGBA pixels
+ vsubw.u8 q14, q15, d23 // Calculate inverse alpha (scale)
+ vmovl.u8 q8, d24 // Expand destination red to 16-bit
+ vmovl.u8 q9, d25 // Expand destination green to 16-bit
+ vmovl.u8 q12, d26 // Expand destination blue to 16-bit
+ vmovl.u8 q13, d27 // Expand destination alpha to 16-bit
+ vmul.i16 q8, q8, q14 // Scale red
+ vmul.i16 q9, q9, q14 // Scale green
+ vld4.8 {d0-d3}, [r1]! // Pre-load next eight source RGBA pixels
+ vmul.i16 q13, q13, q14 // Scale alpha
+ vmul.i16 q12, q12, q14 // Scale blue
+ vshrn.i16 d27, q13, #8 // Shift and narrow alpha
+ vshrn.i16 d26, q12, #8 // Shift and narrow blue
+ vshrn.i16 d25, q9, #8 // Shift and narrow green
+ vshrn.i16 d24, q8, #8 // Shift and narrow red
+ vadd.i8 q13, q11 // Add source to results
+ vadd.i8 q12, q10 // Add source to results
+ vst4.8 {d24-d27}, [r3]! // Write result to memory
+GoBack2:
+ subs r2, r2, #16 // Decrement loop counter
+ bpl Loop
+PostLoop:
+ adds r2, r2, #16 // r2 = pixels not yet loaded (eight more are pending in d0-d3)
+ bmi Remaining
+LoopRemaining:
+ vmov r4, r5, d3 // Move alpha to ARM for test
+ and r6, r4, r5 // Check if source alpha is opaque
+ cmp r6, #0xFFFFFFFF //
+ bne NotOpaque3 // If not opaque, skip code
+ vst4.8 {d0-d3}, [r3]! // Since it is opaque, just write result to memory
+ add r0, r0, #32 // Advance destination pointer
+ subs r2, r2, #8 // Decrement loop counter
+ bmi Remaining
+ vld4.8 {d0-d3}, [r1]! // Load eight source RGBA pixels
+ b LoopRemaining
+NotOpaque3:
+ orrs r4, r5 // Check if source alpha is fully transparent
+ addeq r3, r3, #32 // If so, advance destination write pointer
+ addeq r0, r0, #32 // ...advance destination read pointer
+ beq GoBack3 // ...and jump to special case handling
+
+ vld4.8 {d4-d7}, [r0]! // Load eight destination RGBA pixels
+ vsubw.u8 q14, q15, d3 // Calculate inverse alpha (scale)
+ vmovl.u8 q8, d4 // Expand destination red to 16-bit
+ vmovl.u8 q9, d5 // Expand destination green to 16-bit
+ vmovl.u8 q2, d6 // Expand destination blue to 16-bit
+ vmovl.u8 q3, d7 // Expand destination alpha to 16-bit
+ vmul.i16 q8, q8, q14 // Scale red
+ vmul.i16 q9, q9, q14 // Scale green
+ vmul.i16 q3, q3, q14 // Scale alpha
+ vmul.i16 q2, q2, q14 // Scale blue
+ vshrn.i16 d7, q3, #8 // Shift and narrow alpha
+ vshrn.i16 d6, q2, #8 // Shift and narrow blue
+ vshrn.i16 d5, q9, #8 // Shift and narrow green
+ vshrn.i16 d4, q8, #8 // Shift and narrow red
+ vadd.i8 q3, q1 // Add source to results
+ vadd.i8 q2, q0 // Add source to results
+ vst4.8 {d4-d7}, [r3]! // Write result to memory
+GoBack3:
+ subs r2, r2, #8 // Decrement loop counter
+ bmi Remaining
+ vld4.8 {d0-d3}, [r1]! // Load eight source RGBA pixels
+ b LoopRemaining
+
+AllZero1:
+ vld4.8 {d20-d23}, [r1]! // Pre-load next eight source RGBA pixels
+ add r3, r3, #32 // Advance destination write pointer
+ add r0, r0, #32 // Advance destination read pointer
+ b GoBack1
+AllZero2:
+ vld4.8 {d0-d3}, [r1]! // Pre-load next eight source RGBA pixels
+ add r3, r3, #32 // Advance destination write pointer
+ subs r2, r2, #16 // Decrement loop counter
+ add r0, r0, #32 // Advance destination read pointer
+ bpl Loop
+ b PostLoop
+
+/* Handle small blits, 0-7 pixels; from here on r0 is both the destination read and write pointer */
+Remaining:
+ adds r2, r2, #8 // Undo the last decrement: r2 = pixels left (0-7)
+ ldmeq sp!, {r4-r6}
+ bxeq lr // Zero pixels left
+ adr r3, AlphaIndex // PC-relative address of the alpha index table (no relocation needed)
+ ldr r6, =0x00FFFFFF // Set up transparency check constant
+ cmp r2, #1 // Is more than one pixel left?
+ vld1.8 {d29}, [r3] // Set up alpha index table
+ bhi Blit2
+ b Blit1
+BlitSmall:
+ pld [r1, #0] // Pre-load eight source pixels
+ adr r3, AlphaIndex // PC-relative address of the alpha index table (no relocation needed)
+ ldr r6, =0x00FFFFFF // Set up transparency check constant
+ vld1.8 {d29}, [r3] // Set up alpha index table
+ cmp r2, #1 // Compare count with one; the NEON setup below leaves the flags alone
+ vmov.i16 q15, #256 // Set up alpha constant
+ beq Blit1
+ ldmlt sp!, {r4-r6}
+ bxlt lr // Zero pixels left
+ /* loop for neon 2-pixel code */
+Blit2:
+ ldmia r1!, {r4, r5} // Load two source RGBA pixels
+ sub r2, r2, #2 // Decrement loop counter
+ and r3, r4, r5 // Check if source alpha is opaque
+ cmp r3, #0xFF000000 //
+ blo NotOpaque4 // If not opaque, skip code
+ stmia r0!, {r4, r5} // Store two source RGBA pixels
+ cmp r2, #1 // Check count
+ bhi Blit2 // Still two or more pixels left
+ ldmlt sp!, {r4-r6}
+ bxlt lr // Zero pixels left
+ b Blit1
+NotOpaque4:
+ orr r3, r4, r5 // Check if source alpha is fully transparent
+ cmp r3, r6 //
+ addls r0, r0, #8 // If so, advance destination read pointer
+ bls GoBack4 // ...and jump to special case handling
+
+ vmov d0, r4, r5 // Move pixel to neon
+ vld1.32 {d1}, [r0] // Load two destination RGBA pixels
+ vtbl.8 d2, {d0}, d29 // Spread out alpha to match pixel format
+ vsubw.u8 q2, q15, d2 // Calculate inverse alpha (scale)
+ vmovl.u8 q3, d1 // Expand destination to 16-bit
+ vmul.i16 q3, q3, q2 // Scale pixels
+ vshrn.i16 d1, q3, #8 // Shift and narrow result
+ vadd.i8 d0, d1 // Add source to results
+ vst1.32 {d0}, [r0]! // Store two RGBA pixels
+GoBack4:
+ cmp r2, #1 // Check count
+ bhi Blit2 // Still two or more pixels left
+ ldmlt sp!, {r4-r6}
+ bxlt lr // Zero pixels left
+ /* code to handle any one last pixel */
+Blit1:
+ ldr r4, [r1] // Load one source RGBA pixel
+ cmp r4, #0xFF000000 // Check if source alpha is opaque
+ strhs r4, [r0] // If so, store one RGBA pixel
+ ldmhs sp!, {r4-r6}
+ bxhs lr // Zero pixels left
+
+ cmp r4, r6 // Check if source alpha is fully transparent
+ ldmls sp!, {r4-r6}
+ bxls lr // Zero pixels left
+
+ vmov.32 d0[0], r4 // Move pixel to neon
+ vld1.32 {d1[0]}, [r0] // Load one destination RGBA pixel
+ vtbl.8 d2, {d0}, d29 // Spread out alpha to match pixel format
+ vsubw.u8 q2, q15, d2 // Calculate inverse alpha (scale)
+ vmovl.u8 q3, d1 // Expand destination to 16-bit
+ vmul.i16 d6, d6, d4 // Scale pixel
+ vshrn.i16 d1, q3, #8 // Shift and narrow result
+ vadd.i8 d0, d1 // Add source to results
+ vst1.32 {d0[0]}, [r0] // Store one RGBA pixel
+ ldmia sp!, {r4-r6}
+ bx lr
+ .size S32A_Opaque_BlitRow32_neon, .-S32A_Opaque_BlitRow32_neon
+ .endfunc
+
+/* vtbl indices copying the alpha byte (3 and 7) over each pixel; read-only, kept in .text within adr range */
+ .align
+AlphaIndex:
+ .byte 3, 3, 3, 3, 7, 7, 7, 7
+
+#endif
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index 361acbe..f41c342 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -508,7 +508,16 @@ static void S32A_D565_Opaque_v7(uint16_t* SK_RESTRICT dst,
///////////////////////////////////////////////////////////////////////////////
-#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(TEST_SRC_ALPHA)
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(ENABLE_OPTIMIZED_S32A_BLITTERS)
+
+/* External function in file S32A_Opaque_BlitRow32_neon.S (NEON assembly); alpha is accepted but never read there. */
+extern "C" void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha);
+
+#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon
+
+#elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(TEST_SRC_ALPHA)
static void S32A_Opaque_BlitRow32_neon_test_alpha(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,