author    Henrik Smiding <henrik.smiding@stericsson.com>    2012-08-11 23:28:17 +0200
committer Evan McClain <aeroevan@gmail.com>    2013-05-26 09:30:17 -0400
commit    416f34f7ddf0a948d092fd11933565b1acee3fe9 (patch)
tree      d206e869474a3c74e4386ec3a1f5627cb6357bd7
parent    49cd20ffc0969087a8b6c1406e7cb7e59738a7f5 (diff)
Add optimization of Skia S32A_Blend blitter
Adds an optimized version of the Skia S32A_Blend_BlitRow32 blitter using the ARM NEON instruction set, with special-case handling for pixels whose alpha is zero. Improves performance platform-wide.

Change-Id: I6c8bf8a9525838682206ebd139855354d6b3a563
Signed-off-by: Henrik Smiding <henrik.smiding@stericsson.com>
Signed-off-by: Patrik Ryd <patrik.ryd@stericsson.com>
-rw-r--r--  Android.mk                            |   5
-rw-r--r--  src/opts/S32A_Blend_BlitRow32_neon.S  | 262
-rw-r--r--  src/opts/SkBlitRow_opts_arm.cpp       |  14
3 files changed, 279 insertions(+), 2 deletions(-)
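As a reading aid, the arithmetic the new NEON routine performs can be modelled per pixel in scalar C++ as follows. This is an illustrative sketch, not part of the patch; the *_model names are hypothetical, and the 0xFF00 masking mirrors the vand instructions in the assembly below.

    #include <cstdint>

    // Scalar model of the blend performed by S32A_Blend_BlitRow32_neon
    // (illustrative only; channel handling matches the vld4.8 de-interleave).
    static inline uint8_t blend_channel(unsigned src_c, unsigned dst_c,
                                        unsigned src_scale, unsigned dst_scale) {
        // 'vand q, 0xFF00': drop the low byte of the scaled source so the
        // following multiply-accumulate cannot overflow 16 bits.
        unsigned s = (src_c * src_scale) & 0xFF00;
        // 'vmla' then 'vshrn #8': accumulate scaled destination, keep high byte.
        return (uint8_t)((s + dst_c * dst_scale) >> 8);
    }

    static void S32A_Blend_BlitRow32_model(uint32_t* dst, const uint32_t* src,
                                           int count, unsigned alpha) {
        unsigned src_scale = alpha + 1;              // 'add r3, #1': 1..256 scale
        for (int i = 0; i < count; i++) {
            uint32_t s = src[i], d = dst[i];
            unsigned sa = s >> 24;
            // 'vshr #8' + 'vsub': destination scale is 256 - scaled source alpha.
            unsigned dst_scale = 256 - ((sa * src_scale) >> 8);
            uint32_t out = 0;
            for (int shift = 0; shift < 32; shift += 8)
                out |= (uint32_t)blend_channel((s >> shift) & 0xFF,
                                               (d >> shift) & 0xFF,
                                               src_scale, dst_scale) << shift;
            dst[i] = out;
        }
    }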
diff --git a/Android.mk b/Android.mk
index 38f18f9..7ceea85 100644
--- a/Android.mk
+++ b/Android.mk
@@ -51,8 +51,8 @@ ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_CFLAGS += -D__ARM_HAVE_NEON
endif
-# Enable Neon assembler optimized version of S32A_Opaque_BlitRow32.
-# Overrides the intrinsic blitter below.
+# Enable Neon assembler optimized version of S32A_Opaque_BlitRow32 and
+# S32A_Blend_BlitRow32. Overrides the intrinsic blitter below.
LOCAL_CFLAGS += -DENABLE_OPTIMIZED_S32A_BLITTERS
# special checks for alpha == 0 and alpha == 255 in S32A_Opaque_BlitRow32
@@ -269,6 +269,7 @@ ifeq ($(TARGET_ARCH),arm)
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_SRC_FILES += \
src/opts/S32A_Opaque_BlitRow32_neon.S \
+ src/opts/S32A_Blend_BlitRow32_neon.S \
src/opts/memset16_neon.S \
src/opts/memset32_neon.S
diff --git a/src/opts/S32A_Blend_BlitRow32_neon.S b/src/opts/S32A_Blend_BlitRow32_neon.S
new file mode 100644
index 0000000..84f9846
--- /dev/null
+++ b/src/opts/S32A_Blend_BlitRow32_neon.S
@@ -0,0 +1,262 @@
+/*
+ * Copyright (C) ST-Ericsson SA 2010
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Neon optimized version of S32A_Blend_BlitRow32.
+ * Special case for when source alpha is zero.
+ */
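+/*
+ * Arguments (AAPCS): r0 = dst, r1 = src, r2 = count, r3 = alpha.
+ * Main-loop constants: q4 = global alpha widened to the 1..256 scale range,
+ * q5 = 0xFF00 mask, q15 = 256. Each channel is computed as
+ *   (((src * q4) & 0xFF00) + dst * (256 - ((srcAlpha * q4) >> 8))) >> 8
+ */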
+
+#if defined(__ARM_HAVE_NEON) && defined(ENABLE_OPTIMIZED_S32A_BLITTERS)
+
+ .text
+ .fpu neon
+ .align
+
+ .global S32A_Blend_BlitRow32_neon
+ .func S32A_Blend_BlitRow32_neon
+
+S32A_Blend_BlitRow32_neon:
+ cmp r2, #8 // Counts of eight or fewer take the small-blit path
+ ble BlitSmall
+ /* Setup constants, and do the first 1-8 pixels */
+ vld4.8 {d20-d23}, [r1] // Load eight source RGBA pixels
+ vld4.8 {d24-d27}, [r0] // Load eight destination RGBA pixels
+ add r3, #1 // Bias global alpha to the 1..256 scale range
+ vpush {q4-q5}
+ stmdb sp!, {r4-r5}
+ vmov.i16 q15, #256 // Set up alpha constant
+ vmov.i16 q5, #0xFF00 // Set up mask constant
+ vdup.16 q4, r3 // Set up global alpha
+ pld [r1, #32] // Pre-load next eight source pixels
+ pld [r0, #32] // Pre-load next eight destination pixels
+ ands r3, r2, #0x7 // Number of pixels in a partial first iteration (count % 8)
+ moveq r3, #8 // Count is a multiple of eight: do a full first iteration
+ vmovl.u8 q8, d20 // Expand source red to 16-bit
+ vmovl.u8 q9, d21 // Expand source green to 16-bit
+ vmovl.u8 q10, d22 // Expand source blue to 16-bit
+ vmovl.u8 q11, d23 // Expand source alpha to 16-bit
+ vmul.i16 q8, q8, q4 // Scale source red
+ vmul.i16 q11, q11, q4 // Scale source alpha
+ vand q8, q5 // Mask low byte in red to avoid overflow in vmla
+ vmul.i16 q9, q9, q4 // Scale source green
+ vshr.u16 q0, q11, #8 // Extract scaled source alpha (high byte)
+ vmul.i16 q10, q10, q4 // Scale source blue
+ vand q11, q5 // Mask low byte in alpha to avoid overflow in vmla
+ vand q9, q5 // Mask low byte in green to avoid overflow in vmla
+ vand q10, q5 // Mask low byte in blue to avoid overflow in vmla
+ vsub.i16 q14, q15, q0 // Calculate destination scale (256 - source alpha)
+ vmovl.u8 q2, d24 // Expand destination red to 16-bit
+ vmovl.u8 q3, d25 // Expand destination green to 16-bit
+ vmovl.u8 q12, d26 // Expand destination blue to 16-bit
+ vmovl.u8 q13, d27 // Expand destination alpha to 16-bit
+ vmla.i16 q8, q2, q14 // Scale destination red, and add to source
+ mov r4, r0 // Backup destination pointer
+ add r1, r3, lsl #2 // Increment source pointer
+ sub r2, r2, r3 // Decrement loop counter
+ vmla.i16 q9, q3, q14 // Scale destination green, and add to source
+ add r0, r3, lsl #2 // Increment destination pointer
+ pld [r1, #32] // Pre-load next eight source pixels
+ pld [r0, #32] // Pre-load next eight destination pixels
+ mov r3, r0 // Backup destination pointer
+ vmla.i16 q11, q13, q14 // Scale destination alpha, and add to source
+ vld4.8 {d0-d3}, [r1]! // Pre-load next eight source RGBA pixels
+ subs r2, r2, #24 // Decrement loop counter
+ vmla.i16 q10, q12, q14 // Scale destination blue, and add to source
+ vld4.8 {d4-d7}, [r0]! // Pre-load next eight destination RGBA pixels
+ vshrn.i16 d24, q8, #8 // Shift and narrow red
+ vshrn.i16 d25, q9, #8 // Shift and narrow green
+ vshrn.i16 d26, q10, #8 // Shift and narrow blue
+ vshrn.i16 d27, q11, #8 // Shift and narrow alpha
+ vst4.8 {d24-d27}, [r4] // Write result to memory
+ bmi PostLoop // Fewer than 16 pixels left: skip the main loop
+ /* Main loop, blitting 16 pixels per iteration */
+Loop:
+ pld [r1, #32] // Pre-load next eight source pixels
+ pld [r0, #32] // Pre-load next eight destination pixels
+ vmov r4, r5, d3 // Move alpha to ARM for test
+ orrs r4, r5 // Check if source alpha is fully transparent
+ beq AllZero1 // If so, jump to special case handling
+ vmovl.u8 q8, d0 // Expand source red to 16-bit
+ vmovl.u8 q9, d1 // Expand source green to 16-bit
+ vmovl.u8 q0, d2 // Expand source blue to 16-bit
+ vmovl.u8 q1, d3 // Expand source alpha to 16-bit
+ vmul.i16 q8, q8, q4 // Scale source red
+ vmul.i16 q1, q1, q4 // Scale source alpha
+ vand q8, q5 // Mask low byte in red to avoid overflow in vmla
+ vmul.i16 q9, q9, q4 // Scale source green
+ vshr.u16 q10, q1, #8 // Extract scaled source alpha (high byte)
+ vmul.i16 q0, q0, q4 // Scale source blue
+ vand q1, q5 // Mask low byte in alpha to avoid overflow in vmla
+ vand q9, q5 // Mask low byte in green to avoid overflow in vmla
+ vand q0, q5 // Mask low byte in blue to avoid overflow in vmla
+ vsub.i16 q14, q15, q10 // Calculate destination scale (256 - source alpha)
+ vmovl.u8 q12, d4 // Expand destination red to 16-bit
+ vmovl.u8 q13, d5 // Expand destination green to 16-bit
+ vmovl.u8 q2, d6 // Expand destination blue to 16-bit
+ vmovl.u8 q3, d7 // Expand destination alpha to 16-bit
+ vmla.i16 q8, q12, q14 // Scale destination red and add to source
+ vmla.i16 q9, q13, q14 // Scale destination green and add to source
+ vld4.8 {d20-d23}, [r1]! // Pre-load next eight source RGBA pixels
+ vmla.i16 q1, q3, q14 // Scale destination alpha and add to source
+ vmla.i16 q0, q2, q14 // Scale destination blue and add to source
+ vld4.8 {d24-d27}, [r0]! // Pre-load next eight destination RGBA pixels
+ vshrn.i16 d4, q8, #8 // Shift and narrow red
+ vshrn.i16 d5, q9, #8 // Shift and narrow green
+ vshrn.i16 d6, q0, #8 // Shift and narrow blue
+ vshrn.i16 d7, q1, #8 // Shift and narrow alpha
+ vst4.8 {d4-d7}, [r3]! // Write result to memory
+GoBack1:
+ pld [r1, #32] // Pre-load next eight source pixels
+ pld [r0, #32] // Pre-load next eight destination pixels
+ vmov r4, r5, d23 // Move alpha to ARM for test
+ orrs r4, r5 // Check if source alpha is fully transparent
+ beq AllZero2 // If so, jump to special case handling
+ vmovl.u8 q8, d20 // Expand source red to 16-bit
+ vmovl.u8 q9, d21 // Expand source green to 16-bit
+ vmovl.u8 q10, d22 // Expand source blue to 16-bit
+ vmovl.u8 q11, d23 // Expand source alpha to 16-bit
+ vmul.i16 q8, q8, q4 // Scale source red
+ subs r2, r2, #16 // Decrement loop counter
+ vmul.i16 q11, q11, q4 // Scale source alpha
+ vand q8, q5 // Mask low byte in red to avoid overflow in vmla
+ vmul.i16 q9, q9, q4 // Scale source green
+ vshr.u16 q0, q11, #8 // Extract scaled source alpha (high byte)
+ vmul.i16 q10, q10, q4 // Scale source blue
+ vand q11, q5 // Mask low byte in alpha to avoid overflow in vmla
+ vand q9, q5 // Mask low byte in green to avoid overflow in vmla
+ vand q10, q5 // Mask low byte in blue to avoid overflow in vmla
+ vsub.i16 q14, q15, q0 // Calculate destination scale (256 - source alpha)
+ vmovl.u8 q2, d24 // Expand destination red to 16-bit
+ vmovl.u8 q3, d25 // Expand destination green to 16-bit
+ vmovl.u8 q12, d26 // Expand destination blue to 16-bit
+ vmovl.u8 q13, d27 // Expand destination alpha to 16-bit
+ vmla.i16 q8, q2, q14 // Scale destination red and add to source
+ vmla.i16 q9, q3, q14 // Scale destination green and add to source
+ vmla.i16 q11, q13, q14 // Scale destination alpha and add to source
+ vld4.8 {d0-d3}, [r1]! // Pre-load next eight source RGBA pixels
+ vmla.i16 q10, q12, q14 // Scale destination blue, and add to source
+ vld4.8 {d4-d7}, [r0]! // Pre-load next eight destination RGBA pixels
+ vshrn.i16 d24, q8, #8 // Shift and narrow red
+ vshrn.i16 d25, q9, #8 // Shift and narrow green
+ vshrn.i16 d26, q10, #8 // Shift and narrow blue
+ vshrn.i16 d27, q11, #8 // Shift and narrow alpha
+ vst4.8 {d24-d27}, [r3]! // Write result to memory
+ bpl Loop
+PostLoop:
+ add r2, r2, #16 // Rebias loop counter for the eight-pixel tail loop
+ vmov.i16 q10, q4 // Copy global alpha to q10 before q4 is restored
+ ldmia sp!, {r4-r5} // Restore scratch ARM registers
+ vpop {q4-q5} // Restore callee-saved NEON registers
+LoopRemaining:
+ vmovl.u8 q8, d0 // Expand source red to 16-bit
+ vmovl.u8 q9, d1 // Expand source green to 16-bit
+ vmovl.u8 q0, d2 // Expand source blue to 16-bit
+ vmovl.u8 q1, d3 // Expand source alpha to 16-bit
+ vmul.i16 q8, q8, q10 // Scale source red
+ vmov.i16 q12, #0xFF00 // Set up mask constant
+ vmul.i16 q1, q1, q10 // Scale source alpha
+ vand q8, q12 // Mask low byte in red to avoid overflow in vmla
+ vmul.i16 q9, q9, q10 // Scale source green
+ vshr.u16 q11, q1, #8 // Extract scaled source alpha (high byte)
+ vmul.i16 q0, q0, q10 // Scale source blue
+ vand q1, q12 // Mask low byte in alpha to avoid overflow in vmla
+ vand q9, q12 // Mask low byte in green to avoid overflow in vmla
+ vand q0, q12 // Mask low byte in blue to avoid overflow in vmla
+ vsub.i16 q14, q15, q11 // Calculate destination scale (256 - source alpha)
+ vmovl.u8 q12, d4 // Expand destination red to 16-bit
+ vmovl.u8 q13, d5 // Expand destination green to 16-bit
+ vmovl.u8 q2, d6 // Expand destination blue to 16-bit
+ vmovl.u8 q3, d7 // Expand destination alpha to 16-bit
+ vmla.i16 q8, q12, q14 // Scale destination red and add to source
+ subs r2, r2, #8 // Decrement loop counter
+ vmla.i16 q9, q13, q14 // Scale destination green and add to source
+ vmla.i16 q1, q3, q14 // Scale destination alpha and add to source
+ vmla.i16 q0, q2, q14 // Scale destination blue and add to source
+ vshrn.i16 d4, q8, #8 // Shift and narrow red
+ vshrn.i16 d5, q9, #8 // Shift and narrow green
+ vshrn.i16 d6, q0, #8 // Shift and narrow blue
+ vshrn.i16 d7, q1, #8 // Shift and narrow alpha
+ vst4.8 {d4-d7}, [r3]! // Write result to memory
+ bxmi lr // Return if no pixels remain
+ vld4.8 {d0-d3}, [r1] // Load eight source RGBA pixels
+ vld4.8 {d4-d7}, [r0] // Load eight destination RGBA pixels
+ b LoopRemaining // Blend the final eight pixels
+AllZero1:
+ vld4.8 {d20-d23}, [r1]! // Pre-load next eight source RGBA pixels
+ vld4.8 {d24-d27}, [r0]! // Pre-load next eight destination RGBA pixels
+ add r3, r3, #32 // Advance destination write pointer
+ b GoBack1
+AllZero2:
+ vld4.8 {d0-d3}, [r1]! // Pre-load next eight source RGBA pixels
+ vld4.8 {d4-d7}, [r0]! // Pre-load next eight destination RGBA pixels
+ add r3, r3, #32 // Advance destination write pointer
+ subs r2, r2, #16 // Decrement loop counter
+ bpl Loop
+ b PostLoop
+
+/* Handle small blits, 0-8 pixels */
+BlitSmall:
+ beq Blit8 // Exactly eight pixels
+ pld [r1, #0] // Pre-load eight source pixels
+ pld [r0, #0] // Pre-load eight destination pixels
+ add r3, #1 // Bias global alpha to the 1..256 scale range
+ vdup.16 q13, r3 // Set up global alpha
+ ldr r3, =AlphaIndex
+ vmov.i16 q15, #256 // Set up alpha constant
+ vld1.8 {d29}, [r3] // Set up alpha index table
+ vmov.i16 q12, #0xFF00 // Set up mask constant
+ cmp r2, #1 // Check remaining pixel count
+ beq Blit1 // Exactly one pixel left
+ bxlt lr // Zero pixels left
+ /* loop for neon 2-pixel code */
+Blit2:
+ vld1.32 {d0}, [r1]! // Load two source RGBA pixels
+ vld1.32 {d1}, [r0] // Load two destination RGBA pixels
+ sub r2, r2, #2 // Decrement width counter
+ vmovl.u8 q8, d0 // Expand source to 16-bit
+ vmul.i16 q8, q8, q13 // Scale source pixels
+ vmovl.u8 q3, d1 // Expand destination to 16-bit
+ vtbl.8 d2, {d16, d17}, d29 // Spread out alpha to match pixel format
+ vand q8, q12 // Mask low byte to avoid overflow in vmla
+ vsubw.u8 q2, q15, d2 // Calculate inverse alpha (scale)
+ vmla.i16 q8, q3, q2 // Scale destination pixels and add to source
+ vshrn.i16 d0, q8, #8 // Shift and narrow result
+ vst1.32 {d0}, [r0]! // Store two RGBA pixels
+ cmp r2, #1 // Check remaining pixel count
+ bhi Blit2 // Still two or more pixels left
+ bxlt lr // Zero pixels left
+ /* code to handle any one last pixel */
+Blit1:
+ vld1.32 {d0[0]}, [r1] // Load one source RGBA pixel
+ vld1.32 {d1[0]}, [r0] // Load one destination RGBA pixel
+ vmovl.u8 q8, d0 // Expand source to 16-bit
+ vmul.i16 d16, d16, d26 // Scale source pixels
+ vmovl.u8 q3, d1 // Expand destination to 16-bit
+ vtbl.8 d2, {d16, d17}, d29 // Spread out alpha to match pixel format
+ vand d16, d24 // Mask low byte to avoid overflow in vmla
+ vsubw.u8 q2, q15, d2 // Calculate inverse alpha (scale)
+ vmla.i16 d16, d6, d4 // Scale destination pixels and add to source
+ vshrn.i16 d0, q8, #8 // Shift and narrow result
+ vst1.32 {d0[0]}, [r0] // Store one RGBA pixel
+ bx lr
+ /* Handle 8 pixels */
+Blit8:
+ add r3, #1 // Bias global alpha to the 1..256 scale range
+ sub r2, r2, #8 // Decrement loop counter
+ vdup.16 q10, r3 // Set up global alpha
+ mov r3, r0 // Backup destination pointer
+ vld4.8 {d0-d3}, [r1] // Load eight source RGBA pixels
+ vld4.8 {d4-d7}, [r0] // Load eight destination RGBA pixels
+ vmov.i16 q15, #256 // Set up alpha constant
+ b LoopRemaining
+
+ .endfunc
+
+ .data
+ .align
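+/*
+ * vtbl index table: bytes 7 and 15 are the high bytes of the 16-bit alpha
+ * lanes of the two widened pixels; the lookup replicates each scaled alpha
+ * across its pixel's four channel positions (used by Blit2 and Blit1).
+ */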
+AlphaIndex:
+ .byte 7, 7, 7, 7, 15, 15, 15, 15
+
+#endif
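The 0xFF00 masking used throughout is what keeps the 16-bit vmla accumulators from wrapping: for premultiplied sources (each colour channel at most the alpha channel), the masked scaled source plus the scaled destination fits exactly in 16 bits. A standalone, exhaustive check of that bound (hypothetical snippet, not part of the patch):

    #include <cassert>

    // Verify that ((srcA * scale) & 0xFF00) + 255 * (256 - ((srcA * scale) >> 8))
    // never exceeds 0xFFFF, i.e. the masked multiply-accumulate fits a 16-bit
    // lane. srcA is the worst-case channel value for premultiplied data
    // (channel <= alpha), and 255 the worst-case destination channel.
    int main() {
        for (unsigned scale = 1; scale <= 256; ++scale) {   // global alpha + 1
            for (unsigned srcA = 0; srcA <= 255; ++srcA) {
                unsigned a = srcA * scale;                  // scaled source alpha
                unsigned dst_scale = 256 - (a >> 8);
                unsigned worst = (a & 0xFF00) + 255 * dst_scale;
                assert(worst <= 0xFFFF);                    // holds with the mask
                // Without the mask, 'a + 255 * dst_scale' can exceed 0xFFFF
                // and wrap in a 16-bit lane.
            }
        }
        return 0;
    }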
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index f41c342..9ca2b77 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -1232,6 +1232,18 @@ static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
#define S32A_Opaque_BlitRow32_PROC NULL
#endif
+
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(ENABLE_OPTIMIZED_S32A_BLITTERS)
+
+/* External function in file S32A_Blend_BlitRow32_neon.S */
+extern "C" void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count, U8CPU alpha);
+
+#define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_neon
+
+#else
+
/*
* ARM asm version of S32A_Blend_BlitRow32
*/
@@ -1364,6 +1376,8 @@ static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
}
#define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_arm
+#endif
+
/* Neon version of S32_Blend_BlitRow32()
* portable version is in src/core/SkBlitRow_D32.cpp
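For context, the PROC macros defined above are consumed further down in SkBlitRow_opts_arm.cpp when Skia asks for platform-specific blitters. A sketch of that registration, assuming the file follows Skia's usual platform_32_procs pattern (the table itself is outside this hunk):

    // Sketch of the registration table later in SkBlitRow_opts_arm.cpp
    // (assumed layout; indices follow SkBlitRow's flags ordering).
    static const SkBlitRow::Proc32 platform_32_procs[] = {
        NULL,                         // opaque source, no global alpha
        S32_Blend_BlitRow32_PROC,     // global alpha only
        S32A_Opaque_BlitRow32_PROC,   // per-pixel alpha only
        S32A_Blend_BlitRow32_PROC,    // per-pixel alpha + global alpha
    };

    SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
        return platform_32_procs[flags];
    }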