path: root/src/opts/S32A_Blend_BlitRow32_neon.S
Diffstat (limited to 'src/opts/S32A_Blend_BlitRow32_neon.S')
-rw-r--r--  src/opts/S32A_Blend_BlitRow32_neon.S  262
1 file changed, 262 insertions, 0 deletions
diff --git a/src/opts/S32A_Blend_BlitRow32_neon.S b/src/opts/S32A_Blend_BlitRow32_neon.S
new file mode 100644
index 0000000..84f9846
--- /dev/null
+++ b/src/opts/S32A_Blend_BlitRow32_neon.S
@@ -0,0 +1,262 @@
+/*
+ * Copyright (C) ST-Ericsson SA 2010
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Neon optimized version of S32A_Blend_BlitRow32.
+ * Special cases for when alpha is zero or opaque.
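+ *
+ * Per-pixel math performed below (reference sketch; variable names are
+ * illustrative only):
+ *   scale     = global_alpha + 1                                  // 1..256
+ *   dst_scale = 256 - ((src_alpha * scale) >> 8)
+ *   result    = (((src * scale) & 0xFF00) + dst * dst_scale) >> 8 // per channel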
+ */
+
+#if defined(__ARM_HAVE_NEON) && defined(ENABLE_OPTIMIZED_S32A_BLITTERS)
+
+ .text
+ .fpu neon
+ .align
+
+ .global S32A_Blend_BlitRow32_neon
+ .func S32A_Blend_BlitRow32_neon
+
+S32A_Blend_BlitRow32_neon:
+ cmp r2, #8 // The main code requires more than 8 pixels
+ ble BlitSmall
+ /* Setup constants, and do the first 1-8 pixels */
+ vld4.8 {d20-d23}, [r1] // Load eight source RGBA pixels
+ vld4.8 {d24-d27}, [r0] // Load eight destination RGBA pixels
+ add r3, #1 // Modify global alpha to 0...256 range
+ vpush {q4-q5} // Save callee-saved NEON registers q4-q5
+ stmdb sp!, {r4-r5} // Save callee-saved ARM registers r4-r5
+ vmov.i16 q15, #256 // Set up alpha constant
+ vmov.i16 q5, #0xFF00 // Set up mask constant
+ vdup.16 q4, r3 // Set up global alpha
+ pld [r1, #32] // Pre-load next eight source pixels
+ pld [r0, #32] // Pre-load next eight destination pixels
+ ands r3, r2, #0x7 // Should we do a partial first iteration?
+ moveq r3, #8 // Count is a multiple of 8, do a full first iteration
+ vmovl.u8 q8, d20 // Expand source red to 16-bit
+ vmovl.u8 q9, d21 // Expand source green to 16-bit
+ vmovl.u8 q10, d22 // Expand source blue to 16-bit
+ vmovl.u8 q11, d23 // Expand source alpha to 16-bit
+ vmul.i16 q8, q8, q4 // Scale source red
+ vmul.i16 q11, q11, q4 // Scale source alpha
+ vand q8, q5 // Mask low byte in red to avoid overflow in vmla
+ vmul.i16 q9, q9, q4 // Scale source green
+ vshr.u16 q0, q11, #8 // Shift down scaled source alpha
+ vmul.i16 q10, q10, q4 // Scale source blue
+ vand q11, q5 // Mask low byte in alpha to avoid overflow in vmla
+ vand q9, q5 // Mask low byte in green to avoid overflow in vmla
+ vand q10, q5 // Mask low byte in blue to avoid overflow in vmla
+ vsub.i16 q14, q15, q0 // Calculate destination scale (256 - scaled source alpha)
+ vmovl.u8 q2, d24 // Expand destination red to 16-bit
+ vmovl.u8 q3, d25 // Expand destination green to 16-bit
+ vmovl.u8 q12, d26 // Expand destination blue to 16-bit
+ vmovl.u8 q13, d27 // Expand destination alpha to 16-bit
+ vmla.i16 q8, q2, q14 // Scale destination red, and add to source
+ mov r4, r0 // Backup destination pointer
+ add r1, r3, lsl #2 // Increment source pointer
+ sub r2, r2, r3 // Decrement loop counter
+ vmla.i16 q9, q3, q14 // Scale destination green, and add to source
+ add r0, r3, lsl #2 // Increment destination pointer
+ pld [r1, #32] // Pre-load next eight source pixels
+ pld [r0, #32] // Pre-load next eight destination pixels
+ mov r3, r0 // Backup destination pointer
+ vmla.i16 q11, q13, q14 // Scale destination alpha, and add to source
+ vld4.8 {d0-d3}, [r1]! // Pre-load next eight source RGBA pixels
+ subs r2, r2, #24 // Decrement loop counter
+ vmla.i16 q10, q12, q14 // Scale destination blue, and add to source
+ vld4.8 {d4-d7}, [r0]! // Pre-load next eight destination RGBA pixels
+ vshrn.i16 d24, q8, #8 // Shift and narrow red
+ vshrn.i16 d25, q9, #8 // Shift and narrow green
+ vshrn.i16 d26, q10, #8 // Shift and narrow blue
+ vshrn.i16 d27, q11, #8 // Shift and narrow alpha
+ vst4.8 {d24-d27}, [r4] // Write result to memory
+ bmi PostLoop // Do we have enough pixels to enter the main loop?
+ /* Main loop, blitting 16 pixels per iteration */
+Loop:
+ pld [r1, #32] // Pre-load next eight source pixels
+ pld [r0, #32] // Pre-load next eight destination pixels
+ vmov r4, r5, d3 // Move alpha to ARM for test
+ orrs r4, r5 // Check if source alpha is fully transparent
+ beq AllZero1 // If so, jump to special case handling
+ vmovl.u8 q8, d0 // Expand source red to 16-bit
+ vmovl.u8 q9, d1 // Expand source green to 16-bit
+ vmovl.u8 q0, d2 // Expand source blue to 16-bit
+ vmovl.u8 q1, d3 // Expand source alpha to 16-bit
+ vmul.i16 q8, q8, q4 // Scale source red
+ vmul.i16 q1, q1, q4 // Scale source alpha
+ vand q8, q5 // Mask low byte in red to avoid overflow in vmla
+ vmul.i16 q9, q9, q4 // Scale source green
+ vshr.u16 q10, q1, #8 // Shift down scaled source alpha
+ vmul.i16 q0, q0, q4 // Scale source blue
+ vand q1, q5 // Mask low byte in alpha to avoid overflow in vmla
+ vand q9, q5 // Mask low byte in green to avoid overflow in vmla
+ vand q0, q5 // Mask low byte in blue to avoid overflow in vmla
+ vsub.i16 q14, q15, q10 // Calculate destination scale (256 - scaled source alpha)
+ vmovl.u8 q12, d4 // Expand destination red to 16-bit
+ vmovl.u8 q13, d5 // Expand destination green to 16-bit
+ vmovl.u8 q2, d6 // Expand destination blue to 16-bit
+ vmovl.u8 q3, d7 // Expand destination alpha to 16-bit
+ vmla.i16 q8, q12, q14 // Scale destination red and add to source
+ vmla.i16 q9, q13, q14 // Scale destination green and add to source
+ vld4.8 {d20-d23}, [r1]! // Pre-load next eight source RGBA pixels
+ vmla.i16 q1, q3, q14 // Scale destination alpha and add to source
+ vmla.i16 q0, q2, q14 // Scale destination blue and add to source
+ vld4.8 {d24-d27}, [r0]! // Pre-load next eight destination RGBA pixels
+ vshrn.i16 d4, q8, #8 // Shift and narrow red
+ vshrn.i16 d5, q9, #8 // Shift and narrow green
+ vshrn.i16 d6, q0, #8 // Shift and narrow blue
+ vshrn.i16 d7, q1, #8 // Shift and narrow alpha
+ vst4.8 {d4-d7}, [r3]! // Write result to memory
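+ /* Second half of the unrolled main loop: blend the next eight pixels */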
+GoBack1:
+ pld [r1, #32] // Pre-load next eight source pixels
+ pld [r0, #32] // Pre-load next eight destination pixels
+ vmov r4, r5, d23 // Move alpha to ARM for test
+ orrs r4, r5 // Check if source alpha is fully transparent
+ beq AllZero2 // If so, jump to special case handling
+ vmovl.u8 q8, d20 // Expand source red to 16-bit
+ vmovl.u8 q9, d21 // Expand source green to 16-bit
+ vmovl.u8 q10, d22 // Expand source blue to 16-bit
+ vmovl.u8 q11, d23 // Expand source alpha to 16-bit
+ vmul.i16 q8, q8, q4 // Scale source red
+ subs r2, r2, #16 // Decrement loop counter
+ vmul.i16 q11, q11, q4 // Scale source alpha
+ vand q8, q5 // Mask low byte in red to avoid overflow in vmla
+ vmul.i16 q9, q9, q4 // Scale source green
+ vshr.u16 q0, q11, #8 // Shift down scaled source alpha
+ vmul.i16 q10, q10, q4 // Scale source blue
+ vand q11, q5 // Mask low byte in alpha to avoid overflow in vmla
+ vand q9, q5 // Mask low byte in green to avoid overflow in vmla
+ vand q10, q5 // Mask low byte in blue to avoid overflow in vmla
+ vsub.i16 q14, q15, q0 // Calculate destination scale (256 - scaled source alpha)
+ vmovl.u8 q2, d24 // Expand destination red to 16-bit
+ vmovl.u8 q3, d25 // Expand destination green to 16-bit
+ vmovl.u8 q12, d26 // Expand destination blue to 16-bit
+ vmovl.u8 q13, d27 // Expand destination alpha to 16-bit
+ vmla.i16 q8, q2, q14 // Scale destination red and add to source
+ vmla.i16 q9, q3, q14 // Scale destination green and add to source
+ vmla.i16 q11, q13, q14 // Scale destination alpha and add to source
+ vld4.8 {d0-d3}, [r1]! // Pre-load next eight source RGBA pixels
+ vmla.i16 q10, q12, q14 // Scale destination blue, and add to source
+ vld4.8 {d4-d7}, [r0]! // Pre-load next eight destination RGBA pixels
+ vshrn.i16 d24, q8, #8 // Shift and narrow red
+ vshrn.i16 d25, q9, #8 // Shift and narrow green
+ vshrn.i16 d26, q10, #8 // Shift and narrow blue
+ vshrn.i16 d27, q11, #8 // Shift and narrow alpha
+ vst4.8 {d24-d27}, [r3]! // Write result to memory
+ bpl Loop
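+ /* Main loop done: restore saved registers and finish the remaining pixels */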
+PostLoop:
+ add r2, r2, #16 // Add back the main loop's 16-pixel pre-decrement
+ vmov.i16 q10, q4 // Move global alpha scale to q10 before q4 is restored
+ ldmia sp!, {r4-r5} // Restore ARM registers r4-r5
+ vpop {q4-q5} // Restore NEON registers q4-q5
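+ /* Tail loop: blend eight pixels per iteration, global alpha scale in q10 */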
+LoopRemaining:
+ vmovl.u8 q8, d0 // Expand source red to 16-bit
+ vmovl.u8 q9, d1 // Expand source green to 16-bit
+ vmovl.u8 q0, d2 // Expand source blue to 16-bit
+ vmovl.u8 q1, d3 // Expand source alpha to 16-bit
+ vmul.i16 q8, q8, q10 // Scale source red
+ vmov.i16 q12, #0xFF00 // Set up mask constant
+ vmul.i16 q1, q1, q10 // Scale source alpha
+ vand q8, q12 // Mask low byte in red to avoid overflow in vmla
+ vmul.i16 q9, q9, q10 // Scale source green
+ vshr.u16 q11, q1, #8 // Shift down scaled source alpha
+ vmul.i16 q0, q0, q10 // Scale source blue
+ vand q1, q12 // Mask low byte in alpha to avoid overflow in vmla
+ vand q9, q12 // Mask low byte in green to avoid overflow in vmla
+ vand q0, q12 // Mask low byte in blue to avoid overflow in vmla
+ vsub.i16 q14, q15, q11 // Calculate destination scale (256 - scaled source alpha)
+ vmovl.u8 q12, d4 // Expand destination red to 16-bit
+ vmovl.u8 q13, d5 // Expand destination green to 16-bit
+ vmovl.u8 q2, d6 // Expand destination blue to 16-bit
+ vmovl.u8 q3, d7 // Expand destination alpha to 16-bit
+ vmla.i16 q8, q12, q14 // Scale destination red and add to source
+ subs r2, r2, #8 // Decrement loop counter
+ vmla.i16 q9, q13, q14 // Scale destination green and add to source
+ vmla.i16 q1, q3, q14 // Scale destination alpha and add to source
+ vmla.i16 q0, q2, q14 // Scale destination blue and add to source
+ vshrn.i16 d4, q8, #8 // Shift and narrow red
+ vshrn.i16 d5, q9, #8 // Shift and narrow green
+ vshrn.i16 d6, q0, #8 // Shift and narrow blue
+ vshrn.i16 d7, q1, #8 // Shift and narrow alpha
+ vst4.8 {d4-d7}, [r3]! // Write result to memory
+ bxmi lr
+ vld4.8 {d0-d3}, [r1] // Load eight source RGBA pixels
+ vld4.8 {d4-d7}, [r0] // Load eight destination RGBA pixels
+ b LoopRemaining
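+ /* Source alpha is zero for all eight pixels (premultiplied, so they contribute nothing): leave destination unchanged, just advance and reload */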
+AllZero1:
+ vld4.8 {d20-d23}, [r1]! // Pre-load next eight source RGBA pixels
+ vld4.8 {d24-d27}, [r0]! // Pre-load next eight destination RGBA pixels
+ add r3, r3, #32 // Advance destination write pointer
+ b GoBack1
+AllZero2:
+ vld4.8 {d0-d3}, [r1]! // Pre-load next eight source RGBA pixels
+ vld4.8 {d4-d7}, [r0]! // Pre-load next eight destination RGBA pixels
+ add r3, r3, #32 // Advance destination write pointer
+ subs r2, r2, #16 // Decrement loop counter
+ bpl Loop
+ b PostLoop
+
+/* Handle small blits, 0-8 pixels */
+BlitSmall:
+ beq Blit8
+ pld [r1, #0] // Pre-load eight source pixels
+ pld [r0, #0] // Pre-load eight destination pixels
+ add r3, #1 // Modify global alpha to 0...256 range
+ vdup.16 q13, r3 // Set up global alpha
+ ldr r3, =AlphaIndex
+ vmov.i16 q15, #256 // Set up alpha constant
+ vld1.8 {d29}, [r3] // Set up alpha index table
+ vmov.i16 q12, #0xFF00 // Set up mask constant
+ cmp r2, #1 // Exit if count is zero
+ beq Blit1
+ bxlt lr // Zero pixels left
+ /* loop for neon 2-pixel code */
+Blit2:
+ vld1.32 {d0}, [r1]! // Load two source RGBA pixels
+ vld1.32 {d1}, [r0] // Load two destination RGBA pixels
+ sub r2, r2, #2 // Decrement width counter
+ vmovl.u8 q8, d0 // Expand source to 16-bit
+ vmul.i16 q8, q8, q13 // Scale source pixels
+ vmovl.u8 q3, d1 // Expand destination to 16-bit
+ vtbl.8 d2, {d16, d17}, d29 // Spread out alpha to match pixel format
+ vand q8, q12 // Mask low byte to avoid overflow in vmla
+ vsubw.u8 q2, q15, d2 // Calculate inverse alpha (scale)
+ vmla.i16 q8, q3, q2 // Scale destination pixels and add to source
+ vshrn.i16 d0, q8, #8 // Shift and narrow result
+ vst1.32 {d0}, [r0]! // Store two RGBA pixels
+ cmp r2, #1 // Exit if count is zero
+ bhi Blit2 // Still two or more pixels left
+ bxlt lr // Zero pixels left
+ /* code to handle any one last pixel */
+Blit1:
+ vld1.32 {d0[0]}, [r1] // Load one source RGBA pixel
+ vld1.32 {d1[0]}, [r0] // Load one destination RGBA pixel
+ vmovl.u8 q8, d0 // Expand source to 16-bit
+ vmul.i16 d16, d16, d26 // Scale source pixels
+ vmovl.u8 q3, d1 // Expand destination to 16-bit
+ vtbl.8 d2, {d16, d17}, d29 // Spread out alpha to match pixel format
+ vand d16, d24 // Mask low byte to avoid overflow in vmla
+ vsubw.u8 q2, q15, d2 // Calculate inverse alpha (scale)
+ vmla.i16 d16, d6, d4 // Scale destination pixels and add to source
+ vshrn.i16 d0, q8, #8 // Shift and narrow result
+ vst1.32 {d0[0]}, [r0] // Store one RGBA pixel
+ bx lr
+ /* Handle 8 pixels */
+Blit8:
+ add r3, #1 // Modify global alpha to 0...256 range
+ sub r2, r2, #8 // Decrement loop counter
+ vdup.16 q10, r3 // Set up global alpha
+ mov r3, r0 // Backup destination pointer
+ vld4.8 {d0-d3}, [r1] // Load eight source RGBA pixels
+ vld4.8 {d4-d7}, [r0] // Load eight destination RGBA pixels
+ vmov.i16 q15, #256
+ b LoopRemaining
+
+ .endfunc
+
+ .data
+ .align
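+/* vtbl indices: byte 7 of d16 and byte 15 of d17 hold the high byte of each
+ * pixel's scaled alpha; the table replicates it across that pixel's four
+ * channel lanes. */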
+AlphaIndex:
+ .byte 7, 7, 7, 7, 15, 15, 15, 15
+
+#endif
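
For reference, a scalar C sketch of the per-pixel operation the routine above implements. This is an assumed equivalent written for illustration, not the original Skia code; the function name is made up, and it assumes the alpha channel sits in the most significant byte of each little-endian 32-bit pixel (matching the fourth de-interleaved plane the NEON code treats as alpha).

#include <stdint.h>

/* Illustrative scalar equivalent (assumption: premultiplied pixels, alpha in
 * the top byte). Mirrors the NEON math, including the truncation to 16 bits
 * before the final shift. */
static void s32a_blend_row_ref(uint32_t *dst, const uint32_t *src,
                               int count, unsigned global_alpha)
{
    unsigned scale = global_alpha + 1;                 /* 1..256 */
    for (int i = 0; i < count; i++) {
        uint32_t s = src[i], d = dst[i];
        unsigned dst_scale = 256 - (((s >> 24) * scale) >> 8);
        uint32_t out = 0;
        for (int shift = 0; shift < 32; shift += 8) {
            unsigned sc = (s >> shift) & 0xFF;         /* source channel */
            unsigned dc = (d >> shift) & 0xFF;         /* destination channel */
            unsigned v = ((sc * scale) & 0xFF00) + dc * dst_scale;
            out |= (uint32_t)((v >> 8) & 0xFF) << shift;
        }
        dst[i] = out;
    }
}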