aboutsummaryrefslogtreecommitdiffstats
path: root/src/opts/S32A_D565_Opaque_arm.S
diff options
context:
space:
mode:
Diffstat (limited to 'src/opts/S32A_D565_Opaque_arm.S')
-rw-r--r--src/opts/S32A_D565_Opaque_arm.S325
1 files changed, 325 insertions, 0 deletions
diff --git a/src/opts/S32A_D565_Opaque_arm.S b/src/opts/S32A_D565_Opaque_arm.S
new file mode 100644
index 0000000..9576521
--- /dev/null
+++ b/src/opts/S32A_D565_Opaque_arm.S
@@ -0,0 +1,325 @@
+/*
+ * Copyright 2006, The Android Open Source Project
+ * Copyright (c) 2009, Code Aurora Forum.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * This file is derived from libpixelflinger version of BLIT routine.
+ * Algorithm used for BLIT operation here is equivalent to the one in
+ * C function, S32A_D565_Opaque. Use neon instructions to process 16 pixels
+ * at-a-time on armv7. If the number of pixels is less than 16 and/or the
+ * architecture is armv6 and below, use regular arm instructions. Regular
+ * arm code combines two 16-bit writes into one 32-bit write to destination,
+ * uses destination and source pre-loads, and unrolls the main loop thrice.
+ */
+ .text // everything below is assembled into the code section
+ .align // no explicit boundary is given here, so the assembler default applies
+
+ .global S32A_D565_Opaque_arm // export the entry point so other objects can link against it
+
+/*
+ * pixel DREG, SRC, FB, OFFSET
+ *
+ * Blends one 32-bit source pixel (SRC = AABBGGRR) over one RGB565
+ * destination pixel and places the 16-bit result in FB.
+ *
+ *   DREG    register holding the destination pixel at bit position OFFSET
+ *   SRC     register holding the source pixel
+ *   FB      register that receives the result at bit position OFFSET
+ *   OFFSET  0: FB is written from scratch
+ *           non-zero: the result is ORed into FB, existing bits are kept.
+ *           The non-zero red path does not mask after its shift, so it
+ *           relies on the pixel sitting in the top half-word; the
+ *           invocations in this file pass 16.
+ *
+ * Expects r8 = 0x10, r9 = 0x20, r10 = 0xFF on entry.
+ * Clobbers r6, r7, lr and the condition flags.
+ *
+ * Per channel, with isa = 255 - source alpha, d = destination channel,
+ * s = 8-bit source channel and N = 5 (red, blue) or 6 (green):
+ *     t = isa * d + (1 << (N - 1))
+ *     t = t + (t >> N)
+ *     result = (s + (t >> N)) >> (8 - N)
+ */
+.macro pixel, DREG, SRC, FB, OFFSET
+
+    subs    r7, r10, \SRC, lsr #24              // r7 = isa = 255 - source alpha
+    beq     1f                                  // isa == 0: source is opaque, just convert it
+
+.if \OFFSET
+
+    // ---- red: destination pixel lives in the upper half-word ----
+    mov     lr, \DREG, lsr #(\OFFSET + 11)      // top five bits of the pixel
+    smlabb  lr, r7, lr, r8                      // isa * dR + 0x10
+    and     r6, r10, \SRC                       // source red
+    add     lr, lr, lr, lsr #5
+    add     lr, r6, lr, lsr #5
+    mov     lr, lr, lsr #3                      // back down to five bits
+    orr     \FB, \FB, lr, lsl #(\OFFSET + 11)
+
+    // ---- green ----
+    and     r6, \DREG, #(0x3F << (\OFFSET + 5))
+    mov     r6, r6, lsr #5                      // green now sits in the top half of r6
+    smlabt  r6, r7, r6, r9                      // isa * dG + 0x20 (uses the top half of r6)
+    and     lr, r10, \SRC, lsr #8               // source green
+    add     r6, r6, r6, lsr #6
+    add     r6, lr, r6, lsr #6
+    mov     r6, r6, lsr #2                      // back down to six bits
+    orr     \FB, \FB, r6, lsl #(\OFFSET + 5)
+
+    // ---- blue ----
+    and     lr, \DREG, #(0x1F << \OFFSET)
+    smlabt  lr, r7, lr, r8                      // isa * dB + 0x10 (uses the top half of lr)
+    and     r6, r10, \SRC, lsr #16              // source blue
+    add     lr, lr, lr, lsr #5
+    add     lr, r6, lr, lsr #5
+    mov     lr, lr, lsr #3
+    orr     \FB, \FB, lr, lsl #\OFFSET
+
+.else
+
+    // ---- red: destination pixel lives in the lower half-word ----
+    mov     lr, \DREG, lsr #11
+    and     lr, lr, #0x1F                       // drop whatever sits above this pixel
+    smlabb  lr, r7, lr, r8                      // isa * dR + 0x10
+    and     r6, r10, \SRC                       // source red
+    add     lr, lr, lr, lsr #5
+    add     lr, r6, lr, lsr #5
+    mov     lr, lr, lsr #3
+    mov     \FB, lr, lsl #11                    // first write: FB starts from scratch
+
+    // ---- green ----
+    and     r6, \DREG, #(0x3F << 5)
+    mov     r6, r6, lsr #5
+    smlabb  r6, r7, r6, r9                      // isa * dG + 0x20
+    and     lr, r10, \SRC, lsr #8               // source green
+    add     r6, r6, r6, lsr #6
+    add     r6, lr, r6, lsr #6
+    mov     r6, r6, lsr #2
+    orr     \FB, \FB, r6, lsl #5
+
+    // ---- blue ----
+    and     lr, \DREG, #0x1F
+    smlabb  lr, r7, lr, r8                      // isa * dB + 0x10
+    and     r6, r10, \SRC, lsr #16              // source blue
+    add     lr, lr, lr, lsr #5
+    add     lr, r6, lr, lsr #5
+    orr     \FB, \FB, lr, lsr #3                // final shift folded into the merge
+
+.endif
+    b       2f
+
+    /*
+     * Source alpha is 255: the destination does not contribute, so the
+     * 24-bit source colour is only narrowed to RGB565.
+     */
+1:
+    mov     r6, \SRC, lsl #8                    // r6 = BBGGRR00
+    mov     lr, \SRC, lsr #5
+    and     r7, r6, #0xf800                     // five red bits at 15:11
+    and     lr, lr, #0x7e0                      // six green bits at 10:5
+    orr     lr, lr, r7
+
+.if \OFFSET
+    orr     lr, lr, r6, lsr #27                 // five blue bits at 4:0
+    orr     \FB, \FB, lr, lsl #(\OFFSET)
+.else
+    orr     \FB, lr, r6, lsr #27                // five blue bits at 4:0, FB written fresh
+.endif
+
+2:
+.endm
+
+
+/*
+ * S32A_D565_Opaque_arm
+ *
+ * Blends a run of 32-bit source pixels over 16-bit RGB565 destination
+ * pixels, writing the result back to the destination.
+ *
+ * In:
+ *   r0 = dst ptr (2 bytes per pixel)
+ *   r1 = src ptr (4 bytes per pixel)
+ *   r2 = count, in pixels (tested as an unsigned value throughout)
+ *
+ * Register roles in the scalar path:
+ *   r3      destination word loaded from [r0]
+ *   r4, r5  source pixels s0, s1
+ *   r6, r7  scratch inside the pixel macro
+ *   r8      0x10  (rounding term for the 5-bit channels)
+ *   r9      0x20  (rounding term for the 6-bit channel)
+ *   r10     0xFF
+ *   r11     unused
+ *   r12     blended result assembled by the pixel macro
+ *   lr      scratch inside the pixel macro (saved/restored on the stack)
+ *
+ * r4-r10 and lr are saved on entry and restored on every return path.
+ * When the NEON path is compiled in, q4-q7 (d8-d15) are used as well;
+ * they are callee-saved under the AAPCS and are therefore pushed before
+ * the NEON loop and popped before falling into the scalar tail.
+ *
+ * NOTE(review): the pair loop reads and writes dst with 32-bit ldr/str,
+ * so it presumably expects dst to be 4-byte aligned or unaligned word
+ * access to be permitted - confirm against the callers.
+ */
+    .type   S32A_D565_Opaque_arm, %function
+S32A_D565_Opaque_arm:
+    stmfd   sp!, {r4-r10, lr}
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+    subs    r2, r2, #16
+
+    // fewer than 16 pixels: no NEON register is touched on this exit
+    blo     blit_less_than_16_left
+
+    // d8-d15 must survive this call; every exit from the NEON loop goes
+    // through blit_neon_done, which restores them.
+    vpush   {d8-d15}
+
+    vmov.u16    q12, #0x80                      // rounding term for the widening multiplies
+    vmov.u8     q13, #0xf8                      // mask for the five red bits of the high byte
+
+blit_neon_loop:
+    /*
+     * Load 64 bytes from source and 32 bytes from destination
+     * note that source pixels are 4 bytes wide and
+     * destination pixels are 2 bytes wide.
+     *
+     * The de-interleaving load splits the four bytes of each source
+     * pixel: q1 = byte 0, q2 = byte 1, q3 = byte 2, q4 = byte 3.
+     * With SRC = AABBGGRR in a little-endian word that is
+     * q1 = red, q2 = green, q3 = blue, q4 = alpha.
+     */
+    vld4.8      {d2, d4, d6, d8}, [r1]!
+    vld4.8      {d3, d5, d7, d9}, [r1]!
+
+    // AND the alpha bytes of all 16 pixels; all-ones means every alpha is 255
+    vand.8      d10, d8, d9
+    vmov        r3, r4, d10
+
+    cmp         r3, #0xffffffff
+    cmpeq       r4, #0xffffffff
+    bne         blit_alpha_not_255
+
+    // alpha equals 255 case: narrow the source colour to RGB565
+
+    vshl.u8     q0, q2, #3                      // q0 = G << 3
+
+    subs        r2, r2, #16                     // flags consumed by the branch below
+
+    vsri.u8     q1, q2, #5                      // high byte: 5 bits of R, upper 3 bits of G
+    vsri.u8     q0, q3, #3                      // low byte: next 3 bits of G, 5 bits of B
+
+    // store the rgb destination values back to memory
+    vst2.8      {d0, d2}, [r0]!
+    vst2.8      {d1, d3}, [r0]!
+
+    bhs         blit_neon_loop                  // at least 16 pixels still to do
+    b           blit_neon_done
+
+blit_alpha_not_255:
+    // alpha = 255 - alpha
+    vmvn.u8     q0, q4
+
+    // q5 = low bytes, q6 = high bytes of the 16 destination pixels
+    vld2.8      {q5, q6}, [r0]
+
+    vshl.u8     q7, q6, #3
+
+    subs        r2, r2, #16                     // flags consumed by the branch at the end
+
+    vand.u8     q6, q6, q13                     // q6 = destination red in the top 5 bits
+
+    vmov.16     q8, q12
+    vmov.16     q9, q12
+
+    vsri.u8     q7, q5, #5
+    vshl.u8     q5, q5, #3                      // q5 = destination blue in the top 5 bits
+
+    // red: 0x80 + (255 - alpha) * red
+    vmlal.u8    q8, d0, d12
+    vmlal.u8    q9, d1, d13
+
+    vshl.u8     q7, q7, #2                      // q7 = destination green in the top 6 bits
+
+    vshr.u16    q10, q8, #5
+    vshr.u16    q11, q9, #5
+    vaddhn.u16  d12, q8, q10
+    vaddhn.u16  d13, q9, q11
+
+    // green: 0x80 + (255 - alpha) * green
+    vmov.16     q8, q12
+    vmov.16     q9, q12
+    vmlal.u8    q8, d0, d14
+    vmlal.u8    q9, d1, d15
+
+    vqadd.u8    q6, q6, q1                      // add source red, saturating
+
+    vshr.u16    q10, q8, #6
+    vshr.u16    q11, q9, #6
+    vaddhn.u16  d14, q8, q10
+    vaddhn.u16  d15, q9, q11
+
+    // blue: 0x80 + (255 - alpha) * blue
+    vmov.16     q8, q12
+    vmov.16     q9, q12
+    vmlal.u8    q8, d0, d10
+    vmlal.u8    q9, d1, d11
+
+    vqadd.u8    q7, q7, q2                      // add source green, saturating
+
+    vshl.u8     q5, q7, #3                      // start the low byte from green
+
+    vshr.u16    q10, q8, #5
+    vshr.u16    q11, q9, #5
+
+    vsri.u8     q6, q7, #5                      // high byte: red, then upper green bits
+
+    vaddhn.u16  d16, q8, q10
+    vaddhn.u16  d17, q9, q11
+    vqadd.u8    q8, q8, q3                      // add source blue, saturating
+
+    vsri.u8     q5, q8, #3                      // low byte: green bits, then blue
+
+    // store the rgb destination values back to memory
+    vst2.8      {d10, d12}, [r0]!
+    vst2.8      {d11, d13}, [r0]!
+
+    bhs         blit_neon_loop                  // at least 16 pixels still to do
+
+blit_neon_done:
+    // restore the callee-saved NEON registers before the scalar tail
+    vpop        {d8-d15}
+#endif
+
+blit_less_than_16_left:
+    pld     [r1]
+
+    mov     r8, #0x10
+    mov     r9, #0x20
+    mov     r10, #0xFF
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+    // r2 holds (remaining - 16) here; turn it into (remaining - 2)
+    adds    r2, r2, #14
+#else
+    subs    r2, r2, #2
+#endif
+
+    pld     [r0]
+    // fewer than two pixels left
+    blo     9f
+
+    // The main loop is unrolled three times; each pass handles two
+    // pixels, so a full trip processes six.
+8:  ldmia   r1!, {r4, r5}
+    // stream the source
+    pld     [r1, #32]
+    add     r0, r0, #4
+    // both source pixels are all zero, skip this pair
+    orrs    r3, r4, r5
+    beq     7f
+
+    // load the destination
+    ldr     r3, [r0, #-4]
+    // stream the destination
+    pld     [r0, #32]
+    pixel   r3, r4, r12, 0
+    pixel   r3, r5, r12, 16
+    // effectively, we are getting write-combining by virtue of the
+    // write-back cache of the cpu.
+    str     r12, [r0, #-4]
+
+    // 2nd pass of the loop, do not stream anything
+    subs    r2, r2, #2
+    blo     9f                                  // unsigned, like every other test on the count
+    ldmia   r1!, {r4, r5}
+    add     r0, r0, #4
+    orrs    r3, r4, r5
+    beq     7f
+    ldr     r3, [r0, #-4]
+    pixel   r3, r4, r12, 0
+    pixel   r3, r5, r12, 16
+    str     r12, [r0, #-4]
+
+    // 3rd pass of the loop, do not stream anything
+    subs    r2, r2, #2
+    blo     9f                                  // unsigned, like every other test on the count
+    ldmia   r1!, {r4, r5}
+    add     r0, r0, #4
+    orrs    r3, r4, r5
+    beq     7f
+    ldr     r3, [r0, #-4]
+    pixel   r3, r4, r12, 0
+    pixel   r3, r5, r12, 16
+    str     r12, [r0, #-4]
+
+7:  subs    r2, r2, #2
+    blo     9f
+    b       8b
+
+    // r2 is -2 (nothing left) or -1 (one pixel left) at this point
+9:  adds    r2, r2, #1
+    ldmlofd sp!, {r4-r10, lr}                   // return
+    bxlo    lr
+
+    // last pixel left
+    ldr     r4, [r1], #4
+    ldrh    r3, [r0]
+    pixel   r3, r4, r12, 0
+    strh    r12, [r0], #2
+    ldmfd   sp!, {r4-r10, lr}                   // return
+    bx      lr
+
+    .size   S32A_D565_Opaque_arm, . - S32A_D565_Opaque_arm