aboutsummaryrefslogtreecommitdiffstats
path: root/src/opts
diff options
context:
space:
mode:
authorPer-Daniel Olsson <eperdol@steludxu213.lud.stericsson.com>2010-11-09 08:51:09 +0100
committerSteve Kondik <shade@chemlab.org>2012-07-19 15:43:49 -0700
commita9a4c66d163e245628ddde71dca40e4c29a44f47 (patch)
tree1a037135f98a8f8d59f3b1b5c0cedc74bcc56e3d /src/opts
parentec1ad08b13b4034af0ae2e65da7803250a7eece3 (diff)
downloadexternal_skia-a9a4c66d163e245628ddde71dca40e4c29a44f47.zip
external_skia-a9a4c66d163e245628ddde71dca40e4c29a44f47.tar.gz
external_skia-a9a4c66d163e245628ddde71dca40e4c29a44f47.tar.bz2
Neon optimized implementation of S16_opaque_D32_nofilter_DX
This patch does the following: * Neon ASM blitter from RGB565 to ABGR8888 without blend. * Special case for no scaling but the scaling case is almost as fast. Copyright (C) ST-Ericsson SA 2010 Change-Id: I0fae80b14aa5a5c7f6ff4c251bc1419ffa0d4751 Signed-off-by: Christian Bejram <christian.bejram@stericsson.com>
Diffstat (limited to 'src/opts')
-rw-r--r--src/opts/SkBitmapProcState_opts_arm.cpp204
1 files changed, 204 insertions, 0 deletions
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp
index 20d62e1..f7b89a9 100644
--- a/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -11,6 +11,11 @@
#include "SkColorPriv.h"
#include "SkUtils.h"
+#if defined(__ARM_HAVE_NEON)
+#include <arm_neon.h>
+#endif
+
+
#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
void SI8_D16_nofilter_DX_arm(
const SkBitmapProcState& s,
@@ -184,11 +189,201 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
}
#endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
+
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+void S16_opaque_D32_nofilter_DX_neon_asm(const SkBitmapProcState& s,
+ const uint32_t* __restrict__ xy,
+ int count, uint32_t* __restrict__ colors) {
+
+ const uint16_t* __restrict__ srcAddr = (const uint16_t*)s.fBitmap->getPixels();
+
+ uint16_t* index;
+ uint16_t src;
+ int i;
+
+ srcAddr = (const uint16_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes());
+
+ const uint16_t* __restrict__ xx = (const uint16_t*)(++xy);
+
+ if (1 == s.fBitmap->width()) {
+
+ src = srcAddr[0];
+ uint32_t dstValue = SkPixel16ToPixel32(src);
+ sk_memset32(colors, dstValue, count);
+ } else if ((xx[count-1] - xx[0]) == (count-1)) {
+ // No scaling
+ const uint16_t* src_data = (const uint16_t*)(srcAddr + xx[0]);
+ asm volatile (
+ "subs %[count], %[count], #8 \n\t" // count -= 8, set flag
+ "blt 2f \n\t" // if count < 0, branch to label 2
+ "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use.
+ "1: \n\t" // 8 loop
+ // Handle 8 pixels in one loop.
+
+ "vld1.u16 {q0}, [%[src_data]]! \n\t" // load eight src 565 pixels
+
+ "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2
+ "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3
+ "vmov.u16 q1, q8 \n\t" // copy alpha from q8
+ "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1
+ "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1
+ "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2
+ "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2
+ "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2
+ "vzip.16 q2, q1 \n\t" // interleave q1 and q2
+ "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst
+ "subs %[count], %[count], #8 \n\t" // count -= 8, set flag
+ "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q1 to dst
+
+ "bge 1b \n\t" // loop if count >= 0
+ "2: \n\t" // exit of 8 loop
+
+ "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed.
+ "blt 3f \n\t" // if count < 0, branch to label 3
+
+ // Handle 4 pixels at once
+
+ "vld1.u16 {d0}, [%[src_data]]! \n\t" // load eight src 565 pixels
+
+ "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2
+ "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1
+ "vmov.u16 d3, d16 \n\t" // copy alpha from d16
+ "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3
+ "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3
+ "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2
+ "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2
+ "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2
+ "vzip.16 d2, d3 \n\t" // interleave d2 and d3
+ "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst
+
+ "3: \n\t" // end
+ : [src_data] "+r" (src_data), [colors] "+r" (colors), [count] "+r" (count)
+ :
+ : "cc", "memory","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17"
+ );
+
+ for (i = (count & 3); i > 0; --i) {
+ *colors++ = SkPixel16ToPixel32(*src_data++);
+ }
+
+ } else {
+ // Scaling case
+ uint16_t data[8];
+
+ asm volatile (
+ "subs %[count], %[count], #8 \n\t" // count -= 8, set flag
+ "blt 2f \n\t" // if count < 0, branch to label 2
+ "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use.
+ "1: \n\t" // 8 loop
+ // Handle 8 pixels in one loop.
+ "ldmia %[xx]!, {r4, r5, r6, r7} \n\t" // load ptrs to pixels 0-7
+
+ "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer
+ "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer
+ "mov r6, r6, lsl #1 \n\t" // <<1 because of 16 bit pointer
+ "mov r7, r7, lsl #1 \n\t" // <<1 because of 16 bit pointer
+
+ "uxth r8, r4 \n\t" // extract ptr 0
+ "mov r4, r4, lsr #16 \n\t" // extract ptr 1
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image
+ "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image
+ "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register
+
+ "uxth r8, r5 \n\t" // extract ptr 2
+ "mov r5, r5, lsr #16 \n\t" // extract ptr 3
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image
+ "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image
+ "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register
+
+ "uxth r8, r6 \n\t" // extract ptr 4
+ "mov r6, r6, lsr #16 \n\t" // extract ptr 5
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
+ "ldrh r6, [%[srcAddr], r6] \n\t" // load pixel 5 from image
+ "pkhbt r6, r8, r6, lsl #16 \n\t" // combine pixel 4 and 5 in one register
+
+ "uxth r8, r7 \n\t" // extract ptr 6
+ "mov r7, r7, lsr #16 \n\t" // extract ptr 7
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 6 from image
+ "ldrh r7, [%[srcAddr], r7] \n\t" // load pixel 7 from image
+ "pkhbt r7, r8, r7, lsl #16 \n\t" // combine pixel 6 and 7 in one register
+
+ "stmia %[data], {r4, r5, r6, r7} \n\t" // store 8 src pixels
+
+ "vld1.u16 {q0}, [%[data]] \n\t" // load eight src 565 pixels
+
+ "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2
+ "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3
+ "vmov.u16 q1, q8 \n\t" // copy alpha from q8
+ "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1
+ "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1
+ "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2
+ "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2
+ "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2
+ "vzip.16 q2, q1 \n\t" // interleave q1 and q2
+ "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst
+ "subs %[count], %[count], #8 \n\t" // count -= 8, set flag
+ "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q2 to dst
+
+ "bge 1b \n\t" // loop if count >= 0
+ "2: \n\t" // exit of 8 loop
+
+ "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed.
+ "blt 3f \n\t" // if count < 0, branch to label 3
+
+ // Handle 4 pixels at once
+ "ldmia %[xx]!, {r4, r5} \n\t" // load ptrs to pixels 0-3
+
+ "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer
+ "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer
+
+ "uxth r8, r4 \n\t" // extract ptr 0
+ "mov r4, r4, lsr #16 \n\t" // extract ptr 1
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image
+ "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image
+ "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register
+
+ "uxth r8, r5 \n\t" // extract ptr 2
+ "mov r5, r5, lsr #16 \n\t" // extract ptr 3
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image
+ "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image
+ "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register
+
+ "stmia %[data], {r4, r5} \n\t" // store 4 src pixels
+
+ "vld1.u16 {d0}, [%[data]] \n\t" // load eight src 565 pixels
+
+ "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2
+ "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1
+ "vmov.u16 d3, d16 \n\t" // copy alpha from d16
+ "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3
+ "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3
+ "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2
+ "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2
+ "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2
+ "vzip.16 d2, d3 \n\t" // interleave d2 and d3
+ "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst
+
+ "3: \n\t" // End
+ : [xx] "+r" (xx), [colors] "+r" (colors), [count] "+r" (count)
+ : [data] "r" (data), [srcAddr] "r" (srcAddr)
+ : "cc", "memory","r4","r5","r6","r7","r8","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17"
+ );
+
+ for (i = (count & 3); i > 0; --i) {
+ src = srcAddr[*xx++]; *colors++ = SkPixel16ToPixel32(src);
+ }
+ }
+}
+#endif
+
+
///////////////////////////////////////////////////////////////////////////////
/* If we replace a sampleproc, then we null-out the associated shaderproc,
otherwise the shader won't even look at the matrix/sampler
*/
+
+
void SkBitmapProcState::platformProcs() {
bool doFilter = fDoFilter;
bool isOpaque = 256 == fAlphaScale;
@@ -214,6 +409,15 @@ void SkBitmapProcState::platformProcs() {
}
#endif
break;
+ case SkBitmap::kRGB_565_Config:
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+ if (justDx && !doFilter) {
+ if (isOpaque) {
+ fSampleProc32 = S16_opaque_D32_nofilter_DX_neon_asm;
+ }
+ }
+#endif
+ break;
default:
break;
}