diff options
author | Per-Daniel Olsson <eperdol@steludxu213.lud.stericsson.com> | 2010-11-09 08:51:09 +0100 |
---|---|---|
committer | Steve Kondik <shade@chemlab.org> | 2012-07-19 15:43:49 -0700 |
commit | a9a4c66d163e245628ddde71dca40e4c29a44f47 (patch) | |
tree | 1a037135f98a8f8d59f3b1b5c0cedc74bcc56e3d /src/opts | |
parent | ec1ad08b13b4034af0ae2e65da7803250a7eece3 (diff) | |
download | external_skia-a9a4c66d163e245628ddde71dca40e4c29a44f47.zip external_skia-a9a4c66d163e245628ddde71dca40e4c29a44f47.tar.gz external_skia-a9a4c66d163e245628ddde71dca40e4c29a44f47.tar.bz2 |
Neon optimized implementation of S16_opaque_D32_nofilter_DX
This patch does the following:
* Neon ASM blitter from RGB565 to ABGR8888 without blend.
* Special case for no scaling but the scaling case is almost as fast.
Copyright (C) ST-Ericsson SA 2010
Change-Id: I0fae80b14aa5a5c7f6ff4c251bc1419ffa0d4751
Signed-off-by: Christian Bejram <christian.bejram@stericsson.com>
Diffstat (limited to 'src/opts')
-rw-r--r-- | src/opts/SkBitmapProcState_opts_arm.cpp | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp index 20d62e1..f7b89a9 100644 --- a/src/opts/SkBitmapProcState_opts_arm.cpp +++ b/src/opts/SkBitmapProcState_opts_arm.cpp @@ -11,6 +11,11 @@ #include "SkColorPriv.h" #include "SkUtils.h" +#if defined(__ARM_HAVE_NEON) +#include <arm_neon.h> +#endif + + #if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) void SI8_D16_nofilter_DX_arm( const SkBitmapProcState& s, @@ -184,11 +189,201 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s, } #endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) + +#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) +void S16_opaque_D32_nofilter_DX_neon_asm(const SkBitmapProcState& s, + const uint32_t* __restrict__ xy, + int count, uint32_t* __restrict__ colors) { + + const uint16_t* __restrict__ srcAddr = (const uint16_t*)s.fBitmap->getPixels(); + + uint16_t* index; + uint16_t src; + int i; + + srcAddr = (const uint16_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes()); + + const uint16_t* __restrict__ xx = (const uint16_t*)(++xy); + + if (1 == s.fBitmap->width()) { + + src = srcAddr[0]; + uint32_t dstValue = SkPixel16ToPixel32(src); + sk_memset32(colors, dstValue, count); + } else if ((xx[count-1] - xx[0]) == (count-1)) { + // No scaling + const uint16_t* src_data = (const uint16_t*)(srcAddr + xx[0]); + asm volatile ( + "subs %[count], %[count], #8 \n\t" // count -= 8, set flag + "blt 2f \n\t" // if count < 0, branch to label 2 + "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use. + "1: \n\t" // 8 loop + // Handle 8 pixels in one loop. + + "vld1.u16 {q0}, [%[src_data]]! \n\t" // load eight src 565 pixels + + "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2 + "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3 + "vmov.u16 q1, q8 \n\t" // copy alpha from q8 + "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1 + "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1 + "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2 + "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2 + "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2 + "vzip.16 q2, q1 \n\t" // interleave q1 and q2 + "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst + "subs %[count], %[count], #8 \n\t" // count -= 8, set flag + "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q1 to dst + + "bge 1b \n\t" // loop if count >= 0 + "2: \n\t" // exit of 8 loop + + "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed. + "blt 3f \n\t" // if count < 0, branch to label 3 + + // Handle 4 pixels at once + + "vld1.u16 {d0}, [%[src_data]]! \n\t" // load eight src 565 pixels + + "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2 + "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1 + "vmov.u16 d3, d16 \n\t" // copy alpha from d16 + "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3 + "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3 + "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2 + "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2 + "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2 + "vzip.16 d2, d3 \n\t" // interleave d2 and d3 + "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst + + "3: \n\t" // end + : [src_data] "+r" (src_data), [colors] "+r" (colors), [count] "+r" (count) + : + : "cc", "memory","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17" + ); + + for (i = (count & 3); i > 0; --i) { + *colors++ = SkPixel16ToPixel32(*src_data++); + } + + } else { + // Scaling case + uint16_t data[8]; + + asm volatile ( + "subs %[count], %[count], #8 \n\t" // count -= 8, set flag + "blt 2f \n\t" // if count < 0, branch to label 2 + "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use. + "1: \n\t" // 8 loop + // Handle 8 pixels in one loop. + "ldmia %[xx]!, {r4, r5, r6, r7} \n\t" // load ptrs to pixels 0-7 + + "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer + "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer + "mov r6, r6, lsl #1 \n\t" // <<1 because of 16 bit pointer + "mov r7, r7, lsl #1 \n\t" // <<1 because of 16 bit pointer + + "uxth r8, r4 \n\t" // extract ptr 0 + "mov r4, r4, lsr #16 \n\t" // extract ptr 1 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image + "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image + "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register + + "uxth r8, r5 \n\t" // extract ptr 2 + "mov r5, r5, lsr #16 \n\t" // extract ptr 3 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image + "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image + "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register + + "uxth r8, r6 \n\t" // extract ptr 4 + "mov r6, r6, lsr #16 \n\t" // extract ptr 5 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image + "ldrh r6, [%[srcAddr], r6] \n\t" // load pixel 5 from image + "pkhbt r6, r8, r6, lsl #16 \n\t" // combine pixel 4 and 5 in one register + + "uxth r8, r7 \n\t" // extract ptr 6 + "mov r7, r7, lsr #16 \n\t" // extract ptr 7 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 6 from image + "ldrh r7, [%[srcAddr], r7] \n\t" // load pixel 7 from image + "pkhbt r7, r8, r7, lsl #16 \n\t" // combine pixel 6 and 7 in one register + + "stmia %[data], {r4, r5, r6, r7} \n\t" // store 8 src pixels + + "vld1.u16 {q0}, [%[data]] \n\t" // load eight src 565 pixels + + "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2 + "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3 + "vmov.u16 q1, q8 \n\t" // copy alpha from q8 + "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1 + "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1 + "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2 + "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2 + "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2 + "vzip.16 q2, q1 \n\t" // interleave q1 and q2 + "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst + "subs %[count], %[count], #8 \n\t" // count -= 8, set flag + "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q2 to dst + + "bge 1b \n\t" // loop if count >= 0 + "2: \n\t" // exit of 8 loop + + "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed. + "blt 3f \n\t" // if count < 0, branch to label 3 + + // Handle 4 pixels at once + "ldmia %[xx]!, {r4, r5} \n\t" // load ptrs to pixels 0-3 + + "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer + "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer + + "uxth r8, r4 \n\t" // extract ptr 0 + "mov r4, r4, lsr #16 \n\t" // extract ptr 1 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image + "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image + "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register + + "uxth r8, r5 \n\t" // extract ptr 2 + "mov r5, r5, lsr #16 \n\t" // extract ptr 3 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image + "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image + "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register + + "stmia %[data], {r4, r5} \n\t" // store 4 src pixels + + "vld1.u16 {d0}, [%[data]] \n\t" // load eight src 565 pixels + + "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2 + "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1 + "vmov.u16 d3, d16 \n\t" // copy alpha from d16 + "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3 + "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3 + "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2 + "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2 + "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2 + "vzip.16 d2, d3 \n\t" // interleave d2 and d3 + "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst + + "3: \n\t" // End + : [xx] "+r" (xx), [colors] "+r" (colors), [count] "+r" (count) + : [data] "r" (data), [srcAddr] "r" (srcAddr) + : "cc", "memory","r4","r5","r6","r7","r8","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17" + ); + + for (i = (count & 3); i > 0; --i) { + src = srcAddr[*xx++]; *colors++ = SkPixel16ToPixel32(src); + } + } +} +#endif + + /////////////////////////////////////////////////////////////////////////////// /* If we replace a sampleproc, then we null-out the associated shaderproc, otherwise the shader won't even look at the matrix/sampler */ + + void SkBitmapProcState::platformProcs() { bool doFilter = fDoFilter; bool isOpaque = 256 == fAlphaScale; @@ -214,6 +409,15 @@ void SkBitmapProcState::platformProcs() { } #endif break; + case SkBitmap::kRGB_565_Config: +#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) + if (justDx && !doFilter) { + if (isOpaque) { + fSampleProc32 = S16_opaque_D32_nofilter_DX_neon_asm; + } + } +#endif + break; default: break; } |