From c3a04a0580f84bf26b099607eee8b6ee40bc1143 Mon Sep 17 00:00:00 2001 From: Evan McClain Date: Sun, 26 May 2013 09:22:54 -0400 Subject: Revert "Neon optimized implementation of S16_opaque_D32_nofilter_DX" This reverts commit a9a4c66d163e245628ddde71dca40e4c29a44f47. --- src/opts/SkBitmapProcState_opts_arm.cpp | 204 -------------------------------- 1 file changed, 204 deletions(-) diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp index f7b89a9..20d62e1 100644 --- a/src/opts/SkBitmapProcState_opts_arm.cpp +++ b/src/opts/SkBitmapProcState_opts_arm.cpp @@ -11,11 +11,6 @@ #include "SkColorPriv.h" #include "SkUtils.h" -#if defined(__ARM_HAVE_NEON) -#include -#endif - - #if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) void SI8_D16_nofilter_DX_arm( const SkBitmapProcState& s, @@ -189,201 +184,11 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s, } #endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) - -#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) -void S16_opaque_D32_nofilter_DX_neon_asm(const SkBitmapProcState& s, - const uint32_t* __restrict__ xy, - int count, uint32_t* __restrict__ colors) { - - const uint16_t* __restrict__ srcAddr = (const uint16_t*)s.fBitmap->getPixels(); - - uint16_t* index; - uint16_t src; - int i; - - srcAddr = (const uint16_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes()); - - const uint16_t* __restrict__ xx = (const uint16_t*)(++xy); - - if (1 == s.fBitmap->width()) { - - src = srcAddr[0]; - uint32_t dstValue = SkPixel16ToPixel32(src); - sk_memset32(colors, dstValue, count); - } else if ((xx[count-1] - xx[0]) == (count-1)) { - // No scaling - const uint16_t* src_data = (const uint16_t*)(srcAddr + xx[0]); - asm volatile ( - "subs %[count], %[count], #8 \n\t" // count -= 8, set flag - "blt 2f \n\t" // if count < 0, branch to label 2 - "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use. - "1: \n\t" // 8 loop - // Handle 8 pixels in one loop. - - "vld1.u16 {q0}, [%[src_data]]! \n\t" // load eight src 565 pixels - - "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2 - "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3 - "vmov.u16 q1, q8 \n\t" // copy alpha from q8 - "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1 - "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1 - "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2 - "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2 - "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2 - "vzip.16 q2, q1 \n\t" // interleave q1 and q2 - "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst - "subs %[count], %[count], #8 \n\t" // count -= 8, set flag - "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q1 to dst - - "bge 1b \n\t" // loop if count >= 0 - "2: \n\t" // exit of 8 loop - - "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed. - "blt 3f \n\t" // if count < 0, branch to label 3 - - // Handle 4 pixels at once - - "vld1.u16 {d0}, [%[src_data]]! \n\t" // load eight src 565 pixels - - "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2 - "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1 - "vmov.u16 d3, d16 \n\t" // copy alpha from d16 - "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3 - "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3 - "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2 - "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2 - "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2 - "vzip.16 d2, d3 \n\t" // interleave d2 and d3 - "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst - - "3: \n\t" // end - : [src_data] "+r" (src_data), [colors] "+r" (colors), [count] "+r" (count) - : - : "cc", "memory","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17" - ); - - for (i = (count & 3); i > 0; --i) { - *colors++ = SkPixel16ToPixel32(*src_data++); - } - - } else { - // Scaling case - uint16_t data[8]; - - asm volatile ( - "subs %[count], %[count], #8 \n\t" // count -= 8, set flag - "blt 2f \n\t" // if count < 0, branch to label 2 - "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use. - "1: \n\t" // 8 loop - // Handle 8 pixels in one loop. - "ldmia %[xx]!, {r4, r5, r6, r7} \n\t" // load ptrs to pixels 0-7 - - "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer - "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer - "mov r6, r6, lsl #1 \n\t" // <<1 because of 16 bit pointer - "mov r7, r7, lsl #1 \n\t" // <<1 because of 16 bit pointer - - "uxth r8, r4 \n\t" // extract ptr 0 - "mov r4, r4, lsr #16 \n\t" // extract ptr 1 - "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image - "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image - "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register - - "uxth r8, r5 \n\t" // extract ptr 2 - "mov r5, r5, lsr #16 \n\t" // extract ptr 3 - "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image - "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image - "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register - - "uxth r8, r6 \n\t" // extract ptr 4 - "mov r6, r6, lsr #16 \n\t" // extract ptr 5 - "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image - "ldrh r6, [%[srcAddr], r6] \n\t" // load pixel 5 from image - "pkhbt r6, r8, r6, lsl #16 \n\t" // combine pixel 4 and 5 in one register - - "uxth r8, r7 \n\t" // extract ptr 6 - "mov r7, r7, lsr #16 \n\t" // extract ptr 7 - "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 6 from image - "ldrh r7, [%[srcAddr], r7] \n\t" // load pixel 7 from image - "pkhbt r7, r8, r7, lsl #16 \n\t" // combine pixel 6 and 7 in one register - - "stmia %[data], {r4, r5, r6, r7} \n\t" // store 8 src pixels - - "vld1.u16 {q0}, [%[data]] \n\t" // load eight src 565 pixels - - "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2 - "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3 - "vmov.u16 q1, q8 \n\t" // copy alpha from q8 - "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1 - "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1 - "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2 - "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2 - "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2 - "vzip.16 q2, q1 \n\t" // interleave q1 and q2 - "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst - "subs %[count], %[count], #8 \n\t" // count -= 8, set flag - "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q2 to dst - - "bge 1b \n\t" // loop if count >= 0 - "2: \n\t" // exit of 8 loop - - "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed. - "blt 3f \n\t" // if count < 0, branch to label 3 - - // Handle 4 pixels at once - "ldmia %[xx]!, {r4, r5} \n\t" // load ptrs to pixels 0-3 - - "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer - "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer - - "uxth r8, r4 \n\t" // extract ptr 0 - "mov r4, r4, lsr #16 \n\t" // extract ptr 1 - "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image - "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image - "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register - - "uxth r8, r5 \n\t" // extract ptr 2 - "mov r5, r5, lsr #16 \n\t" // extract ptr 3 - "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image - "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image - "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register - - "stmia %[data], {r4, r5} \n\t" // store 4 src pixels - - "vld1.u16 {d0}, [%[data]] \n\t" // load eight src 565 pixels - - "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2 - "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1 - "vmov.u16 d3, d16 \n\t" // copy alpha from d16 - "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3 - "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3 - "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2 - "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2 - "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2 - "vzip.16 d2, d3 \n\t" // interleave d2 and d3 - "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst - - "3: \n\t" // End - : [xx] "+r" (xx), [colors] "+r" (colors), [count] "+r" (count) - : [data] "r" (data), [srcAddr] "r" (srcAddr) - : "cc", "memory","r4","r5","r6","r7","r8","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17" - ); - - for (i = (count & 3); i > 0; --i) { - src = srcAddr[*xx++]; *colors++ = SkPixel16ToPixel32(src); - } - } -} -#endif - - /////////////////////////////////////////////////////////////////////////////// /* If we replace a sampleproc, then we null-out the associated shaderproc, otherwise the shader won't even look at the matrix/sampler */ - - void SkBitmapProcState::platformProcs() { bool doFilter = fDoFilter; bool isOpaque = 256 == fAlphaScale; @@ -409,15 +214,6 @@ void SkBitmapProcState::platformProcs() { } #endif break; - case SkBitmap::kRGB_565_Config: -#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) - if (justDx && !doFilter) { - if (isOpaque) { - fSampleProc32 = S16_opaque_D32_nofilter_DX_neon_asm; - } - } -#endif - break; default: break; } -- cgit v1.1