From c3a04a0580f84bf26b099607eee8b6ee40bc1143 Mon Sep 17 00:00:00 2001
From: Evan McClain <aeroevan@gmail.com>
Date: Sun, 26 May 2013 09:22:54 -0400
Subject: Revert "Neon optimized implementation of S16_opaque_D32_nofilter_DX"

This reverts commit a9a4c66d163e245628ddde71dca40e4c29a44f47.
---
 src/opts/SkBitmapProcState_opts_arm.cpp | 204 --------------------------------
 1 file changed, 204 deletions(-)

diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp
index f7b89a9..20d62e1 100644
--- a/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -11,11 +11,6 @@
 #include "SkColorPriv.h"
 #include "SkUtils.h"
 
-#if defined(__ARM_HAVE_NEON)
-#include <arm_neon.h>
-#endif
-
-
 #if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
 void SI8_D16_nofilter_DX_arm(
     const SkBitmapProcState& s,
@@ -189,201 +184,11 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
 }
 #endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
 
-
-#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
-void S16_opaque_D32_nofilter_DX_neon_asm(const SkBitmapProcState& s,
-                            const uint32_t* __restrict__ xy,
-                            int count, uint32_t* __restrict__ colors) {
-
-    const uint16_t* __restrict__ srcAddr = (const uint16_t*)s.fBitmap->getPixels();
-
-    uint16_t* index;
-    uint16_t src;
-    int i;
-
-    srcAddr = (const uint16_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes());
-
-    const uint16_t* __restrict__ xx = (const uint16_t*)(++xy);
-
-    if (1 == s.fBitmap->width()) {
-
-        src = srcAddr[0];
-        uint32_t dstValue = SkPixel16ToPixel32(src);
-        sk_memset32(colors, dstValue, count);
-    } else if ((xx[count-1] - xx[0]) == (count-1)) {
-        // No scaling
-        const uint16_t* src_data = (const uint16_t*)(srcAddr + xx[0]);
-        asm volatile (
-                      "subs       %[count], %[count], #8          \n\t"   // count -= 8, set flag
-                      "blt        2f                              \n\t"   // if count < 0, branch to label 2
-                      "vmov.u16   q8, #0xFF00                     \n\t"   // Load alpha value into q8 for later use.
-                      "1:                                         \n\t"   // 8 loop
-                      // Handle 8 pixels in one loop.
-
-                      "vld1.u16   {q0}, [%[src_data]]!            \n\t"   // load eight src 565 pixels
-
-                      "vshl.u16   q2, q0, #5                      \n\t"   // put green in the 6 high bits of q2
-                      "vshl.u16   q3, q0, #11                     \n\t"   // put blue in the 5 high bits of q3
-                      "vmov.u16   q1, q8                          \n\t"   // copy alpha from q8
-                      "vsri.u16   q1, q3, #8                      \n\t"   // put blue below alpha in q1
-                      "vsri.u16   q1, q3, #13                     \n\t"   // put 3 MSB blue below blue in q1
-                      "vsri.u16   q2, q2, #6                      \n\t"   // put 2 MSB green below green in q2
-                      "vsri.u16   q2, q0, #8                      \n\t"   // put red below green in q2
-                      "vsri.u16   q2, q0, #13                     \n\t"   // put 3 MSB red below red in q2
-                      "vzip.16    q2, q1                          \n\t"   // interleave q1 and q2
-                      "vst1.16    {d4, d5}, [%[colors]]!          \n\t"   // store q1 to dst
-                      "subs       %[count], %[count], #8          \n\t"   // count -= 8, set flag
-                      "vst1.16    {d2, d3}, [%[colors]]!          \n\t"   // store q1 to dst
-
-                      "bge        1b                              \n\t"   // loop if count >= 0
-                      "2:                                         \n\t"   // exit of 8 loop
-
-                      "adds       %[count], %[count], #4          \n\t"   // add 4 to count to see if a 4 loop is needed.
-                      "blt        3f                              \n\t"   // if count < 0, branch to label 3
-
-                      // Handle 4 pixels at once
-
-                      "vld1.u16   {d0}, [%[src_data]]!            \n\t"   // load eight src 565 pixels
-
-                      "vshl.u16   d2, d0, #5                      \n\t"   // put green in the 6 high bits of d2
-                      "vshl.u16   d1, d0, #11                     \n\t"   // put blue in the 5 high bits of d1
-                      "vmov.u16   d3, d16                         \n\t"   // copy alpha from d16
-                      "vsri.u16   d3, d1, #8                      \n\t"   // put blue below alpha in d3
-                      "vsri.u16   d3, d1, #13                     \n\t"   // put 3 MSB blue below blue in d3
-                      "vsri.u16   d2, d2, #6                      \n\t"   // put 2 MSB green below green in d2
-                      "vsri.u16   d2, d0, #8                      \n\t"   // put red below green in d2
-                      "vsri.u16   d2, d0, #13                     \n\t"   // put 3 MSB red below red in d2
-                      "vzip.16    d2, d3                          \n\t"   // interleave d2 and d3
-                      "vst1.16    {d2, d3}, [%[colors]]!          \n\t"   // store d2 and d3 to dst
-
-                      "3:                                         \n\t"   // end
-                      : [src_data] "+r" (src_data), [colors] "+r" (colors), [count] "+r" (count)
-                      :
-                      : "cc", "memory","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17"
-                     );
-
-        for (i = (count & 3); i > 0; --i) {
-            *colors++ = SkPixel16ToPixel32(*src_data++);
-        }
-
-    } else {
-        // Scaling case
-        uint16_t data[8];
-
-        asm volatile (
-                      "subs       %[count], %[count], #8          \n\t"   // count -= 8, set flag
-                      "blt        2f                              \n\t"   // if count < 0, branch to label 2
-                      "vmov.u16   q8, #0xFF00                     \n\t"   // Load alpha value into q8 for later use.
-                      "1:                                         \n\t"   // 8 loop
-                      // Handle 8 pixels in one loop.
-                      "ldmia      %[xx]!, {r4, r5, r6, r7}        \n\t"   // load ptrs to pixels 0-7
-
-                      "mov        r4, r4, lsl #1                  \n\t"   // <<1 because of 16 bit pointer
-                      "mov        r5, r5, lsl #1                  \n\t"   // <<1 because of 16 bit pointer
-                      "mov        r6, r6, lsl #1                  \n\t"   // <<1 because of 16 bit pointer
-                      "mov        r7, r7, lsl #1                  \n\t"   // <<1 because of 16 bit pointer
-
-                      "uxth       r8, r4                          \n\t"   // extract ptr 0
-                      "mov        r4, r4, lsr #16                 \n\t"   // extract ptr 1
-                      "ldrh       r8, [%[srcAddr], r8]            \n\t"   // load pixel 0 from image
-                      "ldrh       r4, [%[srcAddr], r4]            \n\t"   // load pixel 1 from image
-                      "pkhbt      r4, r8, r4, lsl #16             \n\t"   // combine pixel 0 and 1 in one register
-
-                      "uxth       r8, r5                          \n\t"   // extract ptr 2
-                      "mov        r5, r5, lsr #16                 \n\t"   // extract ptr 3
-                      "ldrh       r8, [%[srcAddr], r8]            \n\t"   // load pixel 2 from image
-                      "ldrh       r5, [%[srcAddr], r5]            \n\t"   // load pixel 3 from image
-                      "pkhbt      r5, r8, r5, lsl #16             \n\t"   // combine pixel 2 and 3 in one register
-
-                      "uxth       r8, r6                          \n\t"   // extract ptr 4
-                      "mov        r6, r6, lsr #16                 \n\t"   // extract ptr 5
-                      "ldrh       r8, [%[srcAddr], r8]            \n\t"   // load pixel 4 from image
-                      "ldrh       r6, [%[srcAddr], r6]            \n\t"   // load pixel 5 from image
-                      "pkhbt      r6, r8, r6, lsl #16             \n\t"   // combine pixel 4 and 5 in one register
-
-                      "uxth       r8, r7                          \n\t"   // extract ptr 6
-                      "mov        r7, r7, lsr #16                 \n\t"   // extract ptr 7
-                      "ldrh       r8, [%[srcAddr], r8]            \n\t"   // load pixel 6 from image
-                      "ldrh       r7, [%[srcAddr], r7]            \n\t"   // load pixel 7 from image
-                      "pkhbt      r7, r8, r7, lsl #16             \n\t"   // combine pixel 6 and 7 in one register
-
-                      "stmia      %[data], {r4, r5, r6, r7}       \n\t"   // store 8 src pixels
-
-                      "vld1.u16   {q0}, [%[data]]                 \n\t"   // load eight src 565 pixels
-
-                      "vshl.u16   q2, q0, #5                      \n\t"   // put green in the 6 high bits of q2
-                      "vshl.u16   q3, q0, #11                     \n\t"   // put blue in the 5 high bits of q3
-                      "vmov.u16   q1, q8                          \n\t"   // copy alpha from q8
-                      "vsri.u16   q1, q3, #8                      \n\t"   // put blue below alpha in q1
-                      "vsri.u16   q1, q3, #13                     \n\t"   // put 3 MSB blue below blue in q1
-                      "vsri.u16   q2, q2, #6                      \n\t"   // put 2 MSB green below green in q2
-                      "vsri.u16   q2, q0, #8                      \n\t"   // put red below green in q2
-                      "vsri.u16   q2, q0, #13                     \n\t"   // put 3 MSB red below red in q2
-                      "vzip.16    q2, q1                          \n\t"   // interleave q1 and q2
-                      "vst1.16    {d4, d5}, [%[colors]]!          \n\t"   // store q1 to dst
-                      "subs       %[count], %[count], #8          \n\t"   // count -= 8, set flag
-                      "vst1.16    {d2, d3}, [%[colors]]!          \n\t"   // store q2 to dst
-
-                      "bge        1b                              \n\t"   // loop if count >= 0
-                      "2:                                         \n\t"   // exit of 8 loop
-
-                      "adds       %[count], %[count], #4          \n\t"   // add 4 to count to see if a 4 loop is needed.
-                      "blt        3f                              \n\t"   // if count < 0, branch to label 3
-
-                      // Handle 4 pixels at once
-                      "ldmia      %[xx]!, {r4, r5}                \n\t"   // load ptrs to pixels 0-3
-
-                      "mov        r4, r4, lsl #1                  \n\t"   // <<1 because of 16 bit pointer
-                      "mov        r5, r5, lsl #1                  \n\t"   // <<1 because of 16 bit pointer
-
-                      "uxth       r8, r4                          \n\t"   // extract ptr 0
-                      "mov        r4, r4, lsr #16                 \n\t"   // extract ptr 1
-                      "ldrh       r8, [%[srcAddr], r8]            \n\t"   // load pixel 0 from image
-                      "ldrh       r4, [%[srcAddr], r4]            \n\t"   // load pixel 1 from image
-                      "pkhbt      r4, r8, r4, lsl #16             \n\t"   // combine pixel 0 and 1 in one register
-
-                      "uxth       r8, r5                          \n\t"   // extract ptr 2
-                      "mov        r5, r5, lsr #16                 \n\t"   // extract ptr 3
-                      "ldrh       r8, [%[srcAddr], r8]            \n\t"   // load pixel 2 from image
-                      "ldrh       r5, [%[srcAddr], r5]            \n\t"   // load pixel 3 from image
-                      "pkhbt      r5, r8, r5, lsl #16             \n\t"   // combine pixel 2 and 3 in one register
-
-                      "stmia      %[data], {r4, r5}               \n\t"   // store 4 src pixels
-
-                      "vld1.u16   {d0}, [%[data]]                 \n\t"   // load eight src 565 pixels
-
-                      "vshl.u16   d2, d0, #5                      \n\t"   // put green in the 6 high bits of d2
-                      "vshl.u16   d1, d0, #11                     \n\t"   // put blue in the 5 high bits of d1
-                      "vmov.u16   d3, d16                         \n\t"   // copy alpha from d16
-                      "vsri.u16   d3, d1, #8                      \n\t"   // put blue below alpha in d3
-                      "vsri.u16   d3, d1, #13                     \n\t"   // put 3 MSB blue below blue in d3
-                      "vsri.u16   d2, d2, #6                      \n\t"   // put 2 MSB green below green in d2
-                      "vsri.u16   d2, d0, #8                      \n\t"   // put red below green in d2
-                      "vsri.u16   d2, d0, #13                     \n\t"   // put 3 MSB red below red in d2
-                      "vzip.16    d2, d3                          \n\t"   // interleave d2 and d3
-                      "vst1.16    {d2, d3}, [%[colors]]!          \n\t"   // store d2 and d3 to dst
-
-                      "3:                                         \n\t"   // End
-                      : [xx] "+r" (xx), [colors] "+r" (colors), [count] "+r" (count)
-                      : [data] "r" (data), [srcAddr] "r" (srcAddr)
-                      : "cc", "memory","r4","r5","r6","r7","r8","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17"
-                     );
-
-        for (i = (count & 3); i > 0; --i) {
-            src = srcAddr[*xx++]; *colors++ = SkPixel16ToPixel32(src);
-        }
-    }
-}
-#endif
-
-
 ///////////////////////////////////////////////////////////////////////////////
 
 /*  If we replace a sampleproc, then we null-out the associated shaderproc,
     otherwise the shader won't even look at the matrix/sampler
  */
-
-
 void SkBitmapProcState::platformProcs() {
     bool doFilter = fDoFilter;
     bool isOpaque = 256 == fAlphaScale;
@@ -409,15 +214,6 @@ void SkBitmapProcState::platformProcs() {
             }
 #endif
             break;
-        case SkBitmap::kRGB_565_Config:
-#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
-            if (justDx && !doFilter) {
-                if (isOpaque) {
-                    fSampleProc32 = S16_opaque_D32_nofilter_DX_neon_asm;
-                }
-            }
-#endif
-            break;
         default:
             break;
     }
-- 
cgit v1.1