Add optimized S16_opaque_D32_nofilter_DX blitter.

Adds a NEON-optimized version of the Skia sample blitter S16_opaque_D32_nofilter_DX for the unscaled case. Used when scrolling UI lists. Change-Id: I7a212f2e50aba3244aab237937395af02abe81a9 Signed-off-by: Henrik Smiding <henrik.smiding@stericsson.com> Signed-off-by: Patrik Ryd <patrik.ryd@stericsson.com>
author: Henrik Smiding <henrik.smiding@stericsson.com> 2013-04-24 14:51:49 +0200
committer: Evan McClain <aeroevan@gmail.com> 2013-05-26 09:23:38 -0400
commit: ef1ba5711ebf9d65f83bdd2372e7637ea524c3a8 (patch)
tree: 9c7692e8095961aa340419ded70d2df514a88b13
parent: 0f1f6b46ef441d15bcc77825bb6316ff4dc4338e (diff)
download: external_skia-ef1ba5711ebf9d65f83bdd2372e7637ea524c3a8.zip
external_skia-ef1ba5711ebf9d65f83bdd2372e7637ea524c3a8.tar.gz
external_skia-ef1ba5711ebf9d65f83bdd2372e7637ea524c3a8.tar.bz2
1 files changed, 123 insertions, 0 deletions
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp
index 3a5694f..552e97c 100644
--- a/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -914,6 +914,121 @@ void Clamp_S32_opaque_D32_nofilter_DX_shaderproc(const SkBitmapProcState& s, int
         fx += dx;
     }
 }
+
+
+// Samples one row of an opaque RGB565 bitmap into 32-bit pixels without filtering.
+// xy[0] selects the source row; the 16-bit x indices packed after it select the
+// source pixel for each of the 'count' outputs written to 'colors'. A consecutive
+// run of x indices (unscaled) is converted with NEON, other cases with scalar code.
+void S16_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
+                                    const uint32_t* SK_RESTRICT xy,
+                                    int count, uint32_t* SK_RESTRICT colors) {
+    const uint16_t* SK_RESTRICT srcAddr = (const uint16_t*)s.fBitmap->getPixels();
+    srcAddr = (const uint16_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes());
+    const uint16_t* SK_RESTRICT xx = (const uint16_t*)(++xy);
+
+    if (1 == s.fBitmap->width()) {
+        sk_memset32(colors, SkPixel16ToPixel32(srcAddr[0]), count);
+    } else if ((xx[count - 1] - xx[0]) == (count - 1)) {
+        // No scaling: first and last x index differ by exactly count - 1.
+        // NOTE(review): intermediate steps are not checked - confirm callers guarantee a unit stride.
+        const uint16_t* src_data = (const uint16_t*)(srcAddr + xx[0]);
+        asm volatile (
+            "pld            [%[src_data]]                   \n\t"   // Pre-load source
+            // Set alpha before the short-count branch: the 4-pixel loop at 3: reads d16 as well.
+            "vmov.u16       q8, #0xFF00                     \n\t"   // Load alpha value for later use
+            "subs           %[count], %[count], #24         \n\t"   // Do we have at least 24 pixels for main loop?
+            "blt            2f                              \n\t"   //
+            "vld1.16        {q0}, [%[src_data]]!            \n\t"   // Load eight RGB565 pixels
+            "pld            [%[src_data]]                   \n\t"   // Pre-load source
+            "pld            [%[src_data], #32]              \n\t"   // Pre-load source
+            "1:                                             \n\t"
+            // Handle 16 pixels in main loop.
+            "vshl.u16       q1, q0, #5                      \n\t"   // Split green into q1
+            "vshl.u16       q3, q0, #11                     \n\t"   // Split blue into q3
+            "vmov.u16       q2, q8                          \n\t"   // Copy alpha to q2
+            "vld1.16        {q9}, [%[src_data]]!            \n\t"   // Pre-load next eight RGB565 pixels
+            "vsri.u16       q1, q1, #6                      \n\t"   // Insert two high bits green as LSBs green in q1
+            "vsri.u16       q2, q3, #8                      \n\t"   // Merge blue with alpha in q2
+            "vsri.u16       q1, q0, #8                      \n\t"   // Merge red with green in q1
+            "vsri.u16       q2, q3, #13                     \n\t"   // Merge three high bits blue as LSBs blue in q2
+            "vsri.u16       q1, q0, #13                     \n\t"   // Merge three high bits red as LSBs red in q1
+            "subs           %[count], %[count], #16         \n\t"   // Decrease count
+            "vst2.16        {q1 - q2}, [%[colors]]!         \n\t"   // Store q1-q2 to destination
+            "vshl.u16       q10, q9, #5                     \n\t"   // Split green into q10
+            "vshl.u16       q12, q9, #11                    \n\t"   // Split blue into q12
+            "vmov.u16       q11, q8                         \n\t"   // Copy alpha to q11
+            "vld1.16        {q0}, [%[src_data]]!            \n\t"   // Pre-load next eight RGB565 pixels
+            "vsri.u16       q10, q10, #6                    \n\t"   // Insert two high bits green as LSBs green in q10
+            "vsri.u16       q11, q12, #8                    \n\t"   // Merge blue with alpha in q11
+            "vsri.u16       q10, q9, #8                     \n\t"   // Merge red with green in q10
+            "pld            [%[src_data], #32]              \n\t"   // Pre-load source
+            "vsri.u16       q11, q12, #13                   \n\t"   // Merge three high bits blue as LSBs blue in q11
+            "vsri.u16       q10, q9, #13                    \n\t"   // Merge three high bits red as LSBs red in q10
+            "vst2.16        {q10 - q11}, [%[colors]]!       \n\t"   // Store q10-q11 to destination
+            "bge            1b                              \n\t"   // Loop if count >= 0
+            // Handle last 8 pixels from main loop.
+            "vshl.u16       q1, q0, #5                      \n\t"   // Split green into q1
+            "vshl.u16       q3, q0, #11                     \n\t"   // Split blue into q3
+            "vmov.u16       q2, q8                          \n\t"   // Copy alpha to q2
+            "vsri.u16       q1, q1, #6                      \n\t"   // Insert two high bits green as LSBs green in q1
+            "vsri.u16       q2, q3, #8                      \n\t"   // Merge blue with alpha in q2
+            "vsri.u16       q1, q0, #8                      \n\t"   // Merge red with green in q1
+            "vsri.u16       q2, q3, #13                     \n\t"   // Merge three high bits blue as LSBs blue in q2
+            "vsri.u16       q1, q0, #13                     \n\t"   // Merge three high bits red as LSBs red in q1
+            "sub            %[count], %[count], #8          \n\t"   // Decrease count
+            "vst2.16        {q1 - q2}, [%[colors]]!         \n\t"   // Store q1-q2 to destination
+            "2:                                             \n\t"
+            "adds           %[count], %[count], #20         \n\t"   // Add 20 (24 - 4) to see if a 4 loop is needed
+            "blt            4f                              \n\t"   //
+            // Handle 4 pixels in slow loop
+            "3:                                             \n\t"
+            "vld1.u16       {d0}, [%[src_data]]!            \n\t"   // Load four RGB565 pixels
+            "vshl.u16       d2, d0, #5                      \n\t"   // Split green into d2
+            "vshl.u16       d1, d0, #11                     \n\t"   // Split blue into d1
+            "vmov.u16       d3, d16                         \n\t"   // Copy alpha to d3
+            "subs           %[count], %[count], #4          \n\t"   // Decrease count
+            "vsri.u16       d2, d2, #6                      \n\t"   // Insert two high bits green as LSBs green in d2
+            "vsri.u16       d3, d1, #8                      \n\t"   // Merge blue with alpha in d3
+            "vsri.u16       d2, d0, #8                      \n\t"   // Merge red with green in d2
+            "vsri.u16       d3, d1, #13                     \n\t"   // Merge three high bits blue as LSBs blue in d3
+            "vsri.u16       d2, d0, #13                     \n\t"   // Merge three high bits red as LSBs red in d2
+            "vst2.16        {d2, d3}, [%[colors]]!          \n\t"   // Store d2 and d3 to destination
+            "bge            3b                              \n\t"   //
+            "4:                                             \n\t"
+            "add            %[count], %[count], #4          \n\t"   // Add 4 to count
+            : [src_data] "+r" (src_data), [colors] "+r" (colors), [count] "+r" (count)
+            :
+            : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25"
+            );
+
+        // The asm leaves the number of still-unconverted pixels in count; finish them in C.
+        for (int i = (count & 3); i > 0; --i) {
+            *colors++ = SkPixel16ToPixel32(*src_data++);
+        }
+    } else {
+        // Scaling case
+        for (int i = (count >> 2); i > 0; --i) {
+            uint32_t xx0 = *xy++;
+            uint32_t xx1 = *xy++;
+            uint16_t x0 = srcAddr[UNPACK_PRIMARY_SHORT(xx0)];
+            uint16_t x1 = srcAddr[UNPACK_SECONDARY_SHORT(xx0)];
+            uint16_t x2 = srcAddr[UNPACK_PRIMARY_SHORT(xx1)];
+            uint16_t x3 = srcAddr[UNPACK_SECONDARY_SHORT(xx1)];
+
+            *colors++ = SkPixel16ToPixel32(x0);
+            *colors++ = SkPixel16ToPixel32(x1);
+            *colors++ = SkPixel16ToPixel32(x2);
+            *colors++ = SkPixel16ToPixel32(x3);
+        }
+        // Leftover (count & 3) indices are read one 16-bit value at a time.
+        const uint16_t* SK_RESTRICT tail = (const uint16_t*)(xy);
+        for (int i = (count & 3); i > 0; --i) {
+            SkASSERT(*tail < (unsigned)s.fBitmap->width());
+            *colors++ = SkPixel16ToPixel32(srcAddr[*tail++]);
+        }
+    }
+}
 #endif
 
 
@@ -955,6 +1070,14 @@ void SkBitmapProcState::platformProcs() {
             }
 #endif
             break;
+        case SkBitmap::kRGB_565_Config:
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+            if (justDx && !doFilter && isOpaque) {
+                fSampleProc32 = S16_opaque_D32_nofilter_DX_arm;
+                fShaderProc32 = NULL;
+            }
+#endif
+            break;
         case SkBitmap::kARGB_8888_Config:
 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
             if (S32_opaque_D32_filter_DX == fSampleProc32 && clamp_clamp) {
author	Henrik Smiding <henrik.smiding@stericsson.com>	2013-04-24 14:51:49 +0200
committer	Evan McClain <aeroevan@gmail.com>	2013-05-26 09:23:38 -0400
commit	ef1ba5711ebf9d65f83bdd2372e7637ea524c3a8 (patch)
tree	9c7692e8095961aa340419ded70d2df514a88b13
parent	0f1f6b46ef441d15bcc77825bb6316ff4dc4338e (diff)
download	external_skia-ef1ba5711ebf9d65f83bdd2372e7637ea524c3a8.zip external_skia-ef1ba5711ebf9d65f83bdd2372e7637ea524c3a8.tar.gz external_skia-ef1ba5711ebf9d65f83bdd2372e7637ea524c3a8.tar.bz2