1 files changed, 213 insertions, 0 deletions
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp
index 415bd0c..ce75fea 100644
--- a/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -566,6 +566,216 @@ void Clamp_SI8_opaque_D32_filter_DX_shaderproc(const SkBitmapProcState& s, int x
 
     s.fBitmap->getColorTable()->unlockColors(false);
 }
+
+
+void Repeat_SI8_opaque_D32_filter_DX_shaderproc(const SkBitmapProcState& s, int x, int y,
+                                                uint32_t* SK_RESTRICT colors, int count) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+    SkASSERT(s.fInvKy == 0);
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fDoFilter);
+    SkDEBUGCODE(SkASSERT(s.fBitmap->config() == SkBitmap::kIndex8_Config);)
+
+    const unsigned maxX = s.fBitmap->width() - 1;
+    const SkFixed oneX = s.fFilterOneX;
+    const SkFixed dx = s.fInvSx;
+    SkFixed fx;
+    const uint8_t* SK_RESTRICT row0;
+    const uint8_t* SK_RESTRICT row1;
+    unsigned subY;
+
+    {
+        SkPoint pt;
+        s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                   SkIntToScalar(y) + SK_ScalarHalf, &pt);
+        SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
+        const unsigned maxY = s.fBitmap->height() - 1;
+        // compute our two Y values up front
+        subY = (((fy & 0xFFFF) * (maxY + 1) >> 12) & 0xF);
+        int y0 = ((fy & 0xFFFF) * (maxY + 1) >> 16);
+        int y1 = (((fy + s.fFilterOneY) & 0xFFFF) * (maxY + 1) >> 16);
+
+        const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
+        unsigned rb = s.fBitmap->rowBytes();
+        row0 = (const uint8_t*)(srcAddr + y0 * rb);
+        row1 = (const uint8_t*)(srcAddr + y1 * rb);
+        // now initialize fx
+        fx = SkScalarToFixed(pt.fX) - (oneX >> 1);
+    }
+
+    const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
+
+    do {
+        // Check if we can do the next four pixels using ARM NEON asm
+        if (count >= 4) {
+            int asm_count = count >> 2;
+            unsigned maxX1 = (unsigned)(maxX + 1);
+
+            count -= asm_count << 2;
+
+            // We know that oneX is 1.0 since we are running clamped.
+            // This means that we can load both x0 and x1 offsets in one go.
+            asm volatile (
+                // Setup constants
+                "rsb            r8, %[subY], #16                \n\t"   // 16 - subY
+                "vdup.8         d30, %[subY]                    \n\t"   // Create constant for subY
+                "vdup.8         d31, r8                         \n\t"   // Create constant for 16 - subY
+                "vmov.u16       d29, #16                        \n\t"   // Create constant for 16
+                "1:                                             \n\t"   // Loop start
+                // Pre-load pixel #1
+                "add            r7, %[fx], %[oneX]              \n\t"   // Start calculate x1 (fx + oneX)
+                "uxth           r7, r7                          \n\t"   // (fx + oneX) & 0xFFFF
+                "mul            r7, %[maxX1], r7                \n\t"   // Multiply with maxX + 1
+                "lsr            r7, r7, #16                     \n\t"   // Shift by 16 to get x1
+                "ldrb           r8, [%[row0], r7]               \n\t"   // Fetch row0[x1] color table offsets
+                "ldrb           r7, [%[row1], r7]               \n\t"   // Fetch row1[x1] color table offsets
+                "uxth           r6, %[fx]                       \n\t"   // Start calculate x0/subX (fx & 0xFFFF)
+                "mul            r6, %[maxX1], r6                \n\t"   // Multiply with maxX + 1
+                "add            r8, %[table], r8, lsl #2        \n\t"   // Calculate a01 address for table
+                "add            r7, %[table], r7, lsl #2        \n\t"   // Calculate a11 address for table
+                "vld1.32        {d0[1]}, [r8]                   \n\t"   // Load a01 RGBA pixel from table
+                "vld1.32        {d1[1]}, [r7]                   \n\t"   // Load a11 RGBA pixel from table
+                "lsr            r8, r6, #16                     \n\t"   // Shift by 16 to get x0
+                "ldrb           r7, [%[row0], r8]               \n\t"   // Fetch row0[x0] color table offsets
+                "ldrb           r8, [%[row1], r8]               \n\t"   // Fetch row1[x0] color table offsets
+                "subs           %[cnt], %[cnt], #1              \n\t"   // Decrement loop counter
+                "add            r7, %[table], r7, lsl #2        \n\t"   // Calculate a00 address for table
+                "add            r8, %[table], r8, lsl #2        \n\t"   // Calculate a10 address for table
+                "vld1.32        {d0[0]}, [r7]                   \n\t"   // Load a00 RGBA pixel from table
+                "vld1.32        {d1[0]}, [r8]                   \n\t"   // Load a10 RGBA pixel from table
+                // Calculate pixel #1 and pre-load #2
+                "lsr            r6, r6, #12                     \n\t"   // Calculate subX
+                "and            r6, r6, #0xF                    \n\t"   // ((fx & 0xFFFF) * (maxX1) >> 12) & 0xF
+                "add            %[fx], %[fx], %[dx]             \n\t"   // Move fx to next position
+                "vdup.16        d28, r6                         \n\t"   // subX
+                "add            r7, %[fx], %[oneX]              \n\t"   // Start calculate x1 (fx + oneX)
+                "uxth           r7, r7                          \n\t"   // (fx + oneX) & 0xFFFF
+                "mul            r7, %[maxX1], r7                \n\t"   // Multiply with maxX + 1
+                "lsr            r7, r7, #16                     \n\t"   // Shift by 16 to get x1
+                "ldrb           r8, [%[row0], r7]               \n\t"   // Fetch row0[x1] color table offsets
+                "ldrb           r7, [%[row1], r7]               \n\t"   // Fetch row1[x1] color table offsets
+                "vmull.u8       q1, d0, d31                     \n\t"   // q0 = [a00|a01] * (16 - y)
+                "vmull.u8       q2, d1, d30                     \n\t"   // q1 = [a10|a11] * y
+                "uxth           r6, %[fx]                       \n\t"   // Start calculate x0/subX (fx & 0xFFFF)
+                "mul            r6, %[maxX1], r6                \n\t"   // Multiply with maxX + 1
+                "add            r8, %[table], r8, lsl #2        \n\t"   // Calculate a01 address for table
+                "add            r7, %[table], r7, lsl #2        \n\t"   // Calculate a11 address for table
+                "vld1.32        {d0[1]}, [r8]                   \n\t"   // Load a01 RGBA pixel from table
+                "vld1.32        {d1[1]}, [r7]                   \n\t"   // Load a11 RGBA pixel from table
+                "lsr            r8, r6, #16                     \n\t"   // Shift by 16 to get x0
+                "ldrb           r7, [%[row0], r8]               \n\t"   // Fetch row0[x0] color table offsets
+                "ldrb           r8, [%[row1], r8]               \n\t"   // Fetch row1[x0] color table offsets
+                "vmul.i16       d16, d3, d28                    \n\t"   // d16  = a01 * x
+                "vmla.i16       d16, d5, d28                    \n\t"   // d16 += a11 * x
+                "add            r7, %[table], r7, lsl #2        \n\t"   // Calculate a00 address for table
+                "add            r8, %[table], r8, lsl #2        \n\t"   // Calculate a10 address for table
+                "vld1.32        {d0[0]}, [r7]                   \n\t"   // Load a00 RGBA pixel from table
+                "vld1.32        {d1[0]}, [r8]                   \n\t"   // Load a10 RGBA pixel from table
+                "vsub.i16       d27, d29, d28                   \n\t"   // 16 - subX
+                "vmla.i16       d16, d2, d27                    \n\t"   // d16 += a00 * (16 - x)
+                "vmla.i16       d16, d4, d27                    \n\t"   // d16 += a10 * (16 - x)
+                // Calculate pixel #2 and pre-load #3
+                "lsr            r6, r6, #12                     \n\t"   // Calculate subX
+                "and            r6, r6, #0xF                    \n\t"   // ((fx & 0xFFFF) * (maxX1) >> 12) & 0xF
+                "add            %[fx], %[fx], %[dx]             \n\t"   // Move fx to next position
+                "vdup.16        d28, r6                         \n\t"   // subX
+                "add            r7, %[fx], %[oneX]              \n\t"   // Start calculate x1 (fx + oneX)
+                "uxth           r7, r7                          \n\t"   // (fx + oneX) & 0xFFFF
+                "mul            r7, %[maxX1], r7                \n\t"   // Multiply with maxX + 1
+                "lsr            r7, r7, #16                     \n\t"   // Shift by 16 to get x1
+                "ldrb           r8, [%[row0], r7]               \n\t"   // Fetch row0[x1] color table offsets
+                "ldrb           r7, [%[row1], r7]               \n\t"   // Fetch row1[x1] color table offsets
+                "vmull.u8       q1, d0, d31                     \n\t"   // q0 = [a00|a01] * (16 - y)
+                "vmull.u8       q2, d1, d30                     \n\t"   // q1 = [a10|a11] * y
+                "uxth           r6, %[fx]                       \n\t"   // Start calculate x0/subX (fx & 0xFFFF)
+                "mul            r6, %[maxX1], r6                \n\t"   // Multiply with maxX + 1
+                "add            r8, %[table], r8, lsl #2        \n\t"   // Calculate a01 address for table
+                "add            r7, %[table], r7, lsl #2        \n\t"   // Calculate a11 address for table
+                "vld1.32        {d0[1]}, [r8]                   \n\t"   // Load a01 RGBA pixel from table
+                "vld1.32        {d1[1]}, [r7]                   \n\t"   // Load a11 RGBA pixel from table
+                "lsr            r8, r6, #16                     \n\t"   // Shift by 16 to get x0
+                "ldrb           r7, [%[row0], r8]               \n\t"   // Fetch row0[x0] color table offsets
+                "ldrb           r8, [%[row1], r8]               \n\t"   // Fetch row1[x0] color table offsets
+                "vmul.i16       d17, d3, d28                    \n\t"   // d17  = a01 * x
+                "vmla.i16       d17, d5, d28                    \n\t"   // d17 += a11 * x
+                "add            r7, %[table], r7, lsl #2        \n\t"   // Calculate a00 address for table
+                "add            r8, %[table], r8, lsl #2        \n\t"   // Calculate a10 address for table
+                "vld1.32        {d0[0]}, [r7]                   \n\t"   // Load a00 RGBA pixel from table
+                "vld1.32        {d1[0]}, [r8]                   \n\t"   // Load a10 RGBA pixel from table
+                "vsub.i16       d27, d29, d28                   \n\t"   // 16 - subX
+                "vmla.i16       d17, d2, d27                    \n\t"   // d17 += a00 * (16 - x)
+                "vmla.i16       d17, d4, d27                    \n\t"   // d17 += a10 * (16 - x)
+                // Calculate pixel #3 and pre-load #4
+                "lsr            r6, r6, #12                     \n\t"   // Calculate subX
+                "and            r6, r6, #0xF                    \n\t"   // ((fx & 0xFFFF) * (maxX1) >> 12) & 0xF
+                "add            %[fx], %[fx], %[dx]             \n\t"   // Move fx to next position
+                "vdup.16        d28, r6                         \n\t"   // subX
+                "vshrn.i16      d16, q8, #8                     \n\t"   // shift down result by 8
+                "add            r7, %[fx], %[oneX]              \n\t"   // Start calculate x1 (fx + oneX)
+                "uxth           r7, r7                          \n\t"   // (fx + oneX) & 0xFFFF
+                "mul            r7, %[maxX1], r7                \n\t"   // Multiply with maxX + 1
+                "lsr            r7, r7, #16                     \n\t"   // Shift by 16 to get x1
+                "ldrb           r8, [%[row0], r7]               \n\t"   // Fetch row0[x1] color table offsets
+                "ldrb           r7, [%[row1], r7]               \n\t"   // Fetch row1[x1] color table offsets
+                "vmull.u8       q1, d0, d31                     \n\t"   // q0 = [a00|a01] * (16 - y)
+                "vmull.u8       q2, d1, d30                     \n\t"   // q1 = [a10|a11] * y
+                "uxth           r6, %[fx]                       \n\t"   // Start calculate x0/subX (fx & 0xFFFF)
+                "mul            r6, %[maxX1], r6                \n\t"   // Multiply with maxX + 1
+                "add            r8, %[table], r8, lsl #2        \n\t"   // Calculate a01 address for table
+                "add            r7, %[table], r7, lsl #2        \n\t"   // Calculate a11 address for table
+                "vld1.32        {d0[1]}, [r8]                   \n\t"   // Load a01 RGBA pixel from table
+                "vld1.32        {d1[1]}, [r7]                   \n\t"   // Load a11 RGBA pixel from table
+                "lsr            r8, r6, #16                     \n\t"   // Shift by 16 to get x0
+                "ldrb           r7, [%[row0], r8]               \n\t"   // Fetch row0[x0] color table offsets
+                "ldrb           r8, [%[row1], r8]               \n\t"   // Fetch row1[x0] color table offsets
+                "vmul.i16       d18, d3, d28                    \n\t"   // d18  = a01 * x
+                "vmla.i16       d18, d5, d28                    \n\t"   // d18 += a11 * x
+                "add            r7, %[table], r7, lsl #2        \n\t"   // Calculate a00 address for table
+                "add            r8, %[table], r8, lsl #2        \n\t"   // Calculate a10 address for table
+                "vld1.32        {d0[0]}, [r7]                   \n\t"   // Load a00 RGBA pixel from table
+                "vld1.32        {d1[0]}, [r8]                   \n\t"   // Load a10 RGBA pixel from table
+                "vsub.i16       d27, d29, d28                   \n\t"   // 16 - subX
+                "vmla.i16       d18, d2, d27                    \n\t"   // d18 += a00 * (16 - x)
+                "vmla.i16       d18, d4, d27                    \n\t"   // d18 += a10 * (16 - x)
+                // Calculate pixel #4
+                "vmull.u8       q1, d0, d31                     \n\t"   // q0 = [a00|a01] * (16 - y)
+                "vmull.u8       q2, d1, d30                     \n\t"   // q1 = [a10|a11] * y
+                "lsr            r6, r6, #12                     \n\t"   // Calculate subX
+                "and            r6, r6, #0xF                    \n\t"   // ((fx & 0xFFFF) * (maxX1) >> 12) & 0xF
+                "add            %[fx], %[fx], %[dx]             \n\t"   // Move fx to next position
+                "vdup.16        d28, r6                         \n\t"   // subX
+                "vmul.i16       d19, d3, d28                    \n\t"   // d19  = a01 * x
+                "vmla.i16       d19, d5, d28                    \n\t"   // d19 += a11 * x
+                "vsub.i16       d27, d29, d28                   \n\t"   // 16 - subX
+                "vmla.i16       d19, d2, d27                    \n\t"   // d19 += a00 * (16 - x)
+                "vmla.i16       d19, d4, d27                    \n\t"   // d19 += a10 * (16 - x)
+                "vshrn.i16      d17, q9, #8                     \n\t"   // shift down result by 8
+                "vst1.32        {d16-d17}, [%[colors]]!         \n\t"   // Write result to memory
+                "bne            1b                              \n\t"
+                : [fx] "+r" (fx), [colors] "+r" (colors), [cnt] "+r" (asm_count)
+                : [row0] "r" (row0), [row1] "r" (row1), [subY] "r" (subY), [dx] "r" (dx), [table] "r" (table), [oneX] "r" (oneX), [maxX1] "r" (maxX1)
+                : "cc", "memory", "r6", "r7", "r8", "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d27", "d28", "d29", "d30", "d31"
+                );
+        } else {
+            unsigned subX = (((fx & 0xFFFF) * (maxX + 1) >> 12) & 0xF);
+            unsigned x0 = ((fx & 0xFFFF) * (maxX + 1) >> 16);
+            unsigned x1 = (((fx + oneX) & 0xFFFF) * (maxX + 1) >> 16);
+
+            Filter_32_opaque(subX, subY,
+                        table[row0[x0]],
+                        table[row0[x1]],
+                        table[row1[x0]],
+                        table[row1[x1]],
+                        colors);
+            colors += 1;
+            fx += dx;
+            count--;
+        }
+    } while (count != 0);
+
+    s.fBitmap->getColorTable()->unlockColors(false);
+}
 #endif
 
 
@@ -587,6 +797,9 @@ void SkBitmapProcState::platformProcs() {
             if (SI8_opaque_D32_filter_DX == fSampleProc32) {
                 if (clamp_clamp) {
                     fShaderProc32 = Clamp_SI8_opaque_D32_filter_DX_shaderproc;
+                } else if ((SkShader::kRepeat_TileMode == fTileModeX) &&
+                           (SkShader::kRepeat_TileMode == fTileModeY)) {
+                    fShaderProc32 = Repeat_SI8_opaque_D32_filter_DX_shaderproc;
                 }
             } else
 #endif