diff options
author | Henrik Smiding <henrik.smiding@stericsson.com> | 2012-01-09 16:50:29 +0100 |
---|---|---|
committer | Evan McClain <aeroevan@gmail.com> | 2013-05-26 09:23:24 -0400 |
commit | 5dd8eff4587a42ce15fdc3ba30c879405e6d1401 (patch) | |
tree | 390c108abe54c119426749e7bb7b6b187c26d5ef | |
parent | 4fde5f6a57b41cb35fe54322d3d6f38defe1f490 (diff) | |
download | external_skia-5dd8eff4587a42ce15fdc3ba30c879405e6d1401.zip external_skia-5dd8eff4587a42ce15fdc3ba30c879405e6d1401.tar.gz external_skia-5dd8eff4587a42ce15fdc3ba30c879405e6d1401.tar.bz2 |
Add shader for Repeat_SI8_opaque_D32_filter_DX.
Adds a NEON optimized shader for SI8_opaque_D32_filter_DX when
running with repeat tile mode.
Improves performance of browser.
Change-Id: Ic5cc63b3df86eb7e8ea0bbb3b7bddfc73db20025
Signed-off-by: Henrik Smiding <henrik.smiding@stericsson.com>
Signed-off-by: Patrik Ryd <patrik.ryd@stericsson.com>
-rw-r--r-- | src/opts/SkBitmapProcState_opts_arm.cpp | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp index 415bd0c..ce75fea 100644 --- a/src/opts/SkBitmapProcState_opts_arm.cpp +++ b/src/opts/SkBitmapProcState_opts_arm.cpp @@ -566,6 +566,216 @@ void Clamp_SI8_opaque_D32_filter_DX_shaderproc(const SkBitmapProcState& s, int x s.fBitmap->getColorTable()->unlockColors(false); } + + +void Repeat_SI8_opaque_D32_filter_DX_shaderproc(const SkBitmapProcState& s, int x, int y, + uint32_t* SK_RESTRICT colors, int count) { + SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | + SkMatrix::kScale_Mask)) == 0); + SkASSERT(s.fInvKy == 0); + SkASSERT(count > 0 && colors != NULL); + SkASSERT(s.fDoFilter); + SkDEBUGCODE(SkASSERT(s.fBitmap->config() == SkBitmap::kIndex8_Config);) + + const unsigned maxX = s.fBitmap->width() - 1; + const SkFixed oneX = s.fFilterOneX; + const SkFixed dx = s.fInvSx; + SkFixed fx; + const uint8_t* SK_RESTRICT row0; + const uint8_t* SK_RESTRICT row1; + unsigned subY; + + { + SkPoint pt; + s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &pt); + SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); + const unsigned maxY = s.fBitmap->height() - 1; + // compute our two Y values up front + subY = (((fy & 0xFFFF) * (maxY + 1) >> 12) & 0xF); + int y0 = ((fy & 0xFFFF) * (maxY + 1) >> 16); + int y1 = (((fy + s.fFilterOneY) & 0xFFFF) * (maxY + 1) >> 16); + + const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels(); + unsigned rb = s.fBitmap->rowBytes(); + row0 = (const uint8_t*)(srcAddr + y0 * rb); + row1 = (const uint8_t*)(srcAddr + y1 * rb); + // now initialize fx + fx = SkScalarToFixed(pt.fX) - (oneX >> 1); + } + + const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors(); + + do { + // Check if we can do the next four pixels using ARM NEON asm + if (count >= 4) { + int asm_count = count >> 2; + unsigned maxX1 = (unsigned)(maxX + 1); + + count -= asm_count << 2; + + // We know that oneX is 1.0 since we are running clamped. + // This means that we can load both x0 and x1 offsets in one go. + asm volatile ( + // Setup constants + "rsb r8, %[subY], #16 \n\t" // 16 - subY + "vdup.8 d30, %[subY] \n\t" // Create constant for subY + "vdup.8 d31, r8 \n\t" // Create constant for 16 - subY + "vmov.u16 d29, #16 \n\t" // Create constant for 16 + "1: \n\t" // Loop start + // Pre-load pixel #1 + "add r7, %[fx], %[oneX] \n\t" // Start calculate x1 (fx + oneX) + "uxth r7, r7 \n\t" // (fx + oneX) & 0xFFFF + "mul r7, %[maxX1], r7 \n\t" // Multiply with maxX + 1 + "lsr r7, r7, #16 \n\t" // Shift by 16 to get x1 + "ldrb r8, [%[row0], r7] \n\t" // Fetch row0[x1] color table offsets + "ldrb r7, [%[row1], r7] \n\t" // Fetch row1[x1] color table offsets + "uxth r6, %[fx] \n\t" // Start calculate x0/subX (fx & 0xFFFF) + "mul r6, %[maxX1], r6 \n\t" // Multiply with maxX + 1 + "add r8, %[table], r8, lsl #2 \n\t" // Calculate a01 address for table + "add r7, %[table], r7, lsl #2 \n\t" // Calculate a11 address for table + "vld1.32 {d0[1]}, [r8] \n\t" // Load a01 RGBA pixel from table + "vld1.32 {d1[1]}, [r7] \n\t" // Load a11 RGBA pixel from table + "lsr r8, r6, #16 \n\t" // Shift by 16 to get x0 + "ldrb r7, [%[row0], r8] \n\t" // Fetch row0[x0] color table offsets + "ldrb r8, [%[row1], r8] \n\t" // Fetch row1[x0] color table offsets + "subs %[cnt], %[cnt], #1 \n\t" // Decrement loop counter + "add r7, %[table], r7, lsl #2 \n\t" // Calculate a00 address for table + "add r8, %[table], r8, lsl #2 \n\t" // Calculate a10 address for table + "vld1.32 {d0[0]}, [r7] \n\t" // Load a00 RGBA pixel from table + "vld1.32 {d1[0]}, [r8] \n\t" // Load a10 RGBA pixel from table + // Calculate pixel #1 and pre-load #2 + "lsr r6, r6, #12 \n\t" // Calculate subX + "and r6, r6, #0xF \n\t" // ((fx & 0xFFFF) * (maxX1) >> 12) & 0xF + "add %[fx], %[fx], %[dx] \n\t" // Move fx to next position + "vdup.16 d28, r6 \n\t" // subX + "add r7, %[fx], %[oneX] \n\t" // Start calculate x1 (fx + oneX) + "uxth r7, r7 \n\t" // (fx + oneX) & 0xFFFF + "mul r7, %[maxX1], r7 \n\t" // Multiply with maxX + 1 + "lsr r7, r7, #16 \n\t" // Shift by 16 to get x1 + "ldrb r8, [%[row0], r7] \n\t" // Fetch row0[x1] color table offsets + "ldrb r7, [%[row1], r7] \n\t" // Fetch row1[x1] color table offsets + "vmull.u8 q1, d0, d31 \n\t" // q0 = [a00|a01] * (16 - y) + "vmull.u8 q2, d1, d30 \n\t" // q1 = [a10|a11] * y + "uxth r6, %[fx] \n\t" // Start calculate x0/subX (fx & 0xFFFF) + "mul r6, %[maxX1], r6 \n\t" // Multiply with maxX + 1 + "add r8, %[table], r8, lsl #2 \n\t" // Calculate a01 address for table + "add r7, %[table], r7, lsl #2 \n\t" // Calculate a11 address for table + "vld1.32 {d0[1]}, [r8] \n\t" // Load a01 RGBA pixel from table + "vld1.32 {d1[1]}, [r7] \n\t" // Load a11 RGBA pixel from table + "lsr r8, r6, #16 \n\t" // Shift by 16 to get x0 + "ldrb r7, [%[row0], r8] \n\t" // Fetch row0[x0] color table offsets + "ldrb r8, [%[row1], r8] \n\t" // Fetch row1[x0] color table offsets + "vmul.i16 d16, d3, d28 \n\t" // d16 = a01 * x + "vmla.i16 d16, d5, d28 \n\t" // d16 += a11 * x + "add r7, %[table], r7, lsl #2 \n\t" // Calculate a00 address for table + "add r8, %[table], r8, lsl #2 \n\t" // Calculate a10 address for table + "vld1.32 {d0[0]}, [r7] \n\t" // Load a00 RGBA pixel from table + "vld1.32 {d1[0]}, [r8] \n\t" // Load a10 RGBA pixel from table + "vsub.i16 d27, d29, d28 \n\t" // 16 - subX + "vmla.i16 d16, d2, d27 \n\t" // d16 += a00 * (16 - x) + "vmla.i16 d16, d4, d27 \n\t" // d16 += a10 * (16 - x) + // Calculate pixel #2 and pre-load #3 + "lsr r6, r6, #12 \n\t" // Calculate subX + "and r6, r6, #0xF \n\t" // ((fx & 0xFFFF) * (maxX1) >> 12) & 0xF + "add %[fx], %[fx], %[dx] \n\t" // Move fx to next position + "vdup.16 d28, r6 \n\t" // subX + "add r7, %[fx], %[oneX] \n\t" // Start calculate x1 (fx + oneX) + "uxth r7, r7 \n\t" // (fx + oneX) & 0xFFFF + "mul r7, %[maxX1], r7 \n\t" // Multiply with maxX + 1 + "lsr r7, r7, #16 \n\t" // Shift by 16 to get x1 + "ldrb r8, [%[row0], r7] \n\t" // Fetch row0[x1] color table offsets + "ldrb r7, [%[row1], r7] \n\t" // Fetch row1[x1] color table offsets + "vmull.u8 q1, d0, d31 \n\t" // q0 = [a00|a01] * (16 - y) + "vmull.u8 q2, d1, d30 \n\t" // q1 = [a10|a11] * y + "uxth r6, %[fx] \n\t" // Start calculate x0/subX (fx & 0xFFFF) + "mul r6, %[maxX1], r6 \n\t" // Multiply with maxX + 1 + "add r8, %[table], r8, lsl #2 \n\t" // Calculate a01 address for table + "add r7, %[table], r7, lsl #2 \n\t" // Calculate a11 address for table + "vld1.32 {d0[1]}, [r8] \n\t" // Load a01 RGBA pixel from table + "vld1.32 {d1[1]}, [r7] \n\t" // Load a11 RGBA pixel from table + "lsr r8, r6, #16 \n\t" // Shift by 16 to get x0 + "ldrb r7, [%[row0], r8] \n\t" // Fetch row0[x0] color table offsets + "ldrb r8, [%[row1], r8] \n\t" // Fetch row1[x0] color table offsets + "vmul.i16 d17, d3, d28 \n\t" // d17 = a01 * x + "vmla.i16 d17, d5, d28 \n\t" // d17 += a11 * x + "add r7, %[table], r7, lsl #2 \n\t" // Calculate a00 address for table + "add r8, %[table], r8, lsl #2 \n\t" // Calculate a10 address for table + "vld1.32 {d0[0]}, [r7] \n\t" // Load a00 RGBA pixel from table + "vld1.32 {d1[0]}, [r8] \n\t" // Load a10 RGBA pixel from table + "vsub.i16 d27, d29, d28 \n\t" // 16 - subX + "vmla.i16 d17, d2, d27 \n\t" // d17 += a00 * (16 - x) + "vmla.i16 d17, d4, d27 \n\t" // d17 += a10 * (16 - x) + // Calculate pixel #3 and pre-load #4 + "lsr r6, r6, #12 \n\t" // Calculate subX + "and r6, r6, #0xF \n\t" // ((fx & 0xFFFF) * (maxX1) >> 12) & 0xF + "add %[fx], %[fx], %[dx] \n\t" // Move fx to next position + "vdup.16 d28, r6 \n\t" // subX + "vshrn.i16 d16, q8, #8 \n\t" // shift down result by 8 + "add r7, %[fx], %[oneX] \n\t" // Start calculate x1 (fx + oneX) + "uxth r7, r7 \n\t" // (fx + oneX) & 0xFFFF + "mul r7, %[maxX1], r7 \n\t" // Multiply with maxX + 1 + "lsr r7, r7, #16 \n\t" // Shift by 16 to get x1 + "ldrb r8, [%[row0], r7] \n\t" // Fetch row0[x1] color table offsets + "ldrb r7, [%[row1], r7] \n\t" // Fetch row1[x1] color table offsets + "vmull.u8 q1, d0, d31 \n\t" // q0 = [a00|a01] * (16 - y) + "vmull.u8 q2, d1, d30 \n\t" // q1 = [a10|a11] * y + "uxth r6, %[fx] \n\t" // Start calculate x0/subX (fx & 0xFFFF) + "mul r6, %[maxX1], r6 \n\t" // Multiply with maxX + 1 + "add r8, %[table], r8, lsl #2 \n\t" // Calculate a01 address for table + "add r7, %[table], r7, lsl #2 \n\t" // Calculate a11 address for table + "vld1.32 {d0[1]}, [r8] \n\t" // Load a01 RGBA pixel from table + "vld1.32 {d1[1]}, [r7] \n\t" // Load a11 RGBA pixel from table + "lsr r8, r6, #16 \n\t" // Shift by 16 to get x0 + "ldrb r7, [%[row0], r8] \n\t" // Fetch row0[x0] color table offsets + "ldrb r8, [%[row1], r8] \n\t" // Fetch row1[x0] color table offsets + "vmul.i16 d18, d3, d28 \n\t" // d18 = a01 * x + "vmla.i16 d18, d5, d28 \n\t" // d18 += a11 * x + "add r7, %[table], r7, lsl #2 \n\t" // Calculate a00 address for table + "add r8, %[table], r8, lsl #2 \n\t" // Calculate a10 address for table + "vld1.32 {d0[0]}, [r7] \n\t" // Load a00 RGBA pixel from table + "vld1.32 {d1[0]}, [r8] \n\t" // Load a10 RGBA pixel from table + "vsub.i16 d27, d29, d28 \n\t" // 16 - subX + "vmla.i16 d18, d2, d27 \n\t" // d18 += a00 * (16 - x) + "vmla.i16 d18, d4, d27 \n\t" // d18 += a10 * (16 - x) + // Calculate pixel #4 + "vmull.u8 q1, d0, d31 \n\t" // q0 = [a00|a01] * (16 - y) + "vmull.u8 q2, d1, d30 \n\t" // q1 = [a10|a11] * y + "lsr r6, r6, #12 \n\t" // Calculate subX + "and r6, r6, #0xF \n\t" // ((fx & 0xFFFF) * (maxX1) >> 12) & 0xF + "add %[fx], %[fx], %[dx] \n\t" // Move fx to next position + "vdup.16 d28, r6 \n\t" // subX + "vmul.i16 d19, d3, d28 \n\t" // d19 = a01 * x + "vmla.i16 d19, d5, d28 \n\t" // d19 += a11 * x + "vsub.i16 d27, d29, d28 \n\t" // 16 - subX + "vmla.i16 d19, d2, d27 \n\t" // d19 += a00 * (16 - x) + "vmla.i16 d19, d4, d27 \n\t" // d19 += a10 * (16 - x) + "vshrn.i16 d17, q9, #8 \n\t" // shift down result by 8 + "vst1.32 {d16-d17}, [%[colors]]! \n\t" // Write result to memory + "bne 1b \n\t" + : [fx] "+r" (fx), [colors] "+r" (colors), [cnt] "+r" (asm_count) + : [row0] "r" (row0), [row1] "r" (row1), [subY] "r" (subY), [dx] "r" (dx), [table] "r" (table), [oneX] "r" (oneX), [maxX1] "r" (maxX1) + : "cc", "memory", "r6", "r7", "r8", "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d27", "d28", "d29", "d30", "d31" + ); + } else { + unsigned subX = (((fx & 0xFFFF) * (maxX + 1) >> 12) & 0xF); + unsigned x0 = ((fx & 0xFFFF) * (maxX + 1) >> 16); + unsigned x1 = (((fx + oneX) & 0xFFFF) * (maxX + 1) >> 16); + + Filter_32_opaque(subX, subY, + table[row0[x0]], + table[row0[x1]], + table[row1[x0]], + table[row1[x1]], + colors); + colors += 1; + fx += dx; + count--; + } + } while (count != 0); + + s.fBitmap->getColorTable()->unlockColors(false); +} #endif @@ -587,6 +797,9 @@ void SkBitmapProcState::platformProcs() { if (SI8_opaque_D32_filter_DX == fSampleProc32) { if (clamp_clamp) { fShaderProc32 = Clamp_SI8_opaque_D32_filter_DX_shaderproc; + } else if ((SkShader::kRepeat_TileMode == fTileModeX) && + (SkShader::kRepeat_TileMode == fTileModeY)) { + fShaderProc32 = Repeat_SI8_opaque_D32_filter_DX_shaderproc; } } else #endif |