aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHenrik Smiding <henrik.smiding@stericsson.com>2012-01-09 16:46:54 +0100
committerEvan McClain <aeroevan@gmail.com>2013-05-26 09:23:15 -0400
commit4fde5f6a57b41cb35fe54322d3d6f38defe1f490 (patch)
treea098091c24833ce719733765d90dd7ed146541c0
parentd1e52fcca0df402146cac256a0d9bc0374156b6c (diff)
downloadexternal_skia-4fde5f6a57b41cb35fe54322d3d6f38defe1f490.zip
external_skia-4fde5f6a57b41cb35fe54322d3d6f38defe1f490.tar.gz
external_skia-4fde5f6a57b41cb35fe54322d3d6f38defe1f490.tar.bz2
Add shader for Clamp_SI8_opaque_D32_filter_DX.
Adds a NEON optimized shader for SI8_opaque_D32_filter_DX when running with clamped tile mode. Improves performance of browser. Change-Id: I0e1fa03d4104210953b7ec81ce10eba5f77459cc Signed-off-by: Henrik Smiding <henrik.smiding@stericsson.com> Signed-off-by: Patrik Ryd <patrik.ryd@stericsson.com>
-rw-r--r--src/core/SkBitmapProcState.h2
-rw-r--r--src/opts/SkBitmapProcState_opts_arm.cpp215
2 files changed, 217 insertions, 0 deletions
diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h
index c04992b..1b42b61 100644
--- a/src/core/SkBitmapProcState.h
+++ b/src/core/SkBitmapProcState.h
@@ -144,5 +144,7 @@ void ClampX_ClampY_filter_affine(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y);
void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y);
+void SI8_opaque_D32_filter_DX(const SkBitmapProcState& s, const uint32_t xy[],
+ int count, SkPMColor colors[]);
#endif
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp
index 63854f7..415bd0c 100644
--- a/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -358,6 +358,214 @@ void Clamp_S32_opaque_D32_filter_DX_shaderproc(const SkBitmapProcState& s, int x
}
} while (count != 0);
}
+
+
+void Clamp_SI8_opaque_D32_filter_DX_shaderproc(const SkBitmapProcState& s, int x, int y,
+ uint32_t* SK_RESTRICT colors, int count) {
+ SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+ SkMatrix::kScale_Mask)) == 0);
+ SkASSERT(s.fInvKy == 0);
+ SkASSERT(count > 0 && colors != NULL);
+ SkASSERT(s.fDoFilter);
+ SkDEBUGCODE(SkASSERT(s.fBitmap->config() == SkBitmap::kIndex8_Config);)
+
+ const unsigned maxX = s.fBitmap->width() - 1;
+ const SkFixed oneX = s.fFilterOneX;
+ const SkFixed dx = s.fInvSx;
+ SkFixed fx;
+ const uint8_t* SK_RESTRICT row0;
+ const uint8_t* SK_RESTRICT row1;
+ unsigned subY;
+
+ {
+ SkPoint pt;
+ s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+ SkIntToScalar(y) + SK_ScalarHalf, &pt);
+ SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
+ const unsigned maxY = s.fBitmap->height() - 1;
+ // compute our two Y values up front
+ subY = ((fy >> 12) & 0xF);
+ int y0 = SkClampMax(fy >> 16, maxY);
+ int y1 = SkClampMax((fy + s.fFilterOneY) >> 16, maxY);
+
+ const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
+ unsigned rb = s.fBitmap->rowBytes();
+ row0 = (const uint8_t*)(srcAddr + y0 * rb);
+ row1 = (const uint8_t*)(srcAddr + y1 * rb);
+ // now initialize fx
+ fx = SkScalarToFixed(pt.fX) - (oneX >> 1);
+ }
+
+ const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
+
+ do {
+ // Check if we can do the next four pixels using ARM NEON asm
+ if ((count >= 4) &&
+ (((dx >= 0) && (fx >= 0) && (((fx + 3 * dx) >> 16) < (const signed)maxX)) ||
+ ((dx < 0) && ((fx >> 16) < (const signed)maxX) && (((fx + 3 * dx) >> 16) >= 0)))) {
+ int asm_count;
+
+ // How many iterations can we do while still clamped?
+ if (dx >= 0) {
+ asm_count = (((((const signed)maxX - 1) << 16) - fx) / dx) >> 2;
+ } else {
+ asm_count = ((0 - fx) / dx) >> 2;
+ }
+
+ if (asm_count <= 0) {
+ asm_count = 1;
+ } else if ((asm_count << 2) > count) {
+ asm_count = count >> 2;
+ }
+
+ count -= asm_count << 2;
+
+ // We know that oneX is 1.0 since we are running clamped.
+ // This means that we can load both x0 and x1 offsets in one go.
+ asm volatile (
+ // Setup constants
+ "rsb r8, %[subY], #16 \n\t" // 16 - subY
+ "vdup.8 d30, %[subY] \n\t" // Create constant for subY
+ "vdup.8 d31, r8 \n\t" // Create constant for 16 - subY
+ "vmov.u16 d29, #16 \n\t" // Create constant for 16
+ "1: \n\t" // Loop start
+ // Pre-load pixel #1
+ "asr r7, %[fx], #16 \n\t" // Calculate offset fx >> 16
+ "ldrh r8, [%[row0], r7] \n\t" // Fetch row0 color table offsets
+ "ldrh r7, [%[row1], r7] \n\t" // Fetch row1 color table offsets
+ "subs %[cnt], %[cnt], #1 \n\t" // Decrement loop counter
+ "and r6, r8, #0xFF \n\t" // Extract first offset
+ "lsr r8, r8, #8 \n\t" // Extract second offset
+ "add r6, %[table], r6, lsl #2 \n\t" // Calculate a00 address for table
+ "add r8, %[table], r8, lsl #2 \n\t" // Calculate a01 address for table
+ "vld1.32 {d0[0]}, [r6] \n\t" // Load a00 RGBA pixel from table
+ "vld1.32 {d0[1]}, [r8] \n\t" // Load a01 RGBA pixel from table
+ "and r6, r7, #0xFF \n\t" // Extract first offset
+ "lsr r7, r7, #8 \n\t" // Extract second offset
+ "add r6, %[table], r6, lsl #2 \n\t" // Calculate a10 address for table
+ "add r7, %[table], r7, lsl #2 \n\t" // Calculate a11 address for table
+ "vld1.32 {d1[0]}, [r6] \n\t" // Load a10 RGBA pixel from table
+ "vld1.32 {d1[1]}, [r7] \n\t" // Load a11 RGBA pixel from table
+ // Calculate pixel #1 and pre-load #2
+ "lsr r8, %[fx], #12 \n\t" // Calculate subX = ((fx >> 12) & 0xF)
+ "and r8, r8, #0xF \n\t" //
+ "add %[fx], %[fx], %[dx] \n\t" // Move fx to next position
+ "vdup.16 d28, r8 \n\t" // subX
+ "asr r7, %[fx], #16 \n\t" // Calculate offset fx >> 16
+ "ldrh r8, [%[row0], r7] \n\t" // Fetch row0 color table offsets
+ "ldrh r7, [%[row1], r7] \n\t" // Fetch row1 color table offsets
+ "vmull.u8 q1, d0, d31 \n\t" // q0 = [a00|a01] * (16 - y)
+ "vmull.u8 q2, d1, d30 \n\t" // q1 = [a10|a11] * y
+ "and r6, r8, #0xFF \n\t" // Extract first offset
+ "lsr r8, r8, #8 \n\t" // Extract second offset
+ "add r6, %[table], r6, lsl #2 \n\t" // Calculate a00 address for table
+ "add r8, %[table], r8, lsl #2 \n\t" // Calculate a01 address for table
+ "vld1.32 {d0[0]}, [r6] \n\t" // Load a00 RGBA pixel from table
+ "vld1.32 {d0[1]}, [r8] \n\t" // Load a01 RGBA pixel from table
+ "vmul.i16 d16, d3, d28 \n\t" // d16 = a01 * x
+ "vmla.i16 d16, d5, d28 \n\t" // d16 += a11 * x
+ "and r6, r7, #0xFF \n\t" // Extract first offset
+ "lsr r7, r7, #8 \n\t" // Extract second offset
+ "add r6, %[table], r6, lsl #2 \n\t" // Calculate a10 address for table
+ "add r7, %[table], r7, lsl #2 \n\t" // Calculate a11 address for table
+ "vld1.32 {d1[0]}, [r6] \n\t" // Load a10 RGBA pixel from table
+ "vld1.32 {d1[1]}, [r7] \n\t" // Load a11 RGBA pixel from table
+ "vsub.i16 d27, d29, d28 \n\t" // 16 - subX
+ "vmla.i16 d16, d2, d27 \n\t" // d16 += a00 * (16 - x)
+ "vmla.i16 d16, d4, d27 \n\t" // d16 += a10 * (16 - x)
+ // Calculate pixel #2 and pre-load #3
+ "lsr r8, %[fx], #12 \n\t" // Calculate subX = ((fx >> 12) & 0xF)
+ "and r8, r8, #0xF \n\t" //
+ "add %[fx], %[fx], %[dx] \n\t" // Move fx to next position
+ "vdup.16 d28, r8 \n\t" // subX
+ "asr r7, %[fx], #16 \n\t" // Calculate offset fx >> 16
+ "ldrh r8, [%[row0], r7] \n\t" // Fetch row0 color table offsets
+ "ldrh r7, [%[row1], r7] \n\t" // Fetch row1 color table offsets
+ "vmull.u8 q1, d0, d31 \n\t" // q0 = [a00|a01] * (16 - y)
+ "vmull.u8 q2, d1, d30 \n\t" // q1 = [a10|a11] * y
+ "and r6, r8, #0xFF \n\t" // Extract first offset
+ "lsr r8, r8, #8 \n\t" // Extract second offset
+ "add r6, %[table], r6, lsl #2 \n\t" // Calculate a00 address for table
+ "add r8, %[table], r8, lsl #2 \n\t" // Calculate a01 address for table
+ "vld1.32 {d0[0]}, [r6] \n\t" // Load a00 RGBA pixel from table
+ "vld1.32 {d0[1]}, [r8] \n\t" // Load a01 RGBA pixel from table
+ "vmul.i16 d17, d3, d28 \n\t" // d17 = a01 * x
+ "vmla.i16 d17, d5, d28 \n\t" // d17 += a11 * x
+ "and r6, r7, #0xFF \n\t" // Extract first offset
+ "lsr r7, r7, #8 \n\t" // Extract second offset
+ "add r6, %[table], r6, lsl #2 \n\t" // Calculate a10 address for table
+ "add r7, %[table], r7, lsl #2 \n\t" // Calculate a11 address for table
+ "vld1.32 {d1[0]}, [r6] \n\t" // Load a10 RGBA pixel from table
+ "vld1.32 {d1[1]}, [r7] \n\t" // Load a11 RGBA pixel from table
+ "vsub.i16 d27, d29, d28 \n\t" // 16 - subX
+ "vmla.i16 d17, d2, d27 \n\t" // d17 += a00 * (16 - x)
+ "vmla.i16 d17, d4, d27 \n\t" // d17 += a10 * (16 - x)
+ // Calculate pixel #3 and pre-load #4
+ "lsr r8, %[fx], #12 \n\t" // Calculate subX = ((fx >> 12) & 0xF)
+ "and r8, r8, #0xF \n\t" //
+ "add %[fx], %[fx], %[dx] \n\t" // Move fx to next position
+ "vdup.16 d28, r8 \n\t" // subX
+ "asr r7, %[fx], #16 \n\t" // Calculate offset fx >> 16
+ "ldrh r8, [%[row0], r7] \n\t" // Fetch row0 color table offsets
+ "ldrh r7, [%[row1], r7] \n\t" // Fetch row1 color table offsets
+ "vmull.u8 q1, d0, d31 \n\t" // q0 = [a00|a01] * (16 - y)
+ "vmull.u8 q2, d1, d30 \n\t" // q1 = [a10|a11] * y
+ "and r6, r8, #0xFF \n\t" // Extract first offset
+ "lsr r8, r8, #8 \n\t" // Extract second offset
+ "add r6, %[table], r6, lsl #2 \n\t" // Calculate a00 address for table
+ "add r8, %[table], r8, lsl #2 \n\t" // Calculate a01 address for table
+ "vld1.32 {d0[0]}, [r6] \n\t" // Load a00 RGBA pixel from table
+ "vld1.32 {d0[1]}, [r8] \n\t" // Load a01 RGBA pixel from table
+ "vmul.i16 d18, d3, d28 \n\t" // d18 = a01 * x
+ "vmla.i16 d18, d5, d28 \n\t" // d18 += a11 * x
+ "and r6, r7, #0xFF \n\t" // Extract first offset
+ "lsr r7, r7, #8 \n\t" // Extract second offset
+ "add r6, %[table], r6, lsl #2 \n\t" // Calculate a10 address for table
+ "add r7, %[table], r7, lsl #2 \n\t" // Calculate a11 address for table
+ "vld1.32 {d1[0]}, [r6] \n\t" // Load a10 RGBA pixel from table
+ "vld1.32 {d1[1]}, [r7] \n\t" // Load a11 RGBA pixel from table
+ "vsub.i16 d27, d29, d28 \n\t" // 16 - subX
+ "vmla.i16 d18, d2, d27 \n\t" // d18 += a00 * (16 - x)
+ "vmla.i16 d18, d4, d27 \n\t" // d18 += a10 * (16 - x)
+ "vshrn.i16 d16, q8, #8 \n\t" // shift down result by 8
+ // Calculate pixel #4
+ "vmull.u8 q1, d0, d31 \n\t" // q0 = [a00|a01] * (16 - y)
+ "vmull.u8 q2, d1, d30 \n\t" // q1 = [a10|a11] * y
+ "lsr r8, %[fx], #12 \n\t" // Calculate subX = ((fx >> 12) & 0xF)
+ "and r8, r8, #0xF \n\t" //
+ "add %[fx], %[fx], %[dx] \n\t" // Move fx to next position
+ "vdup.16 d28, r8 \n\t" // subX
+ "vmul.i16 d19, d3, d28 \n\t" // d19 = a01 * x
+ "vmla.i16 d19, d5, d28 \n\t" // d19 += a11 * x
+ "vsub.i16 d27, d29, d28 \n\t" // 16 - subX
+ "vmla.i16 d19, d2, d27 \n\t" // d19 += a00 * (16 - x)
+ "vmla.i16 d19, d4, d27 \n\t" // d19 += a10 * (16 - x)
+ "vshrn.i16 d17, q9, #8 \n\t" // shift down result by 8
+ "vst1.32 {d16-d17}, [%[colors]]! \n\t" // Write result to memory
+ "bne 1b \n\t"
+ : [fx] "+r" (fx), [colors] "+r" (colors), [cnt] "+r" (asm_count)
+ : [row0] "r" (row0), [row1] "r" (row1), [subY] "r" (subY), [dx] "r" (dx), [table] "r" (table)
+ : "cc", "memory", "r6", "r7", "r8", "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d27", "d28", "d29", "d30", "d31"
+ );
+ } else {
+ unsigned subX = ((fx >> 12) & 0xF);
+ unsigned x0 = SkClampMax(fx >> 16, maxX);
+ unsigned x1 = SkClampMax((fx + oneX) >> 16, maxX);
+
+ Filter_32_opaque(subX, subY,
+ table[row0[x0]],
+ table[row0[x1]],
+ table[row1[x0]],
+ table[row1[x1]],
+ colors);
+ colors += 1;
+ fx += dx;
+ count--;
+ }
+ } while (count != 0);
+
+ s.fBitmap->getColorTable()->unlockColors(false);
+}
#endif
@@ -375,6 +583,13 @@ void SkBitmapProcState::platformProcs() {
switch (fBitmap->config()) {
case SkBitmap::kIndex8_Config:
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+ if (SI8_opaque_D32_filter_DX == fSampleProc32) {
+ if (clamp_clamp) {
+ fShaderProc32 = Clamp_SI8_opaque_D32_filter_DX_shaderproc;
+ }
+ } else
+#endif
#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
if (justDx && !doFilter) {
#if 0 /* crashing on android device */