aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLee Hwangjae <hj-yo.lee@samsung.com>2012-08-17 12:20:24 +0900
committerSteve Kondik <shade@chemlab.org>2012-12-17 19:32:41 -0800
commit6818083c672be1ac756ad46e682b33126e3bfd5f (patch)
tree857676225fec2054033acabfa427a94b015912bf
parenta0a3a4d3f7e368311d896eaca0a22732a81d37fb (diff)
downloadexternal_skia-6818083c672be1ac756ad46e682b33126e3bfd5f.zip
external_skia-6818083c672be1ac756ad46e682b33126e3bfd5f.tar.gz
external_skia-6818083c672be1ac756ad46e682b33126e3bfd5f.tar.bz2
Support NEON anti-aliasing blit function in SKIA
The original function repeats the same calculations and processes individual pixels only. This change reduces the function call overhead. In contrast to the serial data processing in the existing code, the NEON code processes 8 or 4 pixels in parallel. Change-Id: Ib54aeb6bacfd4ed6d257324e8146b211f90b68e9 Signed-off-by: Lee Hwangjae <hj-yo.lee@samsung.com>
-rw-r--r--Android.mk4
-rw-r--r--src/core/SkBlitter_RGB16.cpp12
-rw-r--r--src/core/asm/SkBlitter_RGB16_NEON.S171
3 files changed, 186 insertions, 1 deletions
diff --git a/Android.mk b/Android.mk
index 84ce841..26361ba 100644
--- a/Android.mk
+++ b/Android.mk
@@ -266,6 +266,10 @@ ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_SRC_FILES += \
src/opts/memset16_neon.S \
src/opts/memset32_neon.S
+
+LOCAL_CFLAGS += -DNEON_BLITANTIH
+LOCAL_SRC_FILES += \
+ src/core/asm/SkBlitter_RGB16_NEON.S
endif
LOCAL_SRC_FILES += \
diff --git a/src/core/SkBlitter_RGB16.cpp b/src/core/SkBlitter_RGB16.cpp
index 8a4d454..c3fb3cf 100644
--- a/src/core/SkBlitter_RGB16.cpp
+++ b/src/core/SkBlitter_RGB16.cpp
@@ -579,13 +579,22 @@ void SkRGB16_Blitter::blitH(int x, int y, int width) {
blend32_16_row(fSrcColor32, device, width);
}
+#ifdef NEON_BLITANTIH
+extern "C" void blitAntiH_NEON(const SkAlpha* SK_RESTRICT antialias,
+ uint16_t * SK_RESTRICT device,
+ const int16_t* SK_RESTRICT runs,
+ uint32_t srcExpanded, unsigned scale);
+#endif
+
void SkRGB16_Blitter::blitAntiH(int x, int y,
const SkAlpha* SK_RESTRICT antialias,
const int16_t* SK_RESTRICT runs) {
uint16_t* SK_RESTRICT device = fDevice.getAddr16(x, y);
uint32_t srcExpanded = fExpandedRaw16;
unsigned scale = fScale;
-
+#ifdef NEON_BLITANTIH
+ blitAntiH_NEON(antialias, device, runs, srcExpanded, scale);
+#else
// TODO: respect fDoDither
for (;;) {
int count = runs[0];
@@ -609,6 +618,7 @@ void SkRGB16_Blitter::blitAntiH(int x, int y,
}
device += count;
}
+#endif
}
static inline void blend_8_pixels(U8CPU bw, uint16_t dst[], unsigned dst_scale,
diff --git a/src/core/asm/SkBlitter_RGB16_NEON.S b/src/core/asm/SkBlitter_RGB16_NEON.S
new file mode 100644
index 0000000..715887f
--- /dev/null
+++ b/src/core/asm/SkBlitter_RGB16_NEON.S
@@ -0,0 +1,171 @@
+@ ----------------------------------------------------------------------------
+@ void blitAntiH_NEON(const SkAlpha* antialias, uint16_t* device,
+@                     const int16_t* runs, uint32_t srcExpanded,
+@                     unsigned scale);
+@
+@ NEON version of the run loop of SkRGB16_Blitter::blitAntiH.  For each run
+@ (count = runs[0], aa = antialias[0]) with aa != 0 it blends count RGB565
+@ pixels at device:
+@
+@     scale5  = ((aa + 1) * scale) >> 11
+@     src32   = srcExpanded * scale5
+@     dst32   = SkExpand_rgb_16(*device) * (32 - scale5)
+@     *device = SkCompact_rgb_16((src32 + dst32) >> 5)
+@
+@ Runs with aa == 0 are skipped.  A run length <= 0 ends the row, as in the
+@ C loop this replaces.  Each run is processed eight pixels at a time, then
+@ one optional group of four, then up to three single pixels.  Only whole
+@ groups are loaded and stored, so no pixel outside the run is touched.
+@
+@ device needs only the natural 2-byte alignment of uint16_t: the vector
+@ loads and stores carry no alignment qualifier.
+@
+@ Syntax:  GNU as, ARM (A32) state.  ABI: AAPCS.
+@ In:      r0 = antialias, r1 = device, r2 = runs, r3 = srcExpanded,
+@          [sp] = scale (fifth argument, passed on the stack)
+@ Out:     nothing
+@ Clobber: r0-r2, q0-q3, q8-q14, flags (all caller-saved)
+@ Stack:   40 bytes for r4-r12 and lr; r12 is pushed only to keep sp
+@          8-byte aligned
+@
+@ Register roles:
+@   r4  scale                    r9   32 - scale5
+@   r5  count of current run     r10  src32
+@   r6  scratch                  r11  scratch
+@   r7  0x07E0 (green in place)  r14  loop counter
+@   r8  ~0x07E0
+@   q8  0x07E0 per 16-bit lane   q12  0x000007E0 per 32-bit lane
+@   q9  0xF81F per 16-bit lane   q13  0xFFFFF81F per 32-bit lane
+@   q10 32 - scale5 per lane     q11  src32 per lane
+@   q0-q3, q14 scratch
+@
+@ q4-q7 (d8-d15) are callee-saved under AAPCS; they are deliberately left
+@ untouched so that no NEON state has to be spilled.
+@ ----------------------------------------------------------------------------
+
+        .arch   armv7-a
+        .fpu    neon
+        .text
+        .arm
+        .align  2
+
+        @ Green field of an RGB565 pixel.
+        .equ    GREEN_MASK, 0x07E0
+        @ Offset of the stacked scale argument from sp after the prologue push.
+        .equ    SCALE_ARG_OFFSET, 40
+
+@ BLEND4 gsrc, rbsrc, dst
+@   Blend four pixels.  gsrc holds (pixel & 0x07E0) and rbsrc holds
+@   (pixel & 0xF81F), four 16-bit lanes each; the four blended pixels are
+@   written to dst.  Uses q3 and q14 as scratch and expects q10-q13 to be
+@   set up as listed in the header.  dst may alias the register the pixels
+@   were loaded from, because gsrc and rbsrc are consumed first.
+        .macro  BLEND4 gsrc, rbsrc, dst
+        vmovl.u16   q3, \gsrc               @ green, one per 32-bit lane
+        vmovl.u16   q14, \rbsrc             @ red/blue, one per 32-bit lane
+        vshl.i32    q3, q3, #16
+        vorr        q3, q3, q14             @ SkExpand_rgb_16(pixel)
+        vmul.i32    q3, q3, q10             @ dst32 = expanded * (32 - scale5)
+        vadd.i32    q3, q3, q11             @ src32 + dst32
+        vshr.u32    q3, q3, #5              @ c = (src32 + dst32) >> 5
+        vshr.u32    q14, q3, #16
+        vand        q14, q14, q12           @ (c >> 16) & 0x07E0
+        vand        q3, q3, q13             @ c & ~0x07E0
+        vorr        q3, q3, q14             @ SkCompact_rgb_16(c)
+        vmovn.i32   \dst, q3                @ keep the low 16 bits of each lane
+        .endm
+
+        .global blitAntiH_NEON
+        .type   blitAntiH_NEON, %function
+blitAntiH_NEON:
+        .fnstart
+        stmfd   sp!, {r4-r12, lr}
+        .save   {r4-r12, lr}
+
+        ldr     r4, [sp, #SCALE_ARG_OFFSET] @ r4 = scale
+
+        @ Constants that stay valid for the whole call.
+        mov     r7, #GREEN_MASK             @ r7  = 0x07E0
+        mvn     r8, r7                      @ r8  = ~0x07E0
+        vdup.16 q8, r7                      @ q8  = 0x07E0 in 8 lanes
+        vmvn    q9, q8                      @ q9  = 0xF81F in 8 lanes
+        vdup.32 q12, r7                     @ q12 = 0x000007E0 in 4 lanes
+        vdup.32 q13, r8                     @ q13 = 0xFFFFF81F in 4 lanes
+
+        @ Per-run loop.  runs[] holds int16_t values, so the length is loaded
+        @ sign-extended and compared signed: a negative length ends the row
+        @ instead of being read as a large unsigned count.
+.Lnext_run:
+        ldrsh   r5, [r2]                    @ count = runs[0]
+        cmp     r5, #0
+        ble     .Ldone                      @ count <= 0: end of row
+
+        ldrb    r6, [r0]                    @ aa = antialias[0]
+        add     r2, r2, r5, lsl #1          @ runs += count (2 bytes each)
+        add     r0, r0, r5                  @ antialias += count
+        cmp     r6, #0
+        addeq   r1, r1, r5, lsl #1          @ aa == 0: device += count ...
+        beq     .Lnext_run                  @ ... and go to the next run
+
+        @ Blend factors for this run, shared by the vector and scalar paths.
+        add     r6, r6, #1                  @ aa + 1
+        mul     r11, r6, r4
+        mov     r11, r11, lsr #11           @ scale5 = (aa + 1) * scale >> 11
+        mul     r10, r3, r11                @ src32 = srcExpanded * scale5
+        rsb     r9, r11, #32                @ r9 = 32 - scale5
+        vdup.32 q10, r9
+        vdup.32 q11, r10
+
+        @ Vector path: count / 8 groups of eight pixels.
+        movs    r14, r5, lsr #3
+        beq     .Lgroup_of_4
+
+.Lgroup_of_8:
+        vld1.16 {d0, d1}, [r1]              @ eight destination pixels
+        vand    q1, q0, q8                  @ d2/d3 = green fields
+        vand    q2, q0, q9                  @ d4/d5 = red/blue fields
+
+        @ Blend pixels 0-3 into d0, then pixels 4-7 into d1.
+        BLEND4  d2, d4, d0
+        BLEND4  d3, d5, d1
+        vst1.16 {d0, d1}, [r1]!             @ store, device += 8
+        subs    r14, r14, #1
+        bne     .Lgroup_of_8
+
+        @ Bit 2 of count selects one more group of four pixels.
+.Lgroup_of_4:
+        tst     r5, #4
+        beq     .Lsingles
+        vld1.16 {d0}, [r1]                  @ four destination pixels
+        vand    d2, d0, d16                 @ green fields
+        vand    d4, d0, d18                 @ red/blue fields
+        BLEND4  d2, d4, d0
+        vst1.16 {d0}, [r1]!                 @ store, device += 4
+
+        @ Scalar path for the remaining count & 3 pixels.
+.Lsingles:
+        ands    r14, r5, #3
+        beq     .Lnext_run
+
+.Lsingle:
+        ldrh    r6, [r1]                    @ c = *device
+        and     r11, r6, r7                 @ green field
+        and     r6, r6, r8                  @ red/blue fields
+        orr     r6, r6, r11, lsl #16        @ SkExpand_rgb_16(c)
+        mla     r6, r6, r9, r10             @ src32 + expanded * (32 - scale5)
+
+        mov     r6, r6, lsr #5              @ c = (src32 + dst32) >> 5
+        and     r11, r7, r6, lsr #16        @ (c >> 16) & 0x07E0
+        and     r6, r6, r8                  @ c & ~0x07E0
+        orr     r6, r6, r11                 @ SkCompact_rgb_16(c)
+
+        strh    r6, [r1], #2                @ *device++ = low 16 bits
+        subs    r14, r14, #1
+        bne     .Lsingle
+        b       .Lnext_run
+
+        @ Restore the saved registers and return by loading lr into pc.
+.Ldone:
+        ldmfd   sp!, {r4-r12, pc}
+        .fnend
+        .size   blitAntiH_NEON, .-blitAntiH_NEON