diff options
-rw-r--r-- | Android.mk | 4 | ||||
-rw-r--r-- | src/core/SkBlitter_RGB16.cpp | 12 | ||||
-rw-r--r-- | src/core/asm/SkBlitter_RGB16_NEON.S | 171 |
3 files changed, 186 insertions, 1 deletions
@@ -266,6 +266,10 @@ ifeq ($(ARCH_ARM_HAVE_NEON),true) LOCAL_SRC_FILES += \ src/opts/memset16_neon.S \ src/opts/memset32_neon.S + +LOCAL_CFLAGS += -DNEON_BLITANTIH +LOCAL_SRC_FILES += \ + src/core/asm/SkBlitter_RGB16_NEON.S endif LOCAL_SRC_FILES += \ diff --git a/src/core/SkBlitter_RGB16.cpp b/src/core/SkBlitter_RGB16.cpp index 8a4d454..c3fb3cf 100644 --- a/src/core/SkBlitter_RGB16.cpp +++ b/src/core/SkBlitter_RGB16.cpp @@ -579,13 +579,22 @@ void SkRGB16_Blitter::blitH(int x, int y, int width) { blend32_16_row(fSrcColor32, device, width); } +#ifdef NEON_BLITANTIH +extern "C" void blitAntiH_NEON(const SkAlpha* SK_RESTRICT antialias, + uint16_t * SK_RESTRICT device, + const int16_t* SK_RESTRICT runs, + uint32_t srcExpanded, unsigned scale); +#endif + void SkRGB16_Blitter::blitAntiH(int x, int y, const SkAlpha* SK_RESTRICT antialias, const int16_t* SK_RESTRICT runs) { uint16_t* SK_RESTRICT device = fDevice.getAddr16(x, y); uint32_t srcExpanded = fExpandedRaw16; unsigned scale = fScale; - +#ifdef NEON_BLITANTIH + blitAntiH_NEON(antialias, device, runs, srcExpanded, scale); +#else // TODO: respect fDoDither for (;;) { int count = runs[0]; @@ -609,6 +618,7 @@ void SkRGB16_Blitter::blitAntiH(int x, int y, } device += count; } +#endif } static inline void blend_8_pixels(U8CPU bw, uint16_t dst[], unsigned dst_scale, diff --git a/src/core/asm/SkBlitter_RGB16_NEON.S b/src/core/asm/SkBlitter_RGB16_NEON.S new file mode 100644 index 0000000..715887f --- /dev/null +++ b/src/core/asm/SkBlitter_RGB16_NEON.S @@ -0,0 +1,171 @@ + .arch armv7-a + .text + .global blitAntiH_NEON + .type blitAntiH_NEON, %function +blitAntiH_NEON: + + .fnstart + + @r0 antialias + @r1 device + @r2 runs + @r3 srcExpanded + @r4 scale + + + @r5 count + @r6 temp + @r7 0x7E0 + @r8 ~(0x7E0) + @r9 scale5 + @r10 src32 + @r11 temp2 + @r12 temp3 + @r14 count + + @d16 = {0x07E0, 0x07E0, 0x07E0, 0x07E0} + @d17 = {0xF81F, 0XF81F, 0XF81F, 0XF81F} + @q12 = d16 + @q13 = d17 + + + stmfd sp!, {r4-r12,r14} + mov r14, r13 + add r14, #40 + ldr r4, [r14] + + mov r5, #0x7E + vdup.16 d16, r5 + vshl.u16 d16, #4 + vmov.u16 d17, d16 + + vmvn.u16 d18, d17 + vmov.u16 d19, d18 + + vmovl.u16 q12, d16 + vmvn.u32 q13, q12 + +default: + ldrh r5, [r2] @r5 = runs[0] + cmp r5, #0 @if (r5 <=0) + bls end_prog @return + + add r2, r5, lsl #1 @ runs += (count*2) + ldrb r6, [r0] @ r6 = antialias[0] + add r0, r5 @ antialias += count + cmp r6, #0 @ if(r6 == 0) + addeq r1, r5, lsl #1 @ device += (count*2) + beq default @ goto default + + add r6, #1 @ antialias[0]++ + mul r7, r6, r4 @ antialias[0] * scale = r7 + + lsr r8, r7, #11 @ SkAlpha255To256(aa) * scale >> 11 + mul r10, r3, r8 @ r10 = scale5 * srcExpanded + rsb r9, r8, #32 @ r9 = 32 - scale5 + vdup.32 q10, r9 @ q10 = scale5 + vdup.32 q11, r10 @ q11 = src32 + + mov r7, #0x7E + lsl r7, #4 @ r7 = 0x7E0 + mvn r8, r7 @ r8 = ~(0x7E0) + + mov r14, r5, lsr #3 + cmp r14, #0 + beq start_fourbytes + +eightbytes: + vld1.16 {q0}, [r1] @device load + vand.u16 q1, q0, q8 @ c & 0X7E0 + vand.u16 q2, q0, q9 @ c &~(0x7E0) + + vmovl.u16 q3, d2 + vmovl.u16 q4, d4 @ conversion from 16bits to 32bits + vshl.u32 q5, q3, #16 @ q5 = (c & 0x7E0) << 16 + vorr.u32 q6, q4, q5 @ q6 = (c & 0x7E0) << 16 | (c & ~(0x7E0)) + vmul.u32 q7, q6, q10 @ dst32(q7) = scale5 * SkExpand_rgb_16(*device) + vadd.u32 q14, q7, q11 @ q14 = dst32 + src32 + vshr.u32 q15, q14, #5 @ q15 = (src32 + dst32) >> 5 + vshr.u32 q5, q15, #16 @ q5 = ( (src32 + dst32) >> 5 ) >> 16 + vand.u32 q6, q5, q12 @ q6 = (c >> 16) & (0x7E0) + vand.u32 q7, q15, q13 @ q7 = (c & ~(0x7E0)) + vorr.u32 q14, q6, q7 @ q14 = ((c >> 16) & 0x7E0) | (c & ~(0x7E0)) + vmovn.u32 d0, q14 + + vmovl.u16 q3, d3 + vmovl.u16 q4, d5 @ conversion from 16bits to 32bits + vshl.u32 q5, q3, #16 @ q5 = (c & 0x7E0) << 16 + vorr.u32 q6, q4, q5 @ q6 = (c & 0x7E0) << 16 | (c & ~(0x7E0)) + vmul.u32 q7, q6, q10 @ dst32(q7) = scale5 * SkExpand_rgb_16(*device) + vadd.u32 q14, q7, q11 @ q14 = dst32 + src32 + vshr.u32 q15, q14, #5 @ q15 = (src32 + dst32) >> 5 + vshr.u32 q5, q15, #16 @ q5 = ( (src32 + dst32) >> 5 ) >> 16 + vand.u32 q6, q5, q12 @ q6 = (c >> 16) & (0x7E0) + vand.u32 q7, q15, q13 @ q7 = (c & ~(0x7E0)) + vorr.u32 q14, q6, q7 @ q14 = ((c >> 16) & 0x7E0) | (c & ~(0x7E0)) + vmovn.u32 d1, q14 + + vst1.16 {q0}, [r1] + + add r1, #16 @ device++ + subs r14, r14, #1 + bhi eightbytes + +start_fourbytes: + and r14, r5, #4 + cmp r14, #0 + beq start_onebyte + +fourbytes: + vld1.16 d0, [r1] @ device load + vand.u16 d1, d0, d16 @ c & 0x7E0 + vand.u16 d2, d0, d18 @ c & ~(0x7E0) + vmovl.u16 q3, d1 + vmovl.u16 q4, d2 @ conversion from 16bits to 32bits + + vshl.u32 q5, q3, #16 @ q5 = (c & 0x7E0) << 16 + vorr.u32 q6, q4, q5 @ q6 = (c & 0x7E0) << 16 | (c & ~(0x7E0)) + vmul.u32 q7, q6, q10 @ dst32(q7) = scale5 * SkExpand_rgb_16(*device) + vadd.u32 q3, q11, q7 @ q3 = dst32 + src32 + vshr.u32 q1, q3, #5 @ q1 = (src32 + dst32) >> 5 + vshr.u32 q2, q1, #16 @ q2 = ( (src32 + dst32) >> 5 ) >> 16 + + vand.u32 q3, q2, q12 @ q3 = (c >> 16) & (0x7E0) + vand.u32 q4, q1, q13 @ q4 = (c & ~(0x7E0)) + + vorr.u32 q5, q4, q3 @ q6 = ((c >> 16) & 0x7E0) | (c & ~(0x7E0)) + vmovn.u32 d0, q5 + vst1.16 d0, [r1] + add r1, #8 + + +start_onebyte: + and r14, r5, #3 + cmp r14, #0 + beq default +onebyte: + ldrh r6, [r1] @ device load + and r5, r6, r7 @ r5 = c & 0x7E0 + and r11, r6, r8 @ r11 = c & (~0x7E0) + orr r12, r11, r5, lsl #16 @ r12 = (c & (~0x7E0) ) | ((c & 0x7E0) << 16) + + mul r6, r9, r12 @dst32(r5) = scale5 * SkExpand_rgb_16(*device) + + add r5, r6, r10 @ src32 + dst32 + lsr r6, r5, #5 @ (src32 + dst32) >> 5 + and r12, r7, r6, lsr #16 @ (c>>16) & 2016 + and r11, r8, r6 @ (c & ~(2016)) + orr r5, r11, r12 @ (c & ~(2016) | ( (c>>16) & 2016) + + strh r5, [r1] @ *device = r5 + add r1, #2 @ device++ + subs r14, r14, #1 + bhi onebyte + b default + + +end_prog: + ldmfd sp!, {r4-r12, r14} + mov pc, lr + +.fnend |