From 416f34f7ddf0a948d092fd11933565b1acee3fe9 Mon Sep 17 00:00:00 2001
From: Henrik Smiding
Date: Sat, 11 Aug 2012 23:28:17 +0200
Subject: Add optimization of Skia S32A_Blend blitter

Adds an optimization of the Skia S32A_Blend_BlitRow32 blitter using the
ARM NEON instruction set, with a special case for when the per-pixel
source alpha is zero. Improves performance platform-wide.

Change-Id: I6c8bf8a9525838682206ebd139855354d6b3a563
Signed-off-by: Henrik Smiding
Signed-off-by: Patrik Ryd
---
 src/opts/S32A_Blend_BlitRow32_neon.S | 262 +++++++++++++++++++++++++++++++++++
 src/opts/SkBlitRow_opts_arm.cpp      |  14 ++
 2 files changed, 276 insertions(+)
 create mode 100644 src/opts/S32A_Blend_BlitRow32_neon.S

diff --git a/src/opts/S32A_Blend_BlitRow32_neon.S b/src/opts/S32A_Blend_BlitRow32_neon.S
new file mode 100644
index 0000000..84f9846
--- /dev/null
+++ b/src/opts/S32A_Blend_BlitRow32_neon.S
@@ -0,0 +1,262 @@
+/*
+ * Copyright (C) ST-Ericsson SA 2010
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Neon optimized version of S32A_Blend_BlitRow32.
+ * Special case for when the per-pixel source alpha is zero.
+ */
+
+#if defined(__ARM_HAVE_NEON) && defined(ENABLE_OPTIMIZED_S32A_BLITTERS)
+
+    .text
+    .fpu    neon
+    .align
+
+    .global S32A_Blend_BlitRow32_neon
+    .func   S32A_Blend_BlitRow32_neon
+
+S32A_Blend_BlitRow32_neon:
+    cmp     r2, #8              // The main code requires at least 8 pixels
+    ble     BlitSmall
+
+    /* Set up constants, and do the first 1-8 pixels */
+    vld4.8  {d20-d23}, [r1]     // Load eight source RGBA pixels
+    vld4.8  {d24-d27}, [r0]     // Load eight destination RGBA pixels
+    add     r3, #1              // Modify global alpha to 1...256 range
+    vpush   {q4-q5}
+    stmdb   sp!, {r4-r5}
+    vmov.i16 q15, #256          // Set up alpha constant
+    vmov.i16 q5, #0xFF00        // Set up mask constant
+    vdup.16 q4, r3              // Set up global alpha
+    pld     [r1, #32]           // Pre-load next eight source pixels
+    pld     [r0, #32]           // Pre-load next eight destination pixels
+    ands    r3, r2, #0x7        // Should we do a partial first iteration?
+    moveq   r3, #8              // If count is a multiple of 8, do a full one
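+
+    /*
+     * The per-channel blend computed below, in 16-bit fixed point, is:
+     *   scale = alpha + 1                          (1...256)
+     *   srcA' = (srcA * scale) >> 8
+     *   out   = (((src * scale) & 0xFF00) + dst * (256 - srcA')) >> 8
+     * With premultiplied pixels (src <= srcA per channel), masking the
+     * low byte of src * scale keeps each 16-bit vmla lane below 0x10000,
+     * so the accumulation cannot overflow.
+     */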
+    vmovl.u8 q8, d20            // Expand source red to 16-bit
+    vmovl.u8 q9, d21            // Expand source green to 16-bit
+    vmovl.u8 q10, d22           // Expand source blue to 16-bit
+    vmovl.u8 q11, d23           // Expand source alpha to 16-bit
+    vmul.i16 q8, q8, q4         // Scale source red
+    vmul.i16 q11, q11, q4       // Scale source alpha
+    vand    q8, q5              // Mask low byte in red to avoid overflow in vmla
+    vmul.i16 q9, q9, q4         // Scale source green
+    vshr.u16 q0, q11, #8        // Pre-calculate inverse destination alpha (scale)
+    vmul.i16 q10, q10, q4       // Scale source blue
+    vand    q11, q5             // Mask low byte in alpha to avoid overflow in vmla
+    vand    q9, q5              // Mask low byte in green to avoid overflow in vmla
+    vand    q10, q5             // Mask low byte in blue to avoid overflow in vmla
+    vsub.i16 q14, q15, q0       // Calculate inverse destination alpha (scale)
+    vmovl.u8 q2, d24            // Expand destination red to 16-bit
+    vmovl.u8 q3, d25            // Expand destination green to 16-bit
+    vmovl.u8 q12, d26           // Expand destination blue to 16-bit
+    vmovl.u8 q13, d27           // Expand destination alpha to 16-bit
+    vmla.i16 q8, q2, q14        // Scale destination red and add to source
+    mov     r4, r0              // Back up destination pointer
+    add     r1, r3, lsl #2      // Increment source pointer
+    sub     r2, r2, r3          // Decrement loop counter
+    vmla.i16 q9, q3, q14        // Scale destination green and add to source
+    add     r0, r3, lsl #2      // Increment destination pointer
+    pld     [r1, #32]           // Pre-load next eight source pixels
+    pld     [r0, #32]           // Pre-load next eight destination pixels
+    mov     r3, r0              // Back up destination pointer
+    vmla.i16 q11, q13, q14      // Scale destination alpha and add to source
+    vld4.8  {d0-d3}, [r1]!      // Pre-load next eight source RGBA pixels
+    subs    r2, r2, #24         // Decrement loop counter
+    vmla.i16 q10, q12, q14      // Scale destination blue and add to source
+    vld4.8  {d4-d7}, [r0]!      // Pre-load next eight destination RGBA pixels
+    vshrn.i16 d24, q8, #8       // Shift and narrow red
+    vshrn.i16 d25, q9, #8       // Shift and narrow green
+    vshrn.i16 d26, q10, #8      // Shift and narrow blue
+    vshrn.i16 d27, q11, #8      // Shift and narrow alpha
+    vst4.8  {d24-d27}, [r4]     // Write result to memory
+    bmi     PostLoop            // Do we have enough pixels to enter the main loop?
+
+    /* Main loop, blitting 16 pixels per iteration */
+Loop:
+    pld     [r1, #32]           // Pre-load next eight source pixels
+    pld     [r0, #32]           // Pre-load next eight destination pixels
+    vmov    r4, r5, d3          // Move alpha to ARM for test
+    orrs    r4, r5              // Check if source alpha is fully transparent
+    beq     AllZero1            // If so, jump to special case handling
+    vmovl.u8 q8, d0             // Expand source red to 16-bit
+    vmovl.u8 q9, d1             // Expand source green to 16-bit
+    vmovl.u8 q0, d2             // Expand source blue to 16-bit
+    vmovl.u8 q1, d3             // Expand source alpha to 16-bit
+    vmul.i16 q8, q8, q4         // Scale source red
+    vmul.i16 q1, q1, q4         // Scale source alpha
+    vand    q8, q5              // Mask low byte in red to avoid overflow in vmla
+    vmul.i16 q9, q9, q4         // Scale source green
+    vshr.u16 q10, q1, #8        // Pre-calculate inverse destination alpha (scale)
+    vmul.i16 q0, q0, q4         // Scale source blue
+    vand    q1, q5              // Mask low byte in alpha to avoid overflow in vmla
+    vand    q9, q5              // Mask low byte in green to avoid overflow in vmla
+    vand    q0, q5              // Mask low byte in blue to avoid overflow in vmla
+    vsub.i16 q14, q15, q10      // Calculate inverse destination alpha (scale)
+    vmovl.u8 q12, d4            // Expand destination red to 16-bit
+    vmovl.u8 q13, d5            // Expand destination green to 16-bit
+    vmovl.u8 q2, d6             // Expand destination blue to 16-bit
+    vmovl.u8 q3, d7             // Expand destination alpha to 16-bit
+    vmla.i16 q8, q12, q14       // Scale destination red and add to source
+    vmla.i16 q9, q13, q14       // Scale destination green and add to source
+    vld4.8  {d20-d23}, [r1]!    // Pre-load next eight source RGBA pixels
+    vmla.i16 q1, q3, q14        // Scale destination alpha and add to source
+    vmla.i16 q0, q2, q14        // Scale destination blue and add to source
+    vld4.8  {d24-d27}, [r0]!    // Pre-load next eight destination RGBA pixels
+    vshrn.i16 d4, q8, #8        // Shift and narrow red
+    vshrn.i16 d5, q9, #8        // Shift and narrow green
+    vshrn.i16 d6, q0, #8        // Shift and narrow blue
+    vshrn.i16 d7, q1, #8        // Shift and narrow alpha
+    vst4.8  {d4-d7}, [r3]!      // Write result to memory
+GoBack1:
+    pld     [r1, #32]           // Pre-load next eight source pixels
+    pld     [r0, #32]           // Pre-load next eight destination pixels
+    vmov    r4, r5, d23         // Move alpha to ARM for test
+    orrs    r4, r5              // Check if source alpha is fully transparent
+    beq     AllZero2            // If so, jump to special case handling
+    vmovl.u8 q8, d20            // Expand source red to 16-bit
+    vmovl.u8 q9, d21            // Expand source green to 16-bit
+    vmovl.u8 q10, d22           // Expand source blue to 16-bit
+    vmovl.u8 q11, d23           // Expand source alpha to 16-bit
+    vmul.i16 q8, q8, q4         // Scale source red
+    subs    r2, r2, #16         // Decrement loop counter
+    vmul.i16 q11, q11, q4       // Scale source alpha
+    vand    q8, q5              // Mask low byte in red to avoid overflow in vmla
+    vmul.i16 q9, q9, q4         // Scale source green
+    vshr.u16 q0, q11, #8        // Pre-calculate inverse destination alpha (scale)
+    vmul.i16 q10, q10, q4       // Scale source blue
+    vand    q11, q5             // Mask low byte in alpha to avoid overflow in vmla
+    vand    q9, q5              // Mask low byte in green to avoid overflow in vmla
+    vand    q10, q5             // Mask low byte in blue to avoid overflow in vmla
+    vsub.i16 q14, q15, q0       // Calculate inverse destination alpha (scale)
+    vmovl.u8 q2, d24            // Expand destination red to 16-bit
+    vmovl.u8 q3, d25            // Expand destination green to 16-bit
+    vmovl.u8 q12, d26           // Expand destination blue to 16-bit
+    vmovl.u8 q13, d27           // Expand destination alpha to 16-bit
+    vmla.i16 q8, q2, q14        // Scale destination red and add to source
+    vmla.i16 q9, q3, q14        // Scale destination green and add to source
+    vmla.i16 q11, q13, q14      // Scale destination alpha and add to source
+    vld4.8  {d0-d3}, [r1]!      // Pre-load next eight source RGBA pixels
+    vmla.i16 q10, q12, q14      // Scale destination blue and add to source
+    vld4.8  {d4-d7}, [r0]!      // Pre-load next eight destination RGBA pixels
+    vshrn.i16 d24, q8, #8       // Shift and narrow red
+    vshrn.i16 d25, q9, #8       // Shift and narrow green
+    vshrn.i16 d26, q10, #8      // Shift and narrow blue
+    vshrn.i16 d27, q11, #8      // Shift and narrow alpha
+    vst4.8  {d24-d27}, [r3]!    // Write result to memory
+    bpl     Loop
+
+PostLoop:
+    add     r2, r2, #16         // Adjust loop counter for pre-loaded pixels
+    vmov.i16 q10, q4            // Copy global alpha before q4 is restored
+    ldmia   sp!, {r4-r5}        // Restore ARM registers
+    vpop    {q4-q5}             // Restore NEON registers
+
+LoopRemaining:
+    vmovl.u8 q8, d0             // Expand source red to 16-bit
+    vmovl.u8 q9, d1             // Expand source green to 16-bit
+    vmovl.u8 q0, d2             // Expand source blue to 16-bit
+    vmovl.u8 q1, d3             // Expand source alpha to 16-bit
+    vmul.i16 q8, q8, q10        // Scale source red
+    vmov.i16 q12, #0xFF00       // Set up mask constant
+    vmul.i16 q1, q1, q10        // Scale source alpha
+    vand    q8, q12             // Mask low byte in red to avoid overflow in vmla
+    vmul.i16 q9, q9, q10        // Scale source green
+    vshr.u16 q11, q1, #8        // Pre-calculate inverse destination alpha (scale)
+    vmul.i16 q0, q0, q10        // Scale source blue
+    vand    q1, q12             // Mask low byte in alpha to avoid overflow in vmla
+    vand    q9, q12             // Mask low byte in green to avoid overflow in vmla
+    vand    q0, q12             // Mask low byte in blue to avoid overflow in vmla
+    vsub.i16 q14, q15, q11      // Calculate inverse destination alpha (scale)
+    vmovl.u8 q12, d4            // Expand destination red to 16-bit
+    vmovl.u8 q13, d5            // Expand destination green to 16-bit
+    vmovl.u8 q2, d6             // Expand destination blue to 16-bit
+    vmovl.u8 q3, d7             // Expand destination alpha to 16-bit
+    vmla.i16 q8, q12, q14       // Scale destination red and add to source
+    subs    r2, r2, #8          // Decrement loop counter
+    vmla.i16 q9, q13, q14       // Scale destination green and add to source
+    vmla.i16 q1, q3, q14        // Scale destination alpha and add to source
+    vmla.i16 q0, q2, q14        // Scale destination blue and add to source
+    vshrn.i16 d4, q8, #8        // Shift and narrow red
+    vshrn.i16 d5, q9, #8        // Shift and narrow green
+    vshrn.i16 d6, q0, #8        // Shift and narrow blue
+    vshrn.i16 d7, q1, #8        // Shift and narrow alpha
+    vst4.8  {d4-d7}, [r3]!      // Write result to memory
+    bxmi    lr                  // Return if no pixels are left
+    vld4.8  {d0-d3}, [r1]       // Load eight source RGBA pixels
+    vld4.8  {d4-d7}, [r0]       // Load eight destination RGBA pixels
+    b       LoopRemaining
+
+AllZero1:
+    vld4.8  {d20-d23}, [r1]!    // Pre-load next eight source RGBA pixels
+    vld4.8  {d24-d27}, [r0]!    // Pre-load next eight destination RGBA pixels
+    add     r3, r3, #32         // Advance destination write pointer
+    b       GoBack1
+
+AllZero2:
+    vld4.8  {d0-d3}, [r1]!      // Pre-load next eight source RGBA pixels
+    vld4.8  {d4-d7}, [r0]!      // Pre-load next eight destination RGBA pixels
+    add     r3, r3, #32         // Advance destination write pointer
+    subs    r2, r2, #16         // Decrement loop counter
+    bpl     Loop
+    b       PostLoop
+
+/* Handle small blits, 0-8 pixels */
+BlitSmall:
+    beq     Blit8
+    pld     [r1, #0]            // Pre-load eight source pixels
+    pld     [r0, #0]            // Pre-load eight destination pixels
+    add     r3, #1              // Modify global alpha to 1...256 range
+    vdup.16 q13, r3             // Set up global alpha
+    ldr     r3, =AlphaIndex
+    vmov.i16 q15, #256          // Set up alpha constant
+    vld1.8  {d29}, [r3]         // Set up alpha index table
+    vmov.i16 q12, #0xFF00       // Set up mask constant
+    cmp     r2, #1              // Check remaining pixel count
+    beq     Blit1               // Exactly one pixel left
+    bxlt    lr                  // Zero pixels left
+
+    /* Loop for NEON two-pixel code */
+Blit2:
+    vld1.32 {d0}, [r1]!         // Load two source RGBA pixels
+    vld1.32 {d1}, [r0]          // Load two destination RGBA pixels
+    sub     r2, r2, #2          // Decrement width counter
+    vmovl.u8 q8, d0             // Expand source to 16-bit
+    vmul.i16 q8, q8, q13        // Scale source pixels
+    vmovl.u8 q3, d1             // Expand destination to 16-bit
+    vtbl.8  d2, {d16, d17}, d29 // Spread out alpha to match pixel format
+    vand    q8, q12             // Mask low byte to avoid overflow in vmla
+    vsubw.u8 q2, q15, d2        // Calculate inverse alpha (scale)
+    vmla.i16 q8, q3, q2         // Scale destination pixels and add to source
+    vshrn.i16 d0, q8, #8        // Shift and narrow result
+    vst1.32 {d0}, [r0]!         // Store two RGBA pixels
+    cmp     r2, #1              // Check remaining pixel count
+    bhi     Blit2               // Still two or more pixels left
+    bxlt    lr                  // Zero pixels left
+
+    /* Code to handle any one last pixel */
+Blit1:
+    vld1.32 {d0[0]}, [r1]       // Load one source RGBA pixel
+    vld1.32 {d1[0]}, [r0]       // Load one destination RGBA pixel
+    vmovl.u8 q8, d0             // Expand source to 16-bit
+    vmul.i16 d16, d16, d26      // Scale source pixel
+    vmovl.u8 q3, d1             // Expand destination to 16-bit
+    vtbl.8  d2, {d16, d17}, d29 // Spread out alpha to match pixel format
+    vand    d16, d24            // Mask low byte to avoid overflow in vmla
+    vsubw.u8 q2, q15, d2        // Calculate inverse alpha (scale)
+    vmla.i16 d16, d6, d4        // Scale destination pixel and add to source
+    vshrn.i16 d0, q8, #8        // Shift and narrow result
+    vst1.32 {d0[0]}, [r0]       // Store one RGBA pixel
+    bx      lr
+
+    /* Handle 8 pixels */
+Blit8:
+    add     r3, #1              // Modify global alpha to 1...256 range
+    sub     r2, r2, #8          // Decrement loop counter
+    vdup.16 q10, r3             // Set up global alpha
+    mov     r3, r0              // Back up destination pointer
+    vld4.8  {d0-d3}, [r1]       // Load eight source RGBA pixels
+    vld4.8  {d4-d7}, [r0]       // Load eight destination RGBA pixels
+    vmov.i16 q15, #256          // Set up alpha constant
+    b       LoopRemaining
+
+    .endfunc
+
+    .data
+    .align
+AlphaIndex:
+    .byte   7, 7, 7, 7, 15, 15, 15, 15
+
+#endif
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index f41c342..9ca2b77 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -1232,6 +1232,18 @@ static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
 #define S32A_Opaque_BlitRow32_PROC NULL
 #endif
 
+
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(ENABLE_OPTIMIZED_S32A_BLITTERS)
+
+/* External function in file S32A_Blend_BlitRow32_neon.S */
+extern "C" void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
+                                          const SkPMColor* SK_RESTRICT src,
+                                          int count, U8CPU alpha);
+
+#define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_neon
+
+#else
+
 /*
  * ARM asm version of S32A_Blend_BlitRow32
  */
@@ -1364,6 +1376,8 @@ static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
 }
 
 #define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_arm
 
+#endif
+
 /* Neon version of S32_Blend_BlitRow32()
  * portable version is in src/core/SkBlitRow_D32.cpp
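
Note: the per-pixel arithmetic implemented by the NEON routine above (and by
the C++ fallback it replaces) reduces to a simple scalar loop. The sketch
below is illustrative only and is not part of the patch; the function name
S32A_Blend_BlitRow32_ref is hypothetical, and it assumes the little-endian
layout implied by the SK_CPU_LENDIAN guard, with alpha in the top byte of
each 32-bit premultiplied pixel.

    #include <stdint.h>

    /* Scalar sketch of the NEON blend math: scale = alpha + 1 (1...256),
     * dstScale = 256 - ((srcA * scale) >> 8), and per channel
     * out = (((src * scale) & 0xFF00) + dst * dstScale) >> 8.
     * The 0xFF00 mask mirrors the NEON code, where it stops the 16-bit
     * multiply-accumulate lanes from overflowing. */
    static void S32A_Blend_BlitRow32_ref(uint32_t* dst, const uint32_t* src,
                                         int count, unsigned alpha) {
        unsigned scale = alpha + 1;     /* map 0...255 to 1...256 so >>8
                                           approximates division by 255 */
        for (int i = 0; i < count; i++) {
            uint32_t s = src[i];
            uint32_t d = dst[i];
            unsigned dstScale = 256 - (((s >> 24) * scale) >> 8);
            uint32_t out = 0;
            for (int shift = 0; shift <= 24; shift += 8) {
                unsigned sc = (s >> shift) & 0xFF;
                unsigned dc = (d >> shift) & 0xFF;
                unsigned oc = (((sc * scale) & 0xFF00) + dc * dstScale) >> 8;
                out |= oc << shift;
            }
            dst[i] = out;
        }
    }

This also makes the zero-alpha special case visible: when a source pixel's
alpha is zero, a premultiplied source is zero in every channel, dstScale is
256, and the pixel reduces to dst unchanged, which is why the AllZero paths
above can skip the blend entirely.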