Diffstat (limited to 'src')
-rw-r--r-- | src/core/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp | 457
-rw-r--r-- | src/core/SkBitmapProcShader.cpp | 23
-rw-r--r-- | src/core/SkBitmapProcState.cpp | 31
-rw-r--r-- | src/core/SkBitmapProcState_matrixProcs.cpp | 2
-rw-r--r-- | src/core/SkBitmapProcState_sample.h | 98
-rw-r--r-- | src/core/SkBitmapProcState_shaderproc.h | 107
-rw-r--r-- | src/core/SkBlitRow_D16.cpp | 8
-rw-r--r-- | src/opts/S16_D32_arm.S | 188
-rw-r--r-- | src/opts/S32A_Blend_BlitRow32_arm.S | 396
-rw-r--r-- | src/opts/S32A_D565_Opaque_arm.S | 325
-rw-r--r-- | src/opts/S32A_Opaque_BlitRow32_arm.S | 311
-rw-r--r-- | src/opts/S32_Opaque_D32_nofilter_DX_gether_arm.S | 85
-rw-r--r-- | src/opts/SkBitmapProcState_opts_arm.cpp | 204
-rw-r--r-- | src/opts/SkBlitRow_opts_arm.cpp | 146
-rw-r--r-- | src/opts/SkCachePreload_arm.h | 34
-rw-r--r-- | src/ports/SkFontHost_FreeType.cpp | 133
-rw-r--r-- | src/ports/SkFontHost_linux.cpp | 76
17 files changed, 2520 insertions, 104 deletions
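For reference, below is a minimal scalar C++ sketch (not part of the patch; the helper name is illustrative) of the bilinear filtering step that the new S32_Opaque_D32_filter_DX_shaderproc_neon routine in the first file vectorizes. Each output pixel blends four neighbouring source pixels with 4-bit sub-pixel fractions x and y (0..15); the NEON code performs the same weighting per byte lane with its vmull/vmla/vshrn sequence.

#include <stdint.h>

// Scalar model of one filtered 8-bit channel. The weights are 4-bit
// fractions, so the total weight is 16*16 = 256 and the accumulated
// sum is renormalized with a shift by 8, as in the assembly.
static inline uint32_t bilerp_channel(uint32_t a00, uint32_t a01,
                                      uint32_t a10, uint32_t a11,
                                      unsigned x, unsigned y) {
    uint32_t row0 = a00 * (16 - x) + a01 * x;   // top row, weighted by x
    uint32_t row1 = a10 * (16 - x) + a11 * x;   // bottom row, weighted by x
    return (row0 * (16 - y) + row1 * y) >> 8;   // blend rows by y, >> 8 renormalizes
}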
diff --git a/src/core/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp b/src/core/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp new file mode 100644 index 0000000..20dc03b --- /dev/null +++ b/src/core/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp @@ -0,0 +1,457 @@ + +/* + * Copyright (c) 2010, Code Aurora Forum. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1, + SkFixed fx, unsigned int maxX, unsigned int subY, + unsigned int* colors, + SkFixed dx, int count) { + + asm volatile( + "mov r3, %[count] \n\t" //r3 = count + + "mov r5, %[fx] \n\t" //r5 = x = fx + "cmp r3, #0 \n\t" + "beq endloop \n\t" + + "vdup.8 d17, %[subY] \n\t" // duplicate y into d17 + "vmov.u8 d16, #16 \n\t" // set up constant in d16 + "vsub.u8 d18, d16, d17 \n\t" // d18 = 16-y + + "vmov.u16 d16, #16 \n\t" // set up constant in d16,int 16bit + +#define UNROLL8 +#define UNROLL2 +#ifdef UNROLL8 + "cmp r3, #8 \n\t" + "blt initloop2 \n\t" + ///////////////loop2 in x + "beginloop8: \n\t" + + /////////////////pixel 1//////////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 2//////////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d24, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d24, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d24, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d24, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 3//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] 
\n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d26, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d26, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d26, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d26, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 4//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d28, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d28, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d28, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d28, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 5//////////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d23, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d23, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 6//////////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d25, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d25, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d25, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d25, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 7//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + 
//(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d27, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d27, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d27, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d27, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 8//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d29, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d29, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d29, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d29, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// Store results/////////////////// + + "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8 + "vshrn.i16 d1, q12, #8 \n\t" // shift down result by 8 + "vshrn.i16 d2, q13, #8 \n\t" // shift down result by 8 + "vshrn.i16 d3, q14, #8 \n\t" // shift down result by 8 + + "vst4.u32 {d0, d1, d2, d3}, [%[colors]]! \n\t" // store result + + //////////////// end bilinear interp + + "sub r3, r3, #8 \n\t" //num -=8 + "add r5, r5, %[dx] \n\t" //r5 = x += dx + "cmp r3, #7 \n\t" + + "bgt beginloop8 \n\t" + + "endloop8: \n\t" + ////////////////end loop in x +#endif //UNROLL8 + + + +#ifdef UNROLL2 + "initloop2: \n\t" + "cmp r3, #2 \n\t" + "blt initloop \n\t" + ///////////////loop2 in x + "beginloop2: \n\t" + + + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////second half//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * 
x + "vmla.i16 d23, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d23, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x) + "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8 + + "vst1.u32 {d0}, [%[colors]]! \n\t" // store result + + //////////////// end bilinear interp + + "sub r3, r3, #2 \n\t" //num -=2 + "add r5, r5, %[dx] \n\t" //r5 = x += dx + "cmp r3, #1 \n\t" + + "bgt beginloop2 \n\t" + + "endloop2: \n\t" + ////////////////end loop in x +#endif //UNROLL2 + +#if defined (UNROLL2) || defined (UNROLL8) + "initloop: \n\t" + "cmp r3, #0 \n\t" + "ble endloop \n\t" +#endif //defined (UNROLL2) || defined (UNROLL8) + + ///////////////loop in x + "beginloop: \n\t" + + + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d4, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d4, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d4, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d4, d0, d20 \n\t" // d4 += a10 * (16-x) + "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8 + + "vst1.u32 {d0[0]}, [%[colors]]! \n\t" // store result + + //////////////// end bilinear interp + + "sub r3, r3, #1 \n\t" //num -=1 + "add r5, r5, %[dx] \n\t" //r5 = x += dx + "cmp r3, #0 \n\t" + "bgt beginloop \n\t" + + "endloop: \n\t" + ////////////////end loop in x + : [colors] "+r" (colors) + : [image0] "r" (image0), [image1] "r" (image1), [fx] "r" (fx), [maxX] "r" (maxX), [subY] "r" (subY), + [dx] "r" (dx), [count] "r" (count) + : "cc", "memory", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29" + ); + + +} diff --git a/src/core/SkBitmapProcShader.cpp b/src/core/SkBitmapProcShader.cpp index 6d64716..1201ea4 100644 --- a/src/core/SkBitmapProcShader.cpp +++ b/src/core/SkBitmapProcShader.cpp @@ -159,6 +159,18 @@ bool SkBitmapProcShader::setContext(const SkBitmap& device, #define TEST_BUFFER_EXTRA 0 #endif +#if defined(__ARM_HAVE_NEON) +void clampx_nofilter_trans(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y) ; + +void S16_opaque_D32_nofilter_DX(const SkBitmapProcState& s, + const uint32_t* SK_RESTRICT xy, + int count, uint32_t* SK_RESTRICT colors) ; + +void clampx_nofilter_trans_S16_D32_DX(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y, uint32_t* SK_RESTRICT colors) ; +#endif + void SkBitmapProcShader::shadeSpan(int x, int y, SkPMColor dstC[], int count) { const SkBitmapProcState& state = fState; if (state.fShaderProc32) { @@ -181,6 +193,12 @@ void SkBitmapProcShader::shadeSpan(int x, int y, SkPMColor dstC[], int count) { n = max; } SkASSERT(n > 0 && n < BUF_MAX*2); +#if defined(__ARM_HAVE_NEON) + if( sproc == S16_opaque_D32_nofilter_DX && mproc == clampx_nofilter_trans ){ + clampx_nofilter_trans_S16_D32_DX(state, buffer, n, x, y, dstC); + } else { +#endif + #ifdef TEST_BUFFER_OVERRITE for (int i = 0; i < TEST_BUFFER_EXTRA; i++) { buffer[BUF_MAX + i] = TEST_PATTERN; @@ -193,7 +211,10 @@ void SkBitmapProcShader::shadeSpan(int x, int y, SkPMColor dstC[], int count) { 
} #endif sproc(state, buffer, n, dstC); - + +#if defined(__ARM_HAVE_NEON) + } +#endif if ((count -= n) == 0) { break; } diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp index 3d34b20..a0758c0 100644 --- a/src/core/SkBitmapProcState.cpp +++ b/src/core/SkBitmapProcState.cpp @@ -1,4 +1,3 @@ - /* * Copyright 2011 Google Inc. * @@ -93,7 +92,11 @@ static inline U8CPU Filter_8(unsigned x, unsigned y, SkASSERT(state.fAlphaScale == 256) #define RETURNDST(src) src #define SRC_TO_FILTER(src) src +#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) + #define USE_GETHER32 +#endif #include "SkBitmapProcState_sample.h" +#undef USE_GETHER32 #undef FILTER_PROC #define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_alpha(x, y, a, b, c, d, dst, alphaScale) @@ -124,7 +127,9 @@ static inline U8CPU Filter_8(unsigned x, unsigned y, SkASSERT(state.fAlphaScale == 256) #define RETURNDST(src) SkPixel16ToPixel32(src) #define SRC_TO_FILTER(src) src +#define USE_S16_OPAQUE #include "SkBitmapProcState_sample.h" +#undef USE_S16_OPAQUE #undef FILTER_PROC #define FILTER_PROC(x, y, a, b, c, d, dst) \ @@ -340,6 +345,25 @@ static inline U8CPU Filter_8(unsigned x, unsigned y, #define POSTAMBLE(state) state.fBitmap->getColorTable()->unlockColors(false) #include "SkBitmapProcState_shaderproc.h" +#if defined(__ARM_HAVE_NEON) +#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) +#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) +#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) +#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) + +#undef FILTER_PROC +#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_opaque(x, y, a, b, c, d, dst) +#define MAKENAME(suffix) S32_Opaque_D32 ## suffix +#define SRCTYPE uint32_t +#define DSTTYPE uint32_t +#define SRC_TO_FILTER(src) src +#include "S32_Opaque_D32_filter_DX_shaderproc_neon.cpp" +#define S32_OPAQUE_D32_FILTER_DX_NEON +#include "SkBitmapProcState_shaderproc.h" +#undef S32_OPAQUE_D32_FILTER_DX_NEON +#endif //ARM_HAVE_NEON + + /////////////////////////////////////////////////////////////////////////////// static bool valid_for_filtering(unsigned dimension) { @@ -532,6 +556,11 @@ bool SkBitmapProcState::chooseProcs(const SkMatrix& inv, const SkPaint& paint) { } else if (SI8_opaque_D32_filter_DX == fSampleProc32 && clamp_clamp) { fShaderProc32 = Clamp_SI8_opaque_D32_filter_DX_shaderproc; } +#if defined(__ARM_HAVE_NEON) + else if (S32_opaque_D32_filter_DX == fSampleProc32 && clamp_clamp) { + fShaderProc32 = S32_Opaque_D32_filter_DX_shaderproc; + } +#endif //ARM_HAVE_NEON // see if our platform has any accelerated overrides this->platformProcs(); diff --git a/src/core/SkBitmapProcState_matrixProcs.cpp b/src/core/SkBitmapProcState_matrixProcs.cpp index bda2438..583a39b 100644 --- a/src/core/SkBitmapProcState_matrixProcs.cpp +++ b/src/core/SkBitmapProcState_matrixProcs.cpp @@ -327,7 +327,7 @@ static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy, return SkScalarToFixed(pt.fX) >> 16; } -static void clampx_nofilter_trans(const SkBitmapProcState& s, +void clampx_nofilter_trans(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y) { SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h index e6b587f..793283d 100644 --- a/src/core/SkBitmapProcState_sample.h +++ b/src/core/SkBitmapProcState_sample.h @@ -23,6 +23,13 @@ #error "unsupported DSTSIZE" #endif +#if defined(USE_GETHER32) + extern "C" void 
S32_Opaque_D32_nofilter_DX_gether(SkPMColor* SK_RESTRICT colors, + const SkPMColor* SK_RESTRICT srcAddr, + int count, + const uint32_t* SK_RESTRICT xy); +#endif + void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s, const uint32_t* SK_RESTRICT xy, int count, DSTTYPE* SK_RESTRICT colors) { @@ -65,6 +72,93 @@ void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s, #endif } + +#if defined(USE_S16_OPAQUE) && defined(__ARM_HAVE_NEON) + +extern "C" void Blit_Pixel16ToPixel32( uint32_t * colors, const uint16_t *srcAddr, int n ); + +void clampx_nofilter_trans_S16_D32_DX(const SkBitmapProcState& s, + uint32_t xy[], int count, int x, int y, DSTTYPE* SK_RESTRICT colors) { + + SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); + + //int xpos = nofilter_trans_preamble(s, &xy, x, y); + SkPoint pt; + s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, + SkIntToScalar(y) + SK_ScalarHalf, &pt); + uint32_t Y = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16, + s.fBitmap->height()); + int xpos = SkScalarToFixed(pt.fX) >> 16; + + const SRCTYPE* SK_RESTRICT srcAddr = (const SRCTYPE*)s.fBitmap->getPixels(); + SRCTYPE src; + + // buffer is y32, x16, x16, x16, x16, x16 + // bump srcAddr to the proper row, since we're told Y never changes + //SkASSERT((unsigned)orig_xy[0] < (unsigned)s.fBitmap->height()); + //srcAddr = (const SRCTYPE*)((const char*)srcAddr + + // orig_xy[0] * s.fBitmap->rowBytes()); + SkASSERT((unsigned)Y < (unsigned)s.fBitmap->height()); + srcAddr = (const SRCTYPE*)((const char*)srcAddr + + Y * s.fBitmap->rowBytes()); + const int width = s.fBitmap->width(); + int n; + if (1 == width) { + // all of the following X values must be 0 + memset(xy, 0, count * sizeof(uint16_t)); + src = srcAddr[0]; + DSTTYPE dstValue = RETURNDST(src); + BITMAPPROC_MEMSET(colors, dstValue, count); + return; + //goto done_sample; + } + + + // fill before 0 as needed + if (xpos < 0) { + n = -xpos; + if (n > count) { + n = count; + } + src = srcAddr[0]; + for( int i = 0; i < n ; i++ ){ + *colors++ = RETURNDST(src); + } + + count -= n; + if (0 == count) { + return; + } + xpos = 0; + } + + // fill in 0..width-1 if needed + if (xpos < width) { + n = width - xpos; + if (n > count) { + n = count; + } + //for (int i = 0; i < n; i++) { + // src = srcAddr[xpos++]; + // *colors++ = RETURNDST(src); + //} + Blit_Pixel16ToPixel32( colors, &(srcAddr[xpos]), n ); + colors += n; + count -= n; + if (0 == count) { + return; + } + } + + for (int i = 0; i < count; i++) { + src = srcAddr[width - 1]; + *colors++ = RETURNDST(src); + } + +} + +#endif + void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, const uint32_t* SK_RESTRICT xy, int count, DSTTYPE* SK_RESTRICT colors) { @@ -92,6 +186,9 @@ void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, DSTTYPE dstValue = RETURNDST(src); BITMAPPROC_MEMSET(colors, dstValue, count); } else { +#if defined(USE_GETHER32) + S32_Opaque_D32_nofilter_DX_gether(colors, srcAddr, count, xy); +#else int i; for (i = (count >> 2); i > 0; --i) { uint32_t xx0 = *xy++; @@ -111,6 +208,7 @@ void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, SkASSERT(*xx < (unsigned)s.fBitmap->width()); src = srcAddr[*xx++]; *colors++ = RETURNDST(src); } +#endif } #ifdef POSTAMBLE diff --git a/src/core/SkBitmapProcState_shaderproc.h b/src/core/SkBitmapProcState_shaderproc.h index a3a8a99..3f5d55b 100644 --- a/src/core/SkBitmapProcState_shaderproc.h +++ b/src/core/SkBitmapProcState_shaderproc.h @@ -49,7 +49,111 @@ static void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y, #ifdef PREAMBLE 
PREAMBLE(s); #endif - + +#ifdef S32_OPAQUE_D32_FILTER_DX_NEON + int post_count; + SkFixed post_fx; + DSTTYPE* SK_RESTRICT post_colors; + int num; + post_count = count; + post_fx = fx; + post_colors = colors; + + + if (dx>=0) + { + int end = ((int)maxX-1)<<16; + num = (end-fx)/dx; + if (num < 0) num = 0; + + if (num<count) + { + count = num; + post_count = post_count - count; + post_fx = fx + count*dx; + post_colors = post_colors + count; + } + else + post_count = 0; + + while (fx<0 && count) { + unsigned subX = TILEX_LOW_BITS(fx, maxX); + unsigned x0 = TILEX_PROCF(fx, maxX); + unsigned x1 = TILEX_PROCF((fx + oneX), maxX); + + FILTER_PROC(subX, subY, + SRC_TO_FILTER(row0[x0]), + SRC_TO_FILTER(row0[x1]), + SRC_TO_FILTER(row1[x0]), + SRC_TO_FILTER(row1[x1]), + colors); + colors += 1; + + fx += dx; + count--; + } + } + else + { + int end = 0; + int maxXFix = ((int)maxX-1)<<16; + num = (end-fx)/dx; + if (num < 0) num = 0; + + + if (num<count) + { + count = num; + post_count = post_count - count; + post_fx = fx + count*dx; + post_colors = post_colors + count; + } + else + post_count = 0; + + while (fx>=maxXFix && count) { + unsigned subX = TILEX_LOW_BITS(fx, maxX); + unsigned x0 = TILEX_PROCF(fx, maxX); + unsigned x1 = TILEX_PROCF((fx + oneX), maxX); + + FILTER_PROC(subX, subY, + SRC_TO_FILTER(row0[x0]), + SRC_TO_FILTER(row0[x1]), + SRC_TO_FILTER(row1[x0]), + SRC_TO_FILTER(row1[x1]), + colors); + colors += 1; + + fx += dx; + count--; + } + + } + + S32_Opaque_D32_filter_DX_shaderproc_neon(row0, row1, fx, maxX, subY, colors, dx, count); + + fx = post_fx; + colors = post_colors; + while (post_count) { + unsigned subX = TILEX_LOW_BITS(fx, maxX); + unsigned x0 = TILEX_PROCF(fx, maxX); + unsigned x1 = TILEX_PROCF((fx + oneX), maxX); + + FILTER_PROC(subX, subY, + SRC_TO_FILTER(row0[x0]), + SRC_TO_FILTER(row0[x1]), + SRC_TO_FILTER(row1[x0]), + SRC_TO_FILTER(row1[x1]), + colors); + colors += 1; + + fx += dx; + post_count--; + } + + +#else //S32_OPAQUE_D32_FILTER_DX_NEON + do { unsigned subX = TILEX_LOW_BITS(fx, maxX); unsigned x0 = TILEX_PROCF(fx, maxX); @@ -65,6 +169,7 @@ static void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y, fx += dx; } while (--count != 0); +#endif //S32_OPAQUE_D32_FILTER_DX_NEON #ifdef POSTAMBLE POSTAMBLE(s); diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp index c815468..3bdaf0a 100644 --- a/src/core/SkBlitRow_D16.cpp +++ b/src/core/SkBlitRow_D16.cpp @@ -207,12 +207,20 @@ static void S32A_D565_Blend_Dither(uint16_t* SK_RESTRICT dst, /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// +#if defined(__CPU_ARCH_ARM) && defined(SK_CPU_LENDIAN) + extern "C" void S32A_D565_Opaque_arm(uint16_t*, uint32_t*, size_t); +#endif + static const SkBlitRow::Proc gDefault_565_Procs[] = { // no dither S32_D565_Opaque, S32_D565_Blend, +#if defined(__CPU_ARCH_ARM) && defined(SK_CPU_LENDIAN) + (SkBlitRow::Proc)S32A_D565_Opaque_arm, +#else S32A_D565_Opaque, +#endif S32A_D565_Blend, // dither diff --git a/src/opts/S16_D32_arm.S b/src/opts/S16_D32_arm.S new file mode 100644 index 0000000..0278a8f --- /dev/null +++ b/src/opts/S16_D32_arm.S @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2005-2008, The Android Open Source Project + * Copyright (c) 2011, Code Aurora Forum. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + .text + .align + + + .global Blit_Pixel16ToPixel32 + .func Blit_Pixel16ToPixel32 + +//void Blit_Pixel16ToPixel32( uint32_t * colors, const uint16_t *srcAddr, int n ); +// r0: dst ptr +// r1: src ptr +// r2: n + +Blit_Pixel16ToPixel32: + cmp r2,#32 + blt .Lless_than_32 + + vld2.u8 {d0, d2}, [r1]! //q0, lower 8 bits + vld2.u8 {d1, d3}, [r1]! //q1, higher 8 bits + vmov.u8 q13, #0xff + +.Lpipeline_neon_loop: + pld [r1, #256] + vmov.u8 q12, #31 + vand.u8 q12, q12, q0 //q12 is B channels + + vshr.u8 q11, q0, #5 //lower 3 bits of G channels + vshl.u8 q14, q1, #5 + vshr.u8 q14, q14, #2 //higher 3 bits of G channels + vorr q11, q11, q14 //q11 is G channels + + vshr.u8 q10, q1, #3 //q10 is R channels + + sub r2,r2,#16 + + vld2.u8 {d0, d2}, [r1]! //q0, lower 8 bits + vld2.u8 {d1, d3}, [r1]! //q1, higher 8 bits + + vshl.u8 q2, q12, #3 // b << (8-SK_B16_BITS) + vshr.u8 q12, q12, #2 // b >> (2* SK_B16_BITS -8 ) + vorr q12, q12, q2 //B channels + + cmp r2,#32 + vshl.u8 q3, q11, #2 // b << (8-SK_G16_BITS) + vshr.u8 q11, q11, #4 // b >> (2* SK_G16_BITS -8 ) + vorr q11, q11, q3 //G channels + + vshl.u8 q8, q10, #3 // b << (8-SK_R16_BITS) + vshr.u8 q10, q10, #2 // b >> (2* SK_R16_BITS -8 ) + vorr q10, q10, q8 //R channels + + + vst4.u8 {d20, d22, d24, d26}, [r0]! + vst4.u8 {d21, d23, d25, d27}, [r0]! + bge .Lpipeline_neon_loop + +.Lsurfix: + vmov.u8 q12, #31 + vand.u8 q12, q12, q0 //q12 is B channels + + vshr.u8 q11, q0, #5 //lower 3 bits of G channels + vshl.u8 q14, q1, #5 + vshr.u8 q14, q14, #2 //higher 3 bits of G channels + vorr q11, q11, q14 //q11 is G channels + + vshr.u8 q10, q1, #3 //q10 is R channels + + sub r2,r2,#16 + + vshl.u8 q2, q12, #3 // b << (8-SK_B16_BITS) + vshr.u8 q12, q12, #2 // b >> (2* SK_B16_BITS -8 ) + vorr q12, q12, q2 //B channels + + vshl.u8 q3, q11, #2 // b << (8-SK_G16_BITS) + vshr.u8 q11, q11, #4 // b >> (2* SK_G16_BITS -8 ) + vorr q11, q11, q3 //G channels + + vshl.u8 q8, q10, #3 // b << (8-SK_R16_BITS) + vshr.u8 q10, q10, #2 // b >> (2* SK_R16_BITS -8 ) + vorr q10, q10, q8 //R channels + + vst4.u8 {d20, d22, d24, d26}, [r0]! + vst4.u8 {d21, d23, d25, d27}, [r0]! + + +.Lless_than_32: + cmp r2,#16 + blt .Lless_than_16 + //vpush {Q4-Q7} + +.Lneon_loop: + pld [r1, #256] + vld2.u8 {d0, d2}, [r1]! //q0, lower 8 bits + vld2.u8 {d1, d3}, [r1]! //q1, higher 8 bits + + vmov.u8 q12, #31 + vand.u8 q12, q12, q0 //q12 is B channels + + vshr.u8 q11, q0, #5 //lower 3 bits of G channels + vshl.u8 q14, q1, #5 + vshr.u8 q14, q14, #2 //higher 3 bits of G channels + vorr q11, q11, q14 //q11 is G channels + + vshr.u8 q10, q1, #3 //q10 is R channels + + sub r2,r2,#16 + + vshl.u8 q2, q12, #3 // b << (8-SK_B16_BITS) + vshr.u8 q12, q12, #2 // b >> (2* SK_B16_BITS -8 ) + vorr q12, q12, q2 //B channels + + cmp r2,#16 + vshl.u8 q3, q11, #2 // b << (8-SK_G16_BITS) + vshr.u8 q11, q11, #4 // b >> (2* SK_G16_BITS -8 ) + vorr q11, q11, q3 //G channels + + vshl.u8 q8, q10, #3 // b << (8-SK_R16_BITS) + vshr.u8 q10, q10, #2 // b >> (2* SK_R16_BITS -8 ) + vorr q10, q10, q8 //R channels + + vmov.u8 q13, #0xff + + vst4.u8 {d20, d22, d24, d26}, [r0]! + vst4.u8 {d21, d23, d25, d27}, [r0]! 
+ bge .Lneon_loop + + //vpop {Q4-Q7} + +.Lless_than_16: + + + cmp r2, #0 // 0x0 + ble .Lend + + push {r4, r5, r6, r7, r8} + + + lsl r2, r2, #1 + mov r3, #0 // 0x0 + +.Lloop: + + ldrh r6, [r1, r3] + + and r5, r6, #31 // 0x1f //r5 is B + + ubfx r4, r6, #5, #6 // r4 is G + + lsr ip, r6, #11 //ip is R + + + lsl r8, r5, #3 + lsl r6, r4, #2 + lsr r7, ip, #2 + + orr r5, r8, r5, lsr #2 + orr ip, r7, ip, lsl #3 + orr r4, r6, r4, lsr #4 + orr ip, ip, #-16777216 // 0xff000000 + orr r5, ip, r5, lsl #16 + orr r4, r5, r4, lsl #8 + str r4, [r0, r3, lsl #1] + add r3, r3, #2 // 0x2 + + cmp r3, r2 + bne .Lloop + + pop {r4, r5, r6, r7, r8} +.Lend: + bx lr + diff --git a/src/opts/S32A_Blend_BlitRow32_arm.S b/src/opts/S32A_Blend_BlitRow32_arm.S new file mode 100644 index 0000000..cd96f90 --- /dev/null +++ b/src/opts/S32A_Blend_BlitRow32_arm.S @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2005-2008, The Android Open Source Project + * Copyright (c) 2010, Code Aurora Forum. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + .text + + .global S32A_Blend_BlitRow32_arm_neon + .func S32A_Blend_BlitRow32_arm_neon + +S32A_Blend_BlitRow32_arm_neon: + + // calculate src_scale = aa + 1 + add r3, r3, #1 + +#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__) + + cmp r2,#24 + blt .Lslow_path + + push {r4, r5} + vpush {q4-q7} + vmov.u16 q14,#0x100 + + // store src_scale in q4 + vdup.u16 q4, r3 + + vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + //update source ptr but not dst ptr + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + add r4, r0, #32 // minus 16 to pretend the last round + mov r5, #64 + sub r2,r2,#8 + +.Lloop: + pld [r1, #256] + pld [r0, #256] + subs r2, r2, #16 + cmp r2,#16 + + // expand destination from 8-bit to 16-bit + vmovl.u8 q6, d4 + vmovl.u8 q7, d5 + vmovl.u8 q8, d6 + vmovl.u8 q9, d7 + + // expand source from 8-bit to 16-bit + vmovl.u8 q13, d3 + vmovl.u8 q10, d0 + vmovl.u8 q11, d1 + vmovl.u8 q12, d2 + + //update source ptr but not dst ptr + // calculate destination scale + vmul.u16 q5, q13, q4 + vshr.u16 q5, q5, #8 + vsub.u16 q5, q14, q5 + + vld4.8 {d0, d1, d2, d3}, [r1]! 
//d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + // multiply destination ARGB components with dst_scale + vmul.u16 q6, q6, q5 + vmul.u16 q7, q7, q5 + vmul.u16 q8, q8, q5 + vmul.u16 q9, q9, q5 + + + vld4.8 {d4, d5, d6, d7}, [r4] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + // multiply source ARGB components with src_scale + vmul.u16 q10, q10, q4 + vmul.u16 q11, q11, q4 + vmul.u16 q12, q12, q4 + vmul.u16 q13, q13, q4 + + + // add processed src and dest pixels and extract high bytes + vqadd.u8 q10, q6, q10 + vqadd.u8 q11, q7, q11 + vqadd.u8 q12, q8, q12 + vqadd.u8 q13, q9, q13 + + vshrn.u16 d20, q10, #8 + vshrn.u16 d21, q11, #8 + vshrn.u16 d22, q12, #8 + vshrn.u16 d23, q13, #8 + + vst4.8 {d20, d21, d22, d23}, [r0], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + + // expand destination from 8-bit to 16-bit + vmovl.u8 q6, d4 + vmovl.u8 q7, d5 + vmovl.u8 q8, d6 + vmovl.u8 q9, d7 + + // expand source from 8-bit to 16-bit + vmovl.u8 q13, d3 + vmovl.u8 q10, d0 + vmovl.u8 q11, d1 + vmovl.u8 q12, d2 + + // calculate destination scale + vmul.u16 q5, q13, q4 + vshr.u16 q5, q5, #8 + vsub.u16 q5, q14, q5 + + vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + + // multiply destination ARGB components with dst_scale + vmul.u16 q6, q6, q5 + vmul.u16 q7, q7, q5 + vmul.u16 q8, q8, q5 + vmul.u16 q9, q9, q5 + + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + // multiply source ARGB components with src_scale + vmul.u16 q10, q10, q4 + vmul.u16 q11, q11, q4 + vmul.u16 q12, q12, q4 + vmul.u16 q13, q13, q4 + + + // add processed src and dest pixels and extract high bytes + vqadd.u8 q10, q6, q10 + vqadd.u8 q11, q7, q11 + vqadd.u8 q12, q8, q12 + vqadd.u8 q13, q9, q13 + + vshrn.u16 d20, q10, #8 + vshrn.u16 d21, q11, #8 + vshrn.u16 d22, q12, #8 + vshrn.u16 d23, q13, #8 + + vst4.8 {d20, d21, d22, d23}, [r4], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + + bge .Lloop + +//There are 8 words left unprocessed from previous round + // expand destination from 8-bit to 16-bit + vmovl.u8 q6, d4 + vmovl.u8 q7, d5 + vmovl.u8 q8, d6 + vmovl.u8 q9, d7 + + // expand source from 8-bit to 16-bit + vmovl.u8 q13, d3 + vmovl.u8 q10, d0 + vmovl.u8 q11, d1 + vmovl.u8 q12, d2 + + // calculate destination scale + vmul.u16 q5, q13, q4 + vshr.u16 q5, q5, #8 + vsub.u16 q5, q14, q5 + + // multiply destination ARGB components with dst_scale + vmul.u16 q6, q6, q5 + vmul.u16 q7, q7, q5 + vmul.u16 q8, q8, q5 + vmul.u16 q9, q9, q5 + + // multiply source ARGB components with src_scale + vmul.u16 q10, q10, q4 + vmul.u16 q11, q11, q4 + vmul.u16 q12, q12, q4 + vmul.u16 q13, q13, q4 + + // add processed src and dest pixels and extract high bytes + vqadd.u8 q10, q6, q10 + vqadd.u8 q11, q7, q11 + vqadd.u8 q12, q8, q12 + vqadd.u8 q13, q9, q13 + + vshrn.u16 d20, q10, #8 + vshrn.u16 d21, q11, #8 + vshrn.u16 d22, q12, #8 + vshrn.u16 d23, q13, #8 + + vst4.8 {d20, d21, d22, d23}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + +.Lless_than_16: + cmp r2,#8 + blt .Lless_than_8 + + sub r2,r2,#8 + + vld4.8 {d0, d1, d2, d3}, [r1]! 
//d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + //update source ptr but not dst ptr + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + // expand destination from 8-bit to 16-bit + vmovl.u8 q6, d4 + vmovl.u8 q7, d5 + vmovl.u8 q8, d6 + vmovl.u8 q9, d7 + + // expand source from 8-bit to 16-bit + vmovl.u8 q13, d3 + vmovl.u8 q10, d0 + vmovl.u8 q11, d1 + vmovl.u8 q12, d2 + + // calculate destination scale + vmul.u16 q5, q13, q4 + vshr.u16 q5, q5, #8 + vsub.u16 q5, q14, q5 + + // multiply destination ARGB components with dst_scale + vmul.u16 q6, q6, q5 + vmul.u16 q7, q7, q5 + vmul.u16 q8, q8, q5 + vmul.u16 q9, q9, q5 + + // multiply source ARGB components with src_scale + vmul.u16 q10, q10, q4 + vmul.u16 q11, q11, q4 + vmul.u16 q12, q12, q4 + vmul.u16 q13, q13, q4 + + // add processed src and dest pixels and extract high bytes + vqadd.u8 q10, q6, q10 + vqadd.u8 q11, q7, q11 + vqadd.u8 q12, q8, q12 + vqadd.u8 q13, q9, q13 + + vshrn.u16 d4, q10, #8 + vshrn.u16 d5, q11, #8 + vshrn.u16 d6, q12, #8 + vshrn.u16 d7, q13, #8 + + vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + +.Lless_than_8: + vpop {q4-q7} + pop {r4, r5} + +.Lslow_path: + adds r2, #0 + bxeq lr +#endif + +/* + * r0 - dst + * r1 - src + * r2 - count + * r3 - alpha + */ + push {r4-r11, lr} + + mov r10, #0xFF + orr r10, r10, r10, lsl #16 //mask = r10 = 0x00FF00FF + + subs r2, r2, #2 + blt .Lblitrow32_single_loop + +.Lblitrow32_double_loop: + ldm r0, {r4, r5} + ldm r1!, {r6, r7} + + /* First iteration */ + lsr lr, r6, #24 //extract src_alpha + + // calculate dst_scale = 256 - ((src_alpha*src_scale)>>8) + mul lr, r3, lr + lsr lr, #8 + rsb lr, lr, #256 + + // src processing + and r8, r6, r10 //rb = (src & mask) + and r9, r10, r6, lsr #8 //ag = (src>>8) & mask + + mul r11, r8, r3 //RB = rb * src_scale + mul r6, r9, r3 //AG = ag * src_scale + + // combine RB and AG + and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask + and r6, r6, r10, lsl #8 //r9 = AG & ~mask + + orr r6, r6, r11 + + // dst processing + and r8, r4, r10 //rb = (dst & mask) + and r9, r10, r4, lsr #8 //ag = (dst>>8) & mask + + mul r11, r8, lr //RB = rb * dst_scale + mul r4, r9, lr //AG = ag * dst_scale + + // combine RB and AG + and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask + and r4, r4, r10, lsl #8 //r9 = AG & ~mask + + orr r4, r4, r11 + + /* Second iteration */ + lsr lr, r7, #24 //extract src_alpha + + // calculate dst_scale = 256 - ((src_alpha*src_scale)>>8) + mul lr, r3, lr + lsr lr, #8 + rsb lr, lr, #256 + + // src processing + and r8, r7, r10 //rb = (src & mask) + and r9, r10, r7, lsr #8 //ag = (src>>8) & mask + + mul r11, r8, r3 //RB = rb * src_scale + mul r7, r9, r3 //AG = ag * src_scale + + // combine RB and AG + and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask + and r7, r7, r10, lsl #8 //r9 = AG & ~mask + + orr r7, r7, r11 + + // dst processing + and r8, r5, r10 //rb = (dst & mask) + and r9, r10, r5, lsr #8 //ag = (dst>>8) & mask + + mul r11, r8, lr //RB = rb * dst_scale + mul r5, r9, lr //AG = ag * dst_scale + + // combine RB and AG + and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask + and r5, r5, r10, lsl #8 //r9 = AG & ~mask + + orr r5, r5, r11 + + + // add processed src and dst + add r6, r6, r4 + add r7, r7, r5 + + subs r2, r2, #2 + stm r0!, {r6, r7} + + bge .Lblitrow32_double_loop + +.Lblitrow32_single_loop: + adds r2, #1 + blo .Lexit + + ldr r4, [r0] + ldr r6, [r1], #4 + + lsr lr, r6, #24 //extract src_alpha + + // calculate dst_scale = 256 - ((src_alpha*src_scale)>>8) + mul lr, r3, lr + lsr lr, #8 + rsb lr, lr, 
#256 + + // src processing + and r8, r6, r10 //rb = (src & mask) + and r9, r10, r6, lsr #8 //ag = (src>>8) & mask + + mul r11, r8, r3 //RB = rb * src_scale + mul r6, r9, r3 //AG = ag * src_scale + + // combine RB and AG + and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask + and r6, r6, r10, lsl #8 //r9 = AG & ~mask + + orr r6, r6, r11 + + // dst processing + and r8, r4, r10 //rb = (dst & mask) + and r9, r10, r4, lsr #8 //ag = (dst>>8) & mask + + mul r11, r8, lr //RB = rb * dst_scale + mul r4, r9, lr //AG = ag * dst_scale + + // combine RB and AG + and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask + and r4, r4, r10, lsl #8 //r9 = AG & ~mask + + orr r4, r4, r11 + + add r6, r6, r4 //add processed src and dst + + str r6, [r0], #4 + +.Lexit: + pop {r4-r11, lr} + bx lr diff --git a/src/opts/S32A_D565_Opaque_arm.S b/src/opts/S32A_D565_Opaque_arm.S new file mode 100644 index 0000000..9576521 --- /dev/null +++ b/src/opts/S32A_D565_Opaque_arm.S @@ -0,0 +1,325 @@ +/* + * Copyright 2006, The Android Open Source Project + * Copyright (c) 2009, Code Aurora Forum. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/* + * This file is derived from libpixelflinger version of BLIT routine. + * Algorithm used for BLIT operation here is equivalent to the one in + * C function, S32A_D565_Opaque. Use neon instructions to process 16 pixels + * at-a-time on armv7. If the number of pixels is less than 16 and/or the + * architecture is armv6 and below, use regular arm instructions. Regular + * arm code combines two 16-bit writes into one 32-bit write to destination, + * uses destination and source pre-loads, and unrolls the main loop thrice. 
+ */ + .text + .align + + .global S32A_D565_Opaque_arm + +// uses r6, r7, r8, r9, r10, lr + +.macro pixel, DREG, SRC, FB, OFFSET + + // SRC = AABBGGRR + subs r7, r10, \SRC, lsr #24 // sAA = 255 - sAA + beq 1f + +.if \OFFSET + + // red + mov lr, \DREG, lsr #(\OFFSET + 6 + 5) + smlabb lr, r7, lr, r8 + and r6, \SRC, r10 + add lr, lr, lr, lsr #5 + add lr, r6, lr, lsr #5 + lsr lr, #3 + orr \FB, lr, lsl #(\OFFSET + 11) + + // green + and r6, \DREG, #(0x3F<<(\OFFSET + 5)) + lsr r6, #5 + smlabt r6, r7, r6, r9 + and lr, r10, \SRC, lsr #(8) + add r6, r6, r6, lsr #6 + add r6, lr, r6, lsr #6 + lsr r6, #2 + orr \FB, \FB, r6, lsl #(\OFFSET + 5) + + // blue + and lr, \DREG, #(0x1F << \OFFSET) + smlabt lr, r7, lr, r8 + and r6, r10, \SRC, lsr #(8+8) + add lr, lr, lr, lsr #5 + add lr, r6, lr, lsr #5 + lsr lr, #3 + orr \FB, \FB, lr, lsl #\OFFSET + +.else + + // red + mov lr, \DREG, lsr #(6+5) + and lr, lr, #0x1F + smlabb lr, r7, lr, r8 + and r6, \SRC, r10 + add lr, lr, lr, lsr #5 + add lr, r6, lr, lsr #5 + lsr lr, #3 + mov \FB, lr, lsl #11 + + // green + and r6, \DREG, #(0x3F<<5) + lsr r6, #5 + smlabb r6, r7, r6, r9 + and lr, r10, \SRC, lsr #(8) + add r6, r6, r6, lsr #6 + add r6, lr, r6, lsr #6 + lsr r6, #2 + orr \FB, \FB, r6, lsl #5 + + // blue + and lr, \DREG, #0x1F + smlabb lr, r7, lr, r8 + and r6, r10, \SRC, lsr #(8+8) + add lr, lr, lr, lsr #5 + add lr, r6, lr, lsr #5 + orr \FB, \FB, lr, lsr #3 + +.endif + b 2f + + /* + * When alpha = 255, down scale the source RGB pixel (24 bits) + * to 16 bits(RGB565) + */ +1: + lsl r6, \SRC, #8 + lsr lr, \SRC, #5 + and r7, r6, #0xf800 + and lr, lr, #0x7e0 + orr lr, lr, r7 + +.if \OFFSET + orr lr, lr, r6, lsr #27 + orr \FB, \FB, lr, lsl #(\OFFSET) +.else + orr \FB, lr, r6, lsr #27 +.endif + +2: +.endm + + +// r0: dst ptr +// r1: src ptr +// r2: count +// r3: d +// r4: s0 +// r5: s1 +// r6: pixel +// r7: pixel +// r8: 0x10 +// r9: 0x20 +// r10: 0xFF +// r11: free +// r12: scratch +// r14: free + +S32A_D565_Opaque_arm: + stmfd sp!, {r4-r10, lr} + +#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__) + subs r2, r2, #16 + + blo blit_less_than_16_left + + vmov.u16 q12, #0x80 + vmov.u8 q13, #0xf8 + +blit_neon_loop: + /* + * Load 64 bytes from source and 32 bytes from destination + * note that source pixels are 4 bytes wide and + * destination pixels are 2 bytes wide. + */ + vld4.8 {d2, d4, d6, d8}, [r1]! + vld4.8 {d3, d5, d7, d9}, [r1]! + + vand.8 d10, d8, d9 + vmov r3, r4, d10 + + cmp r3, #0xffffffff + cmpeq r4, #0xffffffff + bne blit_alpha_not_255 + + // alpha equals 255 case + + vshl.u8 q0, q2, #3 + + subs r2, r2, #16 + + vsri.u8 q1, q2, #5 + vsri.u8 q0, q3, #3 + + // store the rgb destination values back to memory + vst2.8 {d0, d2}, [r0]! + vst2.8 {d1, d3}, [r0]! 
+ + blo blit_less_than_16_left + b blit_neon_loop + +blit_alpha_not_255: + // alpha = 255 - alpha + vmvn.u8 q0, q4 + + vld2.8 {q5, q6}, [r0] + + vshl.u8 q7, q6, #3 + + subs r2, r2, #16 + + vand.u8 q6, q6, q13 + + vmov.16 q8, q12 + vmov.16 q9, q12 + + vsri.u8 q7, q5, #5 + vshl.u8 q5, q5, #3 + + vmlal.u8 q8, d0, d12 + vmlal.u8 q9, d1, d13 + + vshl.u8 q7, q7, #2 + + vshr.u16 q10, q8, #5 + vshr.u16 q11, q9, #5 + vaddhn.u16 d12, q8, q10 + vaddhn.u16 d13, q9, q11 + + vmov.16 q8, q12 + vmov.16 q9, q12 + vmlal.u8 q8, d0, d14 + vmlal.u8 q9, d1, d15 + + vqadd.u8 q6, q6, q1 + + vshr.u16 q10, q8, #6 + vshr.u16 q11, q9, #6 + vaddhn.u16 d14, q8, q10 + vaddhn.u16 d15, q9, q11 + + vmov.16 q8, q12 + vmov.16 q9, q12 + vmlal.u8 q8, d0, d10 + vmlal.u8 q9, d1, d11 + + vqadd.u8 q7, q7, q2 + + vshl.u8 q5, q7, #3 + + vshr.u16 q10, q8, #5 + vshr.u16 q11, q9, #5 + + vsri.u8 q6, q7, #5 + + vaddhn.u16 d16, q8, q10 + vaddhn.u16 d17, q9, q11 + vqadd.u8 q8, q8, q3 + + vsri.u8 q5, q8, #3 + + // store the rgb destination values back to memory + vst2.8 {d10, d12}, [r0]! + vst2.8 {d11, d13}, [r0]! + + blo blit_less_than_16_left + b blit_neon_loop +#endif + +blit_less_than_16_left: + pld [r1] + + mov r8, #0x10 + mov r9, #0x20 + mov r10, #0xFF + +#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__) + adds r2, r2, #14 +#else + subs r2, r2, #2 +#endif + + pld [r0] + blo 9f + + // The main loop is unrolled thrice and process 6 pixels +8: ldmia r1!, {r4, r5} + // stream the source + pld [r1, #32] + add r0, r0, #4 + // it's all zero, skip this pixel + orrs r3, r4, r5 + beq 7f + + // load the destination + ldr r3, [r0, #-4] + // stream the destination + pld [r0, #32] + pixel r3, r4, r12, 0 + pixel r3, r5, r12, 16 + // effectively, we're getting write-combining by virtue of the + // cpu's write-back cache. + str r12, [r0, #-4] + + // 2nd iteration of the loop, don't stream anything + subs r2, r2, #2 + blt 9f + ldmia r1!, {r4, r5} + add r0, r0, #4 + orrs r3, r4, r5 + beq 7f + ldr r3, [r0, #-4] + pixel r3, r4, r12, 0 + pixel r3, r5, r12, 16 + str r12, [r0, #-4] + + // 3rd iteration of the loop, don't stream anything + subs r2, r2, #2 + blt 9f + ldmia r1!, {r4, r5} + add r0, r0, #4 + orrs r3, r4, r5 + beq 7f + ldr r3, [r0, #-4] + pixel r3, r4, r12, 0 + pixel r3, r5, r12, 16 + str r12, [r0, #-4] + +7: subs r2, r2, #2 + blo 9f + b 8b + +9: adds r2, r2, #1 + ldmlofd sp!, {r4-r10, lr} // return + bxlo lr + + // last pixel left + ldr r4, [r1], #4 + ldrh r3, [r0] + pixel r3, r4, r12, 0 + strh r12, [r0], #2 + ldmfd sp!, {r4-r10, lr} // return + bx lr diff --git a/src/opts/S32A_Opaque_BlitRow32_arm.S b/src/opts/S32A_Opaque_BlitRow32_arm.S new file mode 100644 index 0000000..0ecfa1d --- /dev/null +++ b/src/opts/S32A_Opaque_BlitRow32_arm.S @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2005-2008, The Android Open Source Project + * Copyright (c) 2010, Code Aurora Forum. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + .text + + .global S32A_Opaque_BlitRow32_arm + .func S32A_Opaque_BlitRow32_arm + +S32A_Opaque_BlitRow32_arm: + + push {r4-r11} +#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__) + + cmp r2,#24 + blt .Lless_than_24 + + vpush {Q4-Q7} + + vmov.i16 q14,#0x100 //Q14.16 = 256 +//prefix + vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + //update source ptr but not dst ptr + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + add r3, r0, #32 // minus 16 to pretend the last round + mov r5, #64 + sub r2,r2,#8 +.Lloop: + pld [r1, #256] + pld [r0, #256] + sub r2,r2,#16 + vsubw.u8 q4,q14,d3 //Q4.16 = 256-d3 + + //It has to be 24 since we pre-load 8 word for the next rounds + cmp r2,#16 + + vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4 + vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7 + + + vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4 + vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4 + + vld4.8 {d20, d21, d22, d23}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + + vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4 + + vld4.8 {d24, d25, d26, d27}, [r3] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d4,d4,d0 //d4 = d4+d0 + vadd.i8 d5,d5,d1 //d5 = d5+d1 + vadd.i8 d6,d6,d2 //d6 = d6+d2 + vadd.i8 d7,d7,d3 //d7 = d7+d3 + + vst4.8 {d4, d5, d6, d7}, [r0], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + //add r0, r0, r5 + + //The next 4 words + vsubW.u8 q4,q14,d23 //Q4.16 = 256-d3 + + vmovl.u8 q6,d24 //Q6 = vmovl.u8 d4 + vmovl.u8 q7,d25 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d26 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d27 //Q9 = vmovl.u8 d7 + + vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4 + vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4 + + vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + + vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4 + + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + vshrn.i16 d24,q6,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d25,q7,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d26,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d27,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d24,d24,d20 //d4 = d4+d0 + vadd.i8 d25,d25,d21 //d5 = d5+d1 + vadd.i8 d26,d26,d22 //d6 = d6+d2 + vadd.i8 d27,d27,d23 //d7 = d7+d3 + + vst4.8 {d24, d25, d26, d27}, [r3], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + //add r3, r3, r5 + + bge .Lloop + +//There are 8 words left unprocessed from previous round + vsubw.u8 q4,q14,d3 //Q4.16 = 256-d3 + + cmp r2,#8 + + vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4 + vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7 + + vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4 + vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4 + vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4 + + vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d4,d4,d0 //d4 = d4+d0 + vadd.i8 d5,d5,d1 //d5 = d5+d1 + vadd.i8 d6,d6,d2 //d6 = d6+d2 + vadd.i8 d7,d7,d3 //d7 = d7+d3 + + vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + +.Lless_than_16: + cmp r2,#8 + blt .Lless_than_8 + + sub r2,r2,#8 + + vld4.8 {d0, d1, d2, d3}, [r1]! 
//d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + //update source ptr but not dst ptr + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + vsubw.u8 q4,q14,d3 //Q4.16 = 256-d3 + + vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4 + vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7 + + vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4 + vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4 + vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4 + + vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d4,d4,d0 //d4 = d4+d0 + vadd.i8 d5,d5,d1 //d5 = d5+d1 + vadd.i8 d6,d6,d2 //d6 = d6+d2 + vadd.i8 d7,d7,d3 //d7 = d7+d3 + + vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + +.Lless_than_8: + vpop {Q4-Q7} + +.Lless_than_4: + cmp r2, #1 + bmi .Lexit + b .Lresidual_loop + +.Lless_than_24: + cmp r2,#8 + blt .Lless_than_4 + +.Lloop_8: + sub r2,r2,#8 + // We already read the 8 words from the previous pipe line + vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + //update source ptr but not dst ptr + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + vmov.i16 q10,#0x100 //Q4.16 = 256 + vsubW.u8 q10,q10,d3 //Q4.16 = 256-d3 + + cmp r2,#8 + + vmovl.u8 q12,d4 //Q6 = vmovl.u8 d4 + vmovl.u8 q13,d5 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7 + + vmul.i16 q12,q12,q10 //Q6 = Q6 * Q4 + vmul.i16 q13,q13,q10 //Q7 = Q7 * Q4 + vmul.i16 q8,q8,q10 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q10 //Q9 = Q9 * Q4 + + vshrn.i16 d4,q12,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d5,q13,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d4,d4,d0 //d4 = d4+d0 + vadd.i8 d5,d5,d1 //d5 = d5+d1 + vadd.i8 d6,d6,d2 //d6 = d6+d2 + vadd.i8 d7,d7,d3 //d7 = d7+d3 + + vst4.8 {d4, d5, d6, d7}, [r0]! 
//dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + + bge .Lloop_8 + b .Lless_than_4 + +#endif + +/* + * r0 - dst + * r1 - src + * r2 - count + */ +.Lresidual_loop: + mov r10, #0xFF + orr r10, r10, r10, lsl #16 //mask = r10 = 0x00FF00FF + + subs r2, r2, #2 + blt .Lblitrow32_single_loop + +.Lblitrow32_double_loop: + ldm r0, {r3, r4} + ldm r1!, {r5, r6} + + orrs r9, r3, r4 + beq .Lblitrow32_loop_cond + + // First iteration + lsr r7, r5, #24 //extract alpha + and r8, r3, r10 //rb = (dst & mask) + rsb r7, r7, #256 //r5 = scale = (255-alpha)+1 + and r9, r10, r3, lsr #8 //ag = (dst>>8) & mask + + mul r11, r8, r7 //RB = rb * scale + mul r3, r9, r7 //AG = ag * scale + + // combine RB and AG + and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask + and r3, r3, r10, lsl #8 //r9 = AG & ~mask + + lsr r7, r6, #24 //extract alpha for second iteration + orr r3, r3, r11 + + // Second iteration + and r8, r4, r10 //rb = (dst & mask) + rsb r7, r7, #256 //r5 = scale = (255-alpha)+1 + and r9, r10, r4, lsr #8 //ag = (dst>>8) & mask + + mul r11, r8, r7 //RB = rb * scale + mul r4, r9, r7 //AG = ag * scale + + // combine RB and AG + and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask + and r4, r4, r10, lsl #8 //r9 = AG & ~mask + orr r4, r4, r11 + + // add src to combined value + add r5, r5, r3 + add r6, r6, r4 + +.Lblitrow32_loop_cond: + subs r2, r2, #2 + stm r0!, {r5, r6} + + bge .Lblitrow32_double_loop + +.Lblitrow32_single_loop: + adds r2, #1 + blo .Lexit + + ldr r3, [r0] + ldr r5, [r1], #4 + + cmp r3, #0 + beq .Lblitrow32_single_store + + lsr r7, r5, #24 //extract alpha + and r8, r3, r10 //rb = (dst & mask) + rsb r7, r7, #256 //r5 = scale = (255-alpha)+1 + and r9, r10, r3, lsr #8 //ag = (dst>>8) & mask + + mul r8, r8, r7 //RB = rb * scale + mul r9, r9, r7 //AG = ag * scale + + // combine RB and AG + and r8, r10, r8, lsr #8 //r8 = (RB>>8) & mask + and r9, r9, r10, lsl #8 //r9 = AG & ~mask + orr r3, r8, r9 + + add r5, r5, r3 //add src to combined value + +.Lblitrow32_single_store: + str r5, [r0], #4 + +.Lexit: + pop {r4-r11} + bx lr diff --git a/src/opts/S32_Opaque_D32_nofilter_DX_gether_arm.S b/src/opts/S32_Opaque_D32_nofilter_DX_gether_arm.S new file mode 100644 index 0000000..3467432 --- /dev/null +++ b/src/opts/S32_Opaque_D32_nofilter_DX_gether_arm.S @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2005-2008, The Android Open Source Project + * Copyright (c) 2010, Code Aurora Forum. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + .text + .global S32_Opaque_D32_nofilter_DX_gether + .func S32_Opaque_D32_nofilter_DX_gether +S32_Opaque_D32_nofilter_DX_gether: + push {r0-r11,lr} + asr r0,r2,#3 + sub sp,sp,#4 //23 + cmp r0,#0 + str r0,[sp,#0] //r0 = count >> 3 + ble .L1_140 + ldr r4,[sp,#4] //r4 = r0 (dst) + mov r0,r3 + add r12,r3,#4 + asr r8,r2,#3 +.L1_52: + ldm r3!, {r0,r6,r9,r11} + lsr r5,r0,#16 //30 + ldr r5,[r1,r5,lsl #2] //30 + lsr r7,r6,#16 //32 + ldr r7,[r1,r7,lsl #2] //31 + uxth r0,r0 //34 + ldr r0,[r1,r0,lsl #2] //34 + uxth r6,r6 //31 + ldr r6,[r1,r6,lsl #2] //32 + //stm r4!, {r0,r5,r6,r7} ;35 + lsr r10,r9,#16 //30 + ldr r10,[r1,r10,lsl #2] //30 + lsr lr,r11,#16 //32 + ldr lr,[r1,lr,lsl #2] //31 + uxth r9,r9 //34 + ldr r9,[r1,r9,lsl #2] //34 + uxth r11,r11 //31 + ldr r11,[r1,r11,lsl #2] //32 + subs r8,r8,#1 + stm r4!, {r0,r5,r6,r7,r9,r10,r11,lr} //35 + + bne .L1_52 + + ldr r0,[sp,#0] // count >> 3 + mov r12,r0 + ldr r0,[sp,#4] //r0 = dst + add r0,r0,r12,lsl #5 //dst += count >>3 << 5 + str r0,[sp,#4] //save r0 into stack again +.L1_140: +//;;39 const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy); +//;;40 for (i = (count & 7); i > 0; --i) { + tst r2,#7 + beq .L1_184 + ldr r0,[sp,#4] //r0 = currnt dst + and r2,r2,#7 +.L1_156: +//;;41 //SkASSERT(*xx < (unsigned)s.fBitmap->width()); +//;;42 src = srcAddr[*xx++]; *colors++ = RETURNDST(src); + ldrh r4,[r3],#2 + add r12,r0,#4 +//;;43 } + subs r2,r2,#1 + ldr r4,[r1,r4,lsl #2] //42 + str r4,[r0,#0] //42 + mov r0,r12 //42 + bne .L1_156 +.L1_184: +//;;44 } + add sp,sp,#0x14 + pop {r4-r11,pc} + +.endfunc +.size S32_Opaque_D32_nofilter_DX_gether, .-S32_Opaque_D32_nofilter_DX_gether diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp index 20d62e1..f7b89a9 100644 --- a/src/opts/SkBitmapProcState_opts_arm.cpp +++ b/src/opts/SkBitmapProcState_opts_arm.cpp @@ -11,6 +11,11 @@ #include "SkColorPriv.h" #include "SkUtils.h" +#if defined(__ARM_HAVE_NEON) +#include <arm_neon.h> +#endif + + #if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) void SI8_D16_nofilter_DX_arm( const SkBitmapProcState& s, @@ -184,11 +189,201 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s, } #endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) + +#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) +void S16_opaque_D32_nofilter_DX_neon_asm(const SkBitmapProcState& s, + const uint32_t* __restrict__ xy, + int count, uint32_t* __restrict__ colors) { + + const uint16_t* __restrict__ srcAddr = (const uint16_t*)s.fBitmap->getPixels(); + + uint16_t* index; + uint16_t src; + int i; + + srcAddr = (const uint16_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes()); + + const uint16_t* __restrict__ xx = (const uint16_t*)(++xy); + + if (1 == s.fBitmap->width()) { + + src = srcAddr[0]; + uint32_t dstValue = SkPixel16ToPixel32(src); + sk_memset32(colors, dstValue, count); + } else if ((xx[count-1] - xx[0]) == (count-1)) { + // No scaling + const uint16_t* src_data = (const uint16_t*)(srcAddr + xx[0]); + asm volatile ( + "subs %[count], %[count], #8 \n\t" // count -= 8, set flag + "blt 2f \n\t" // if count < 0, branch to label 2 + "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use. + "1: \n\t" // 8 loop + // Handle 8 pixels in one loop. + + "vld1.u16 {q0}, [%[src_data]]! 
\n\t" // load eight src 565 pixels + + "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2 + "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3 + "vmov.u16 q1, q8 \n\t" // copy alpha from q8 + "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1 + "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1 + "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2 + "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2 + "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2 + "vzip.16 q2, q1 \n\t" // interleave q1 and q2 + "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst + "subs %[count], %[count], #8 \n\t" // count -= 8, set flag + "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q1 to dst + + "bge 1b \n\t" // loop if count >= 0 + "2: \n\t" // exit of 8 loop + + "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed. + "blt 3f \n\t" // if count < 0, branch to label 3 + + // Handle 4 pixels at once + + "vld1.u16 {d0}, [%[src_data]]! \n\t" // load eight src 565 pixels + + "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2 + "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1 + "vmov.u16 d3, d16 \n\t" // copy alpha from d16 + "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3 + "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3 + "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2 + "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2 + "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2 + "vzip.16 d2, d3 \n\t" // interleave d2 and d3 + "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst + + "3: \n\t" // end + : [src_data] "+r" (src_data), [colors] "+r" (colors), [count] "+r" (count) + : + : "cc", "memory","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17" + ); + + for (i = (count & 3); i > 0; --i) { + *colors++ = SkPixel16ToPixel32(*src_data++); + } + + } else { + // Scaling case + uint16_t data[8]; + + asm volatile ( + "subs %[count], %[count], #8 \n\t" // count -= 8, set flag + "blt 2f \n\t" // if count < 0, branch to label 2 + "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use. + "1: \n\t" // 8 loop + // Handle 8 pixels in one loop. 
+ "ldmia %[xx]!, {r4, r5, r6, r7} \n\t" // load ptrs to pixels 0-7 + + "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer + "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer + "mov r6, r6, lsl #1 \n\t" // <<1 because of 16 bit pointer + "mov r7, r7, lsl #1 \n\t" // <<1 because of 16 bit pointer + + "uxth r8, r4 \n\t" // extract ptr 0 + "mov r4, r4, lsr #16 \n\t" // extract ptr 1 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image + "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image + "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register + + "uxth r8, r5 \n\t" // extract ptr 2 + "mov r5, r5, lsr #16 \n\t" // extract ptr 3 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image + "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image + "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register + + "uxth r8, r6 \n\t" // extract ptr 4 + "mov r6, r6, lsr #16 \n\t" // extract ptr 5 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image + "ldrh r6, [%[srcAddr], r6] \n\t" // load pixel 5 from image + "pkhbt r6, r8, r6, lsl #16 \n\t" // combine pixel 4 and 5 in one register + + "uxth r8, r7 \n\t" // extract ptr 6 + "mov r7, r7, lsr #16 \n\t" // extract ptr 7 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 6 from image + "ldrh r7, [%[srcAddr], r7] \n\t" // load pixel 7 from image + "pkhbt r7, r8, r7, lsl #16 \n\t" // combine pixel 6 and 7 in one register + + "stmia %[data], {r4, r5, r6, r7} \n\t" // store 8 src pixels + + "vld1.u16 {q0}, [%[data]] \n\t" // load eight src 565 pixels + + "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2 + "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3 + "vmov.u16 q1, q8 \n\t" // copy alpha from q8 + "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1 + "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1 + "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2 + "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2 + "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2 + "vzip.16 q2, q1 \n\t" // interleave q1 and q2 + "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst + "subs %[count], %[count], #8 \n\t" // count -= 8, set flag + "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q2 to dst + + "bge 1b \n\t" // loop if count >= 0 + "2: \n\t" // exit of 8 loop + + "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed. 
+ "blt 3f \n\t" // if count < 0, branch to label 3 + + // Handle 4 pixels at once + "ldmia %[xx]!, {r4, r5} \n\t" // load ptrs to pixels 0-3 + + "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer + "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer + + "uxth r8, r4 \n\t" // extract ptr 0 + "mov r4, r4, lsr #16 \n\t" // extract ptr 1 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image + "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image + "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register + + "uxth r8, r5 \n\t" // extract ptr 2 + "mov r5, r5, lsr #16 \n\t" // extract ptr 3 + "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image + "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image + "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register + + "stmia %[data], {r4, r5} \n\t" // store 4 src pixels + + "vld1.u16 {d0}, [%[data]] \n\t" // load eight src 565 pixels + + "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2 + "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1 + "vmov.u16 d3, d16 \n\t" // copy alpha from d16 + "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3 + "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3 + "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2 + "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2 + "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2 + "vzip.16 d2, d3 \n\t" // interleave d2 and d3 + "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst + + "3: \n\t" // End + : [xx] "+r" (xx), [colors] "+r" (colors), [count] "+r" (count) + : [data] "r" (data), [srcAddr] "r" (srcAddr) + : "cc", "memory","r4","r5","r6","r7","r8","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17" + ); + + for (i = (count & 3); i > 0; --i) { + src = srcAddr[*xx++]; *colors++ = SkPixel16ToPixel32(src); + } + } +} +#endif + + /////////////////////////////////////////////////////////////////////////////// /* If we replace a sampleproc, then we null-out the associated shaderproc, otherwise the shader won't even look at the matrix/sampler */ + + void SkBitmapProcState::platformProcs() { bool doFilter = fDoFilter; bool isOpaque = 256 == fAlphaScale; @@ -214,6 +409,15 @@ void SkBitmapProcState::platformProcs() { } #endif break; + case SkBitmap::kRGB_565_Config: +#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) + if (justDx && !doFilter) { + if (isOpaque) { + fSampleProc32 = S16_opaque_D32_nofilter_DX_neon_asm; + } + } +#endif + break; default: break; } diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index dd8e406..361acbe 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -1,20 +1,33 @@ /* - * Copyright 2009 The Android Open Source Project + * Copyright 2012 The Android Open Source Project * * Use of this source code is governed by a BSD-style license that can be * found in the LICENSE file. 
*/ -#include "SkBlitRow.h" #include "SkBlitMask.h" +#include "SkBlitRow.h" #include "SkColorPriv.h" #include "SkDither.h" +#include "SkUtils.h" + +#include "SkCachePreload_arm.h" #if defined(__ARM_HAVE_NEON) #include <arm_neon.h> #endif +extern "C" void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, + U8CPU alpha); + +extern "C" void S32A_Blend_BlitRow32_arm_neon(SkPMColor* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, + U8CPU alpha); + #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, @@ -29,6 +42,10 @@ static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, "vmov.u8 d31, #1<<7 \n\t" "vld1.16 {q12}, [%[dst]] \n\t" "vld4.8 {d0-d3}, [%[src]] \n\t" + // Thumb does not support the standard ARM conditional + // instructions but instead requires the 'it' instruction + // to signal conditional execution + "it eq \n\t" "moveq ip, #8 \n\t" "mov %[keep_dst], %[dst] \n\t" @@ -470,15 +487,19 @@ static void S32A_D565_Opaque_v7(uint16_t* SK_RESTRICT dst, : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip" ); } -#define S32A_D565_Opaque_PROC S32A_D565_Opaque_v7 #define S32A_D565_Blend_PROC NULL #define S32_D565_Blend_Dither_PROC NULL #else -#define S32A_D565_Opaque_PROC NULL #define S32A_D565_Blend_PROC NULL #define S32_D565_Blend_Dither_PROC NULL #endif +/* + * Use neon version of BLIT assembly code from S32A_D565_Opaque_arm.S, where we process + * 16 pixels at-a-time and also optimize for alpha=255 case. + */ +#define S32A_D565_Opaque_PROC NULL + /* Don't have a special version that assumes each src is opaque, but our S32A is still faster than the default, so use it here */ @@ -663,6 +684,10 @@ TAIL: #elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) +/* + * User S32A_Opaque_BlitRow32 function from S32A_Opaque_BlitRow32.S + */ +#if 0 static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha) { @@ -786,6 +811,13 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, } #define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon +#endif + +/* + * Use asm version of BlitRow function. Neon instructions are + * used for armv7 targets. + */ +#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm #elif defined (__ARM_ARCH__) /* #if defined(__ARM_HAVE_NEON) && defined... */ @@ -1799,6 +1831,105 @@ static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, #define S32_D565_Opaque_Dither_PROC NULL #endif +#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) +static void Color32_neon(SkPMColor* dst, const SkPMColor* src, int count, + SkPMColor color) { + if (count <= 0) { + return; + } + + if (0 == color) { + if (src != dst) { + memcpy(dst, src, count * sizeof(SkPMColor)); + } + return; + } + + unsigned colorA = SkGetPackedA32(color); + if (255 == colorA) { + sk_memset32(dst, color, count); + } else { + unsigned scale = 256 - SkAlpha255To256(colorA); + + if (count >= 8) { + // at the end of this assembly, count will have been decremented + // to a negative value. That is, if count mod 8 = x, it will be + // -8 +x coming out. 
+ asm volatile ( + PLD128(src, 0) + + "vdup.32 q0, %[color] \n\t" + + PLD128(src, 128) + + // scale numerical interval [0-255], so load as 8 bits + "vdup.8 d2, %[scale] \n\t" + + PLD128(src, 256) + + "subs %[count], %[count], #8 \n\t" + + PLD128(src, 384) + + "Loop_Color32: \n\t" + + // load src color, 8 pixels, 4 64 bit registers + // (and increment src). + "vld1.32 {d4-d7}, [%[src]]! \n\t" + + PLD128(src, 384) + + // multiply long by scale, 64 bits at a time, + // destination into a 128 bit register. + "vmull.u8 q4, d4, d2 \n\t" + "vmull.u8 q5, d5, d2 \n\t" + "vmull.u8 q6, d6, d2 \n\t" + "vmull.u8 q7, d7, d2 \n\t" + + // shift the 128 bit registers, containing the 16 + // bit scaled values back to 8 bits, narrowing the + // results to 64 bit registers. + "vshrn.i16 d8, q4, #8 \n\t" + "vshrn.i16 d9, q5, #8 \n\t" + "vshrn.i16 d10, q6, #8 \n\t" + "vshrn.i16 d11, q7, #8 \n\t" + + // adding back the color, using 128 bit registers. + "vadd.i8 q6, q4, q0 \n\t" + "vadd.i8 q7, q5, q0 \n\t" + + // store back the 8 calculated pixels (2 128 bit + // registers), and increment dst. + "vst1.32 {d12-d15}, [%[dst]]! \n\t" + + "subs %[count], %[count], #8 \n\t" + "bge Loop_Color32 \n\t" + : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) + : [color] "r" (color), [scale] "r" (scale) + : "cc", "memory", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" + ); + // At this point, if we went through the inline assembly, count is + // a negative value: + // if the value is -8, there is no pixel left to process. + // if the value is -7, there is one pixel left to process + // ... + // And'ing it with 7 will give us the number of pixels + // left to process. + count = count & 0x7; + } + + while (count > 0) { + *dst = color + SkAlphaMulQ(*src, scale); + src += 1; + dst += 1; + count--; + } + } +} +#endif + /////////////////////////////////////////////////////////////////////////////// static const SkBlitRow::Proc platform_565_procs[] = { @@ -1848,12 +1979,15 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { return platform_32_procs[flags]; } +/////////////////////////////////////////////////////////////////////////////// SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() { +#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) + return Color32_neon; +#else return NULL; +#endif } -/////////////////////////////////////////////////////////////////////////////// - SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, SkMask::Format maskFormat, SkColor color) { diff --git a/src/opts/SkCachePreload_arm.h b/src/opts/SkCachePreload_arm.h new file mode 100644 index 0000000..cff8c2a --- /dev/null +++ b/src/opts/SkCachePreload_arm.h @@ -0,0 +1,34 @@ +/* + * Copyright 2012 The Android Open Source Project + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ +#ifndef SkCachePreload_arm_DEFINED +#define SkCachePreload_arm_DEFINED + +// This file defines macros for preload instructions for ARM. These macros +// are designed to be embedded inside GNU inline assembly. +// For the use of these macros, __ARM_USE_PLD needs to be enabled. The cache +// line size also needs to be known (and needs to be contained inside +// __ARM_CACHE_LINE_SIZE). 
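+//
+// Example (assuming __ARM_USE_PLD is defined and __ARM_CACHE_LINE_SIZE is 32):
+// PLD128(src, 0) expands to four pld instructions at offsets 0, 32, 64 and 96
+// from the pointer bound to the %[src] operand of the surrounding inline
+// assembly, preloading the next 128 bytes. When __ARM_USE_PLD is not defined,
+// PLD, PLD64 and PLD128 all expand to nothing.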
+#if defined(__ARM_USE_PLD) + +#define PLD(x, n) "pld [%["#x"], #("#n")]\n\t" + +#if __ARM_CACHE_LINE_SIZE == 32 + #define PLD64(x, n) PLD(x, n) PLD(x, (n) + 32) +#elif __ARM_CACHE_LINE_SIZE == 64 + #define PLD64(x, n) PLD(x, n) +#else + #error "unknown __ARM_CACHE_LINE_SIZE." +#endif +#else + // PLD is disabled, all macros become empty. + #define PLD(x, n) + #define PLD64(x, n) +#endif + +#define PLD128(x, n) PLD64(x, n) PLD64(x, (n) + 64) + +#endif // SkCachePreload_arm_DEFINED diff --git a/src/ports/SkFontHost_FreeType.cpp b/src/ports/SkFontHost_FreeType.cpp index 6c03bf9..ea24a6e 100644 --- a/src/ports/SkFontHost_FreeType.cpp +++ b/src/ports/SkFontHost_FreeType.cpp @@ -12,6 +12,7 @@ #include "SkColorPriv.h" #include "SkDescriptor.h" #include "SkFDot6.h" +#include "SkFloatingPoint.h" #include "SkFontHost.h" #include "SkMask.h" #include "SkAdvancedTypefaceMetrics.h" @@ -55,6 +56,7 @@ //#define DUMP_STRIKE_CREATION //#define SK_GAMMA_APPLY_TO_A8 +//#define SK_GAMMA_SRGB #ifndef SK_GAMMA_CONTRAST #define SK_GAMMA_CONTRAST 0x66 @@ -130,8 +132,8 @@ InitFreetype() { // Setup LCD filtering. This reduces colour fringes for LCD rendered // glyphs. #ifdef FT_LCD_FILTER_H -// err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_DEFAULT); - err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_LIGHT); + err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_DEFAULT); +// err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_LIGHT); gLCDSupport = err == 0; if (gLCDSupport) { gLCDExtra = 2; //DEFAULT and LIGHT add one pixel to each side. @@ -640,6 +642,13 @@ static bool isAxisAligned(const SkScalerContext::Rec& rec) { } void SkFontHost::FilterRec(SkScalerContext::Rec* rec) { + //BOGUS: http://code.google.com/p/chromium/issues/detail?id=121119 + //Cap the requested size as larger sizes give bogus values. + //Remove when http://code.google.com/p/skia/issues/detail?id=554 is fixed. 
+ if (rec->fTextSize > SkIntToScalar(1 << 14)) { + rec->fTextSize = SkIntToScalar(1 << 14); + } + if (!gLCDSupportValid) { InitFreetype(); FT_Done_FreeType(gFTLibrary); @@ -656,7 +665,7 @@ void SkFontHost::FilterRec(SkScalerContext::Rec* rec) { // collapse full->normal hinting if we're not doing LCD h = SkPaint::kNormal_Hinting; } - if ((rec->fFlags & SkScalerContext::kSubpixelPositioning_Flag) || isLCD(*rec)) { + if ((rec->fFlags & SkScalerContext::kSubpixelPositioning_Flag)) { if (SkPaint::kNo_Hinting != h) { h = SkPaint::kSlight_Hinting; } @@ -781,7 +790,7 @@ SkScalerContext_FreeType::SkScalerContext_FreeType(const SkDescriptor* desc) fUseVertMetrics = false; { FT_Int32 loadFlags = FT_LOAD_DEFAULT; - bool linearMetrics = false; + bool linearMetrics = SkToBool(fRec.fFlags & SkScalerContext::kSubpixelPositioning_Flag); if (SkMask::kBW_Format == fRec.fMaskFormat) { // See http://code.google.com/p/chromium/issues/detail?id=43252#c24 @@ -798,7 +807,6 @@ SkScalerContext_FreeType::SkScalerContext_FreeType(const SkDescriptor* desc) break; case SkPaint::kSlight_Hinting: loadFlags = FT_LOAD_TARGET_LIGHT; // This implies FORCE_AUTOHINT - linearMetrics = true; break; case SkPaint::kNormal_Hinting: if (fRec.fFlags & SkScalerContext::kAutohinting_Flag) @@ -1115,16 +1123,17 @@ void SkScalerContext_FreeType::generateMetrics(SkGlyph* glyph) { goto ERROR; } - if ((fRec.fFlags & SkScalerContext::kSubpixelPositioning_Flag) == 0) { + if (fDoLinearMetrics) { + glyph->fAdvanceX = SkFixedMul(fMatrix22.xx, fFace->glyph->linearHoriAdvance); + glyph->fAdvanceY = -SkFixedMul(fMatrix22.yx, fFace->glyph->linearHoriAdvance); + } else { glyph->fAdvanceX = SkFDot6ToFixed(fFace->glyph->advance.x); glyph->fAdvanceY = -SkFDot6ToFixed(fFace->glyph->advance.y); + if (fRec.fFlags & kDevKernText_Flag) { glyph->fRsbDelta = SkToS8(fFace->glyph->rsb_delta); glyph->fLsbDelta = SkToS8(fFace->glyph->lsb_delta); } - } else { - glyph->fAdvanceX = SkFixedMul(fMatrix22.xx, fFace->glyph->linearHoriAdvance); - glyph->fAdvanceY = -SkFixedMul(fMatrix22.yx, fFace->glyph->linearHoriAdvance); } if (fUseVertMetrics) { @@ -1200,49 +1209,71 @@ void SkScalerContext_FreeType::generateMetrics(SkGlyph* glyph) { /////////////////////////////////////////////////////////////////////////////// -static int apply_contrast(int srca, int contrast) { - return srca + (((255 - srca) * contrast * srca) / (255*255)); +#ifdef SK_USE_COLOR_LUMINANCE + +static float apply_contrast(float srca, float contrast) { + return srca + ((1.0f - srca) * contrast * srca); } -static void build_power_table(uint8_t table[], float ee) { - for (int i = 0; i < 256; i++) { - float x = i / 255.f; - x = powf(x, ee); - int xx = SkScalarRoundToInt(SkFloatToScalar(x * 255)); - table[i] = SkToU8(xx); +#ifdef SK_GAMMA_SRGB +static float lin(float per) { + if (per <= 0.04045f) { + return per / 12.92f; } + return powf((per + 0.055f) / 1.055, 2.4f); } - -static void build_gamma_table(uint8_t table[256], int src, int dst) { - static bool gInit; - static uint8_t powTable[256], invPowTable[256]; - if (!gInit) { - const float g = SK_GAMMA_EXPONENT; - build_power_table(powTable, g); - build_power_table(invPowTable, 1/g); - gInit = true; +static float per(float lin) { + if (lin <= 0.0031308f) { + return lin * 12.92f; } + return 1.055f * powf(lin, 1.0f / 2.4f) - 0.055f; +} +#else //SK_GAMMA_SRGB +static float lin(float per) { + const float g = SK_GAMMA_EXPONENT; + return powf(per, g); +} +static float per(float lin) { + const float g = SK_GAMMA_EXPONENT; + return powf(lin, 1.0f / g); +} +#endif 
//SK_GAMMA_SRGB - const int linSrc = powTable[src]; - const int linDst = powTable[dst]; - // have our contrast value taper off to 0 as the src luminance becomes white - const int contrast = SK_GAMMA_CONTRAST * (255 - linSrc) / 255; - - for (int i = 0; i < 256; ++i) { - int srca = apply_contrast(i, contrast); - SkASSERT((unsigned)srca <= 255); - int dsta = 255 - srca; - - //Calculate the output we want. - int linOut = (linSrc * srca + dsta * linDst) / 255; - SkASSERT((unsigned)linOut <= 255); - int out = invPowTable[linOut]; - - //Undo what the blit blend will do. - int result = ((255 * out) - (255 * dst)) / (src - dst); - SkASSERT((unsigned)result <= 255); +static void build_gamma_table(uint8_t table[256], int srcI) { + const float src = (float)srcI / 255.0f; + const float linSrc = lin(src); + const float linDst = 1.0f - linSrc; + const float dst = per(linDst); - table[i] = result; + // have our contrast value taper off to 0 as the src luminance becomes white + const float contrast = SK_GAMMA_CONTRAST / 255.0f * linDst; + const float step = 1.0f / 256.0f; + + //Remove discontinuity and instability when src is close to dst. + if (fabs(src - dst) < 0.01f) { + float rawSrca = 0.0f; + for (int i = 0; i < 256; ++i, rawSrca += step) { + float srca = apply_contrast(rawSrca, contrast); + table[i] = sk_float_round2int(255.0f * srca); + } + } else { + float rawSrca = 0.0f; + for (int i = 0; i < 256; ++i, rawSrca += step) { + float srca = apply_contrast(rawSrca, contrast); + SkASSERT(srca <= 1.0f); + float dsta = 1 - srca; + + //Calculate the output we want. + float linOut = (linSrc * srca + dsta * linDst); + SkASSERT(linOut <= 1.0f); + float out = per(linOut); + + //Undo what the blit blend will do. + float result = (out - dst) / (src - dst); + SkASSERT(sk_float_round2int(255.0f * result) <= 255); + + table[i] = sk_float_round2int(255.0f * result); + } } } @@ -1250,10 +1281,10 @@ static const uint8_t* getGammaTable(U8CPU luminance) { static uint8_t gGammaTables[4][256]; static bool gInited; if (!gInited) { - build_gamma_table(gGammaTables[0], 0x00, 0xFF); - build_gamma_table(gGammaTables[1], 0x66, 0x99); - build_gamma_table(gGammaTables[2], 0x99, 0x66); - build_gamma_table(gGammaTables[3], 0xFF, 0x00); + build_gamma_table(gGammaTables[0], 0x00); + build_gamma_table(gGammaTables[1], 0x55); + build_gamma_table(gGammaTables[2], 0xAA); + build_gamma_table(gGammaTables[3], 0xFF); gInited = true; } @@ -1261,7 +1292,7 @@ static const uint8_t* getGammaTable(U8CPU luminance) { return gGammaTables[luminance >> 6]; } -#ifndef SK_USE_COLOR_LUMINANCE +#else //SK_USE_COLOR_LUMINANCE static const uint8_t* getIdentityTable() { static bool gOnce; static uint8_t gIdentityTable[256]; @@ -1273,7 +1304,7 @@ static const uint8_t* getIdentityTable() { } return gIdentityTable; } -#endif +#endif //SK_USE_COLOR_LUMINANCE static uint16_t packTriple(unsigned r, unsigned g, unsigned b) { return SkPackRGB16(r >> 3, g >> 2, b >> 3); diff --git a/src/ports/SkFontHost_linux.cpp b/src/ports/SkFontHost_linux.cpp index be99576..64fa2a3 100644 --- a/src/ports/SkFontHost_linux.cpp +++ b/src/ports/SkFontHost_linux.cpp @@ -464,53 +464,43 @@ static void load_system_fonts() { /////////////////////////////////////////////////////////////////////////////// void SkFontHost::Serialize(const SkTypeface* face, SkWStream* stream) { -#if 0 - const char* name = ((FamilyTypeface*)face)->getUniqueString(); - - stream->write8((uint8_t)face->getStyle()); - - if (NULL == name || 0 == *name) { - stream->writePackedUInt(0); - // SkDebugf("--- fonthost 
serialize null\n"); - } else { - uint32_t len = strlen(name); - stream->writePackedUInt(len); - stream->write(name, len); - // SkDebugf("--- fonthost serialize <%s> %d\n", name, face->getStyle()); - } -#endif - sk_throw(); + SkStream* fontStream = ((FamilyTypeface*)face)->openStream(); + + // store the length of the custom font + uint32_t len = fontStream->getLength(); + stream->write32(len); + + // store the entire font in the serialized stream + void* fontData = malloc(len); + + fontStream->read(fontData, len); + stream->write(fontData, len); + + fontStream->unref(); + free(fontData); + + +// sk_throw(); } SkTypeface* SkFontHost::Deserialize(SkStream* stream) { -#if 0 load_system_fonts(); - - int style = stream->readU8(); - - int len = stream->readPackedUInt(); - if (len > 0) { - SkString str; - str.resize(len); - stream->read(str.writable_str(), len); - - const FontInitRec* rec = gSystemFonts; - for (size_t i = 0; i < SK_ARRAY_COUNT(gSystemFonts); i++) { - if (strcmp(rec[i].fFileName, str.c_str()) == 0) { - // backup until we hit the fNames - for (int j = i; j >= 0; --j) { - if (rec[j].fNames != NULL) { - return SkFontHost::CreateTypeface(NULL, rec[j].fNames[0], NULL, 0, - (SkTypeface::Style)style); - } - } - } - } - } - return SkFontHost::CreateTypeface(NULL, NULL, NULL, 0, (SkTypeface::Style)style); -#endif - sk_throw(); - return NULL; + + // read the length of the custom font from the stream + uint32_t len = stream->readU32(); + + // generate a new stream to store the custom typeface + SkMemoryStream* fontStream = new SkMemoryStream(len); + stream->read((void*)fontStream->getMemoryBase(), len); + + SkTypeface* face = CreateTypefaceFromStream(fontStream); + + fontStream->unref(); + + return face; + +// sk_throw(); +// return NULL; } /////////////////////////////////////////////////////////////////////////////// |