aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/core/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp457
-rw-r--r--src/core/SkBitmapProcShader.cpp23
-rw-r--r--src/core/SkBitmapProcState.cpp31
-rw-r--r--src/core/SkBitmapProcState_matrixProcs.cpp2
-rw-r--r--src/core/SkBitmapProcState_sample.h98
-rw-r--r--src/core/SkBitmapProcState_shaderproc.h107
-rw-r--r--src/core/SkBlitRow_D16.cpp8
-rw-r--r--src/opts/S16_D32_arm.S188
-rw-r--r--src/opts/S32A_Blend_BlitRow32_arm.S396
-rw-r--r--src/opts/S32A_D565_Opaque_arm.S325
-rw-r--r--src/opts/S32A_Opaque_BlitRow32_arm.S311
-rw-r--r--src/opts/S32_Opaque_D32_nofilter_DX_gether_arm.S85
-rw-r--r--src/opts/SkBitmapProcState_opts_arm.cpp204
-rw-r--r--src/opts/SkBlitRow_opts_arm.cpp146
-rw-r--r--src/opts/SkCachePreload_arm.h34
-rw-r--r--src/ports/SkFontHost_FreeType.cpp133
-rw-r--r--src/ports/SkFontHost_linux.cpp76
17 files changed, 2520 insertions, 104 deletions
diff --git a/src/core/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp b/src/core/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp
new file mode 100644
index 0000000..20dc03b
--- /dev/null
+++ b/src/core/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp
@@ -0,0 +1,457 @@
+
+/*
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1,
+ SkFixed fx, unsigned int maxX, unsigned int subY,
+ unsigned int* colors,
+ SkFixed dx, int count) {
+
+ asm volatile(
+ "mov r3, %[count] \n\t" //r3 = count
+
+ "mov r5, %[fx] \n\t" //r5 = x = fx
+ "cmp r3, #0 \n\t"
+ "beq endloop \n\t"
+
+ "vdup.8 d17, %[subY] \n\t" // duplicate y into d17
+ "vmov.u8 d16, #16 \n\t" // set up constant in d16
+ "vsub.u8 d18, d16, d17 \n\t" // d18 = 16-y
+
+ "vmov.u16 d16, #16 \n\t" // set up constant in d16,int 16bit
+
+#define UNROLL8
+#define UNROLL2
+#ifdef UNROLL8
+ "cmp r3, #8 \n\t"
+ "blt initloop2 \n\t"
+ ///////////////loop2 in x
+ "beginloop8: \n\t"
+
+ /////////////////pixel 1////////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 2////////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d24, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d24, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d24, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d24, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 3////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d26, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d26, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d26, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d26, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 4////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d28, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d28, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d28, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d28, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 5////////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d23, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d23, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 6////////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d25, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d25, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d25, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d25, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 7////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d27, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d27, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d27, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d27, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 8////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d29, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d29, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d29, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d29, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// Store results///////////////////
+
+ "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8
+ "vshrn.i16 d1, q12, #8 \n\t" // shift down result by 8
+ "vshrn.i16 d2, q13, #8 \n\t" // shift down result by 8
+ "vshrn.i16 d3, q14, #8 \n\t" // shift down result by 8
+
+ "vst4.u32 {d0, d1, d2, d3}, [%[colors]]! \n\t" // store result
+
+ //////////////// end bilinear interp
+
+ "sub r3, r3, #8 \n\t" //num -=8
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+ "cmp r3, #7 \n\t"
+
+ "bgt beginloop8 \n\t"
+
+ "endloop8: \n\t"
+ ////////////////end loop in x
+#endif //UNROLL8
+
+
+
+#ifdef UNROLL2
+ "initloop2: \n\t"
+ "cmp r3, #2 \n\t"
+ "blt initloop \n\t"
+ ///////////////loop2 in x
+ "beginloop2: \n\t"
+
+
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////second half////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d23, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d23, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x)
+ "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8
+
+ "vst1.u32 {d0}, [%[colors]]! \n\t" // store result
+
+ //////////////// end bilinear interp
+
+ "sub r3, r3, #2 \n\t" //num -=2
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+ "cmp r3, #1 \n\t"
+
+ "bgt beginloop2 \n\t"
+
+ "endloop2: \n\t"
+ ////////////////end loop in x
+#endif //UNROLL2
+
+#if defined (UNROLL2) || defined (UNROLL8)
+ "initloop: \n\t"
+ "cmp r3, #0 \n\t"
+ "ble endloop \n\t"
+#endif //defined (UNROLL2) || defined (UNROLL8)
+
+ ///////////////loop in x
+ "beginloop: \n\t"
+
+
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d4, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d4, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d4, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d4, d0, d20 \n\t" // d4 += a10 * (16-x)
+ "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8
+
+ "vst1.u32 {d0[0]}, [%[colors]]! \n\t" // store result
+
+ //////////////// end bilinear interp
+
+ "sub r3, r3, #1 \n\t" //num -=1
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+ "cmp r3, #0 \n\t"
+ "bgt beginloop \n\t"
+
+ "endloop: \n\t"
+ ////////////////end loop in x
+ : [colors] "+r" (colors)
+ : [image0] "r" (image0), [image1] "r" (image1), [fx] "r" (fx), [maxX] "r" (maxX), [subY] "r" (subY),
+ [dx] "r" (dx), [count] "r" (count)
+ : "cc", "memory", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29"
+ );
+
+
+}
diff --git a/src/core/SkBitmapProcShader.cpp b/src/core/SkBitmapProcShader.cpp
index 6d64716..1201ea4 100644
--- a/src/core/SkBitmapProcShader.cpp
+++ b/src/core/SkBitmapProcShader.cpp
@@ -159,6 +159,18 @@ bool SkBitmapProcShader::setContext(const SkBitmap& device,
#define TEST_BUFFER_EXTRA 0
#endif
+#if defined(__ARM_HAVE_NEON)
+void clampx_nofilter_trans(const SkBitmapProcState& s,
+ uint32_t xy[], int count, int x, int y) ;
+
+void S16_opaque_D32_nofilter_DX(const SkBitmapProcState& s,
+ const uint32_t* SK_RESTRICT xy,
+ int count, uint32_t* SK_RESTRICT colors) ;
+
+void clampx_nofilter_trans_S16_D32_DX(const SkBitmapProcState& s,
+ uint32_t xy[], int count, int x, int y, uint32_t* SK_RESTRICT colors) ;
+#endif
+
void SkBitmapProcShader::shadeSpan(int x, int y, SkPMColor dstC[], int count) {
const SkBitmapProcState& state = fState;
if (state.fShaderProc32) {
@@ -181,6 +193,12 @@ void SkBitmapProcShader::shadeSpan(int x, int y, SkPMColor dstC[], int count) {
n = max;
}
SkASSERT(n > 0 && n < BUF_MAX*2);
+#if defined(__ARM_HAVE_NEON)
+ if( sproc == S16_opaque_D32_nofilter_DX && mproc == clampx_nofilter_trans ){
+ clampx_nofilter_trans_S16_D32_DX(state, buffer, n, x, y, dstC);
+ } else {
+#endif
+
#ifdef TEST_BUFFER_OVERRITE
for (int i = 0; i < TEST_BUFFER_EXTRA; i++) {
buffer[BUF_MAX + i] = TEST_PATTERN;
@@ -193,7 +211,10 @@ void SkBitmapProcShader::shadeSpan(int x, int y, SkPMColor dstC[], int count) {
}
#endif
sproc(state, buffer, n, dstC);
-
+
+#if defined(__ARM_HAVE_NEON)
+ }
+#endif
if ((count -= n) == 0) {
break;
}
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index 3d34b20..a0758c0 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -1,4 +1,3 @@
-
/*
* Copyright 2011 Google Inc.
*
@@ -93,7 +92,11 @@ static inline U8CPU Filter_8(unsigned x, unsigned y,
SkASSERT(state.fAlphaScale == 256)
#define RETURNDST(src) src
#define SRC_TO_FILTER(src) src
+#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
+ #define USE_GETHER32
+#endif
#include "SkBitmapProcState_sample.h"
+#undef USE_GETHER32
#undef FILTER_PROC
#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_alpha(x, y, a, b, c, d, dst, alphaScale)
@@ -124,7 +127,9 @@ static inline U8CPU Filter_8(unsigned x, unsigned y,
SkASSERT(state.fAlphaScale == 256)
#define RETURNDST(src) SkPixel16ToPixel32(src)
#define SRC_TO_FILTER(src) src
+#define USE_S16_OPAQUE
#include "SkBitmapProcState_sample.h"
+#undef USE_S16_OPAQUE
#undef FILTER_PROC
#define FILTER_PROC(x, y, a, b, c, d, dst) \
@@ -340,6 +345,25 @@ static inline U8CPU Filter_8(unsigned x, unsigned y,
#define POSTAMBLE(state) state.fBitmap->getColorTable()->unlockColors(false)
#include "SkBitmapProcState_shaderproc.h"
+#if defined(__ARM_HAVE_NEON)
+#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
+#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
+#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
+#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
+
+#undef FILTER_PROC
+#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_opaque(x, y, a, b, c, d, dst)
+#define MAKENAME(suffix) S32_Opaque_D32 ## suffix
+#define SRCTYPE uint32_t
+#define DSTTYPE uint32_t
+#define SRC_TO_FILTER(src) src
+#include "S32_Opaque_D32_filter_DX_shaderproc_neon.cpp"
+#define S32_OPAQUE_D32_FILTER_DX_NEON
+#include "SkBitmapProcState_shaderproc.h"
+#undef S32_OPAQUE_D32_FILTER_DX_NEON
+#endif //ARM_HAVE_NEON
+
+
///////////////////////////////////////////////////////////////////////////////
static bool valid_for_filtering(unsigned dimension) {
@@ -532,6 +556,11 @@ bool SkBitmapProcState::chooseProcs(const SkMatrix& inv, const SkPaint& paint) {
} else if (SI8_opaque_D32_filter_DX == fSampleProc32 && clamp_clamp) {
fShaderProc32 = Clamp_SI8_opaque_D32_filter_DX_shaderproc;
}
+#if defined(__ARM_HAVE_NEON)
+ else if (S32_opaque_D32_filter_DX == fSampleProc32 && clamp_clamp) {
+ fShaderProc32 = S32_Opaque_D32_filter_DX_shaderproc;
+ }
+#endif //ARM_HAVE_NEON
// see if our platform has any accelerated overrides
this->platformProcs();
diff --git a/src/core/SkBitmapProcState_matrixProcs.cpp b/src/core/SkBitmapProcState_matrixProcs.cpp
index bda2438..583a39b 100644
--- a/src/core/SkBitmapProcState_matrixProcs.cpp
+++ b/src/core/SkBitmapProcState_matrixProcs.cpp
@@ -327,7 +327,7 @@ static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
return SkScalarToFixed(pt.fX) >> 16;
}
-static void clampx_nofilter_trans(const SkBitmapProcState& s,
+void clampx_nofilter_trans(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y) {
SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h
index e6b587f..793283d 100644
--- a/src/core/SkBitmapProcState_sample.h
+++ b/src/core/SkBitmapProcState_sample.h
@@ -23,6 +23,13 @@
#error "unsupported DSTSIZE"
#endif
+#if defined(USE_GETHER32)
+ extern "C" void S32_Opaque_D32_nofilter_DX_gether(SkPMColor* SK_RESTRICT colors,
+ const SkPMColor* SK_RESTRICT srcAddr,
+ int count,
+ const uint32_t* SK_RESTRICT xy);
+#endif
+
void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
int count, DSTTYPE* SK_RESTRICT colors) {
@@ -65,6 +72,93 @@ void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s,
#endif
}
+
+#if defined(USE_S16_OPAQUE) && defined(__ARM_HAVE_NEON)
+
+extern "C" void Blit_Pixel16ToPixel32( uint32_t * colors, const uint16_t *srcAddr, int n );
+
+void clampx_nofilter_trans_S16_D32_DX(const SkBitmapProcState& s,
+ uint32_t xy[], int count, int x, int y, DSTTYPE* SK_RESTRICT colors) {
+
+ SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
+
+ //int xpos = nofilter_trans_preamble(s, &xy, x, y);
+ SkPoint pt;
+ s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+ SkIntToScalar(y) + SK_ScalarHalf, &pt);
+ uint32_t Y = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
+ s.fBitmap->height());
+ int xpos = SkScalarToFixed(pt.fX) >> 16;
+
+ const SRCTYPE* SK_RESTRICT srcAddr = (const SRCTYPE*)s.fBitmap->getPixels();
+ SRCTYPE src;
+
+ // buffer is y32, x16, x16, x16, x16, x16
+ // bump srcAddr to the proper row, since we're told Y never changes
+ //SkASSERT((unsigned)orig_xy[0] < (unsigned)s.fBitmap->height());
+ //srcAddr = (const SRCTYPE*)((const char*)srcAddr +
+ // orig_xy[0] * s.fBitmap->rowBytes());
+ SkASSERT((unsigned)Y < (unsigned)s.fBitmap->height());
+ srcAddr = (const SRCTYPE*)((const char*)srcAddr +
+ Y * s.fBitmap->rowBytes());
+ const int width = s.fBitmap->width();
+ int n;
+ if (1 == width) {
+ // all of the following X values must be 0
+ memset(xy, 0, count * sizeof(uint16_t));
+ src = srcAddr[0];
+ DSTTYPE dstValue = RETURNDST(src);
+ BITMAPPROC_MEMSET(colors, dstValue, count);
+ return;
+ //goto done_sample;
+ }
+
+
+ // fill before 0 as needed
+ if (xpos < 0) {
+ n = -xpos;
+ if (n > count) {
+ n = count;
+ }
+ src = srcAddr[0];
+ for( int i = 0; i < n ; i++ ){
+ *colors++ = RETURNDST(src);
+ }
+
+ count -= n;
+ if (0 == count) {
+ return;
+ }
+ xpos = 0;
+ }
+
+ // fill in 0..width-1 if needed
+ if (xpos < width) {
+ n = width - xpos;
+ if (n > count) {
+ n = count;
+ }
+ //for (int i = 0; i < n; i++) {
+ // src = srcAddr[xpos++];
+ // *colors++ = RETURNDST(src);
+ //}
+ Blit_Pixel16ToPixel32( colors, &(srcAddr[xpos]), n );
+ colors += n;
+ count -= n;
+ if (0 == count) {
+ return;
+ }
+ }
+
+ for (int i = 0; i < count; i++) {
+ src = srcAddr[width - 1];
+ *colors++ = RETURNDST(src);
+ }
+
+}
+
+#endif
+
void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
int count, DSTTYPE* SK_RESTRICT colors) {
@@ -92,6 +186,9 @@ void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
DSTTYPE dstValue = RETURNDST(src);
BITMAPPROC_MEMSET(colors, dstValue, count);
} else {
+#if defined(USE_GETHER32)
+ S32_Opaque_D32_nofilter_DX_gether(colors, srcAddr, count, xy);
+#else
int i;
for (i = (count >> 2); i > 0; --i) {
uint32_t xx0 = *xy++;
@@ -111,6 +208,7 @@ void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
SkASSERT(*xx < (unsigned)s.fBitmap->width());
src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
}
+#endif
}
#ifdef POSTAMBLE
diff --git a/src/core/SkBitmapProcState_shaderproc.h b/src/core/SkBitmapProcState_shaderproc.h
index a3a8a99..3f5d55b 100644
--- a/src/core/SkBitmapProcState_shaderproc.h
+++ b/src/core/SkBitmapProcState_shaderproc.h
@@ -49,7 +49,111 @@ static void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y,
#ifdef PREAMBLE
PREAMBLE(s);
#endif
-
+
+#ifdef S32_OPAQUE_D32_FILTER_DX_NEON
+ int post_count;
+ SkFixed post_fx;
+ DSTTYPE* SK_RESTRICT post_colors;
+ int num;
+ post_count = count;
+ post_fx = fx;
+ post_colors = colors;
+
+
+ if (dx>=0)
+ {
+ int end = ((int)maxX-1)<<16;
+ num = (end-fx)/dx;
+ if (num < 0) num = 0;
+
+ if (num<count)
+ {
+ count = num;
+ post_count = post_count - count;
+ post_fx = fx + count*dx;
+ post_colors = post_colors + count;
+ }
+ else
+ post_count = 0;
+
+ while (fx<0 && count) {
+ unsigned subX = TILEX_LOW_BITS(fx, maxX);
+ unsigned x0 = TILEX_PROCF(fx, maxX);
+ unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
+
+ FILTER_PROC(subX, subY,
+ SRC_TO_FILTER(row0[x0]),
+ SRC_TO_FILTER(row0[x1]),
+ SRC_TO_FILTER(row1[x0]),
+ SRC_TO_FILTER(row1[x1]),
+ colors);
+ colors += 1;
+
+ fx += dx;
+ count--;
+ }
+ }
+ else
+ {
+ int end = 0;
+ int maxXFix = ((int)maxX-1)<<16;
+ num = (end-fx)/dx;
+ if (num < 0) num = 0;
+
+
+ if (num<count)
+ {
+ count = num;
+ post_count = post_count - count;
+ post_fx = fx + count*dx;
+ post_colors = post_colors + count;
+ }
+ else
+ post_count = 0;
+
+ while (fx>=maxXFix && count) {
+ unsigned subX = TILEX_LOW_BITS(fx, maxX);
+ unsigned x0 = TILEX_PROCF(fx, maxX);
+ unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
+
+ FILTER_PROC(subX, subY,
+ SRC_TO_FILTER(row0[x0]),
+ SRC_TO_FILTER(row0[x1]),
+ SRC_TO_FILTER(row1[x0]),
+ SRC_TO_FILTER(row1[x1]),
+ colors);
+ colors += 1;
+
+ fx += dx;
+ count--;
+ }
+
+ }
+
+ S32_Opaque_D32_filter_DX_shaderproc_neon(row0, row1, fx, maxX, subY, colors, dx, count);
+
+ fx = post_fx;
+ colors = post_colors;
+ while (post_count) {
+ unsigned subX = TILEX_LOW_BITS(fx, maxX);
+ unsigned x0 = TILEX_PROCF(fx, maxX);
+ unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
+
+ FILTER_PROC(subX, subY,
+ SRC_TO_FILTER(row0[x0]),
+ SRC_TO_FILTER(row0[x1]),
+ SRC_TO_FILTER(row1[x0]),
+ SRC_TO_FILTER(row1[x1]),
+ colors);
+ colors += 1;
+
+ fx += dx;
+ post_count--;
+ }
+
+
+#else //S32_OPAQUE_D32_FILTER_DX_NEON
+
do {
unsigned subX = TILEX_LOW_BITS(fx, maxX);
unsigned x0 = TILEX_PROCF(fx, maxX);
@@ -65,6 +169,7 @@ static void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y,
fx += dx;
} while (--count != 0);
+#endif //S32_OPAQUE_D32_FILTER_DX_NEON
#ifdef POSTAMBLE
POSTAMBLE(s);
diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp
index c815468..3bdaf0a 100644
--- a/src/core/SkBlitRow_D16.cpp
+++ b/src/core/SkBlitRow_D16.cpp
@@ -207,12 +207,20 @@ static void S32A_D565_Blend_Dither(uint16_t* SK_RESTRICT dst,
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
+#if defined(__CPU_ARCH_ARM) && defined(SK_CPU_LENDIAN)
+ extern "C" void S32A_D565_Opaque_arm(uint16_t*, uint32_t*, size_t);
+#endif
+
static const SkBlitRow::Proc gDefault_565_Procs[] = {
// no dither
S32_D565_Opaque,
S32_D565_Blend,
+#if defined(__CPU_ARCH_ARM) && defined(SK_CPU_LENDIAN)
+ (SkBlitRow::Proc)S32A_D565_Opaque_arm,
+#else
S32A_D565_Opaque,
+#endif
S32A_D565_Blend,
// dither
diff --git a/src/opts/S16_D32_arm.S b/src/opts/S16_D32_arm.S
new file mode 100644
index 0000000..0278a8f
--- /dev/null
+++ b/src/opts/S16_D32_arm.S
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2005-2008, The Android Open Source Project
+ * Copyright (c) 2011, Code Aurora Forum. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ .text
+ .align
+
+
+ .global Blit_Pixel16ToPixel32
+ .func Blit_Pixel16ToPixel32
+
+//void Blit_Pixel16ToPixel32( uint32_t * colors, const uint16_t *srcAddr, int n );
+// r0: dst ptr
+// r1: src ptr
+// r2: n
+
+Blit_Pixel16ToPixel32:
+ cmp r2,#32
+ blt .Lless_than_32
+
+ vld2.u8 {d0, d2}, [r1]! //q0, lower 8 bits
+ vld2.u8 {d1, d3}, [r1]! //q1, higher 8 bits
+ vmov.u8 q13, #0xff
+
+.Lpipeline_neon_loop:
+ pld [r1, #256]
+ vmov.u8 q12, #31
+ vand.u8 q12, q12, q0 //q12 is B channels
+
+ vshr.u8 q11, q0, #5 //lower 3 bits of G channels
+ vshl.u8 q14, q1, #5
+ vshr.u8 q14, q14, #2 //higher 3 bits of G channels
+ vorr q11, q11, q14 //q11 is G channels
+
+ vshr.u8 q10, q1, #3 //q10 is R channels
+
+ sub r2,r2,#16
+
+ vld2.u8 {d0, d2}, [r1]! //q0, lower 8 bits
+ vld2.u8 {d1, d3}, [r1]! //q1, higher 8 bits
+
+ vshl.u8 q2, q12, #3 // b << (8-SK_B16_BITS)
+ vshr.u8 q12, q12, #2 // b >> (2* SK_B16_BITS -8 )
+ vorr q12, q12, q2 //B channels
+
+ cmp r2,#32
+ vshl.u8 q3, q11, #2 // b << (8-SK_G16_BITS)
+ vshr.u8 q11, q11, #4 // b >> (2* SK_G16_BITS -8 )
+ vorr q11, q11, q3 //G channels
+
+ vshl.u8 q8, q10, #3 // b << (8-SK_R16_BITS)
+ vshr.u8 q10, q10, #2 // b >> (2* SK_R16_BITS -8 )
+ vorr q10, q10, q8 //R channels
+
+
+ vst4.u8 {d20, d22, d24, d26}, [r0]!
+ vst4.u8 {d21, d23, d25, d27}, [r0]!
+ bge .Lpipeline_neon_loop
+
+.Lsurfix:
+ vmov.u8 q12, #31
+ vand.u8 q12, q12, q0 //q12 is B channels
+
+ vshr.u8 q11, q0, #5 //lower 3 bits of G channels
+ vshl.u8 q14, q1, #5
+ vshr.u8 q14, q14, #2 //higher 3 bits of G channels
+ vorr q11, q11, q14 //q11 is G channels
+
+ vshr.u8 q10, q1, #3 //q10 is R channels
+
+ sub r2,r2,#16
+
+ vshl.u8 q2, q12, #3 // b << (8-SK_B16_BITS)
+ vshr.u8 q12, q12, #2 // b >> (2* SK_B16_BITS -8 )
+ vorr q12, q12, q2 //B channels
+
+ vshl.u8 q3, q11, #2 // b << (8-SK_G16_BITS)
+ vshr.u8 q11, q11, #4 // b >> (2* SK_G16_BITS -8 )
+ vorr q11, q11, q3 //G channels
+
+ vshl.u8 q8, q10, #3 // b << (8-SK_R16_BITS)
+ vshr.u8 q10, q10, #2 // b >> (2* SK_R16_BITS -8 )
+ vorr q10, q10, q8 //R channels
+
+ vst4.u8 {d20, d22, d24, d26}, [r0]!
+ vst4.u8 {d21, d23, d25, d27}, [r0]!
+
+
+.Lless_than_32:
+ cmp r2,#16
+ blt .Lless_than_16
+ //vpush {Q4-Q7}
+
+.Lneon_loop:
+ pld [r1, #256]
+ vld2.u8 {d0, d2}, [r1]! //q0, lower 8 bits
+ vld2.u8 {d1, d3}, [r1]! //q1, higher 8 bits
+
+ vmov.u8 q12, #31
+ vand.u8 q12, q12, q0 //q12 is B channels
+
+ vshr.u8 q11, q0, #5 //lower 3 bits of G channels
+ vshl.u8 q14, q1, #5
+ vshr.u8 q14, q14, #2 //higher 3 bits of G channels
+ vorr q11, q11, q14 //q11 is G channels
+
+ vshr.u8 q10, q1, #3 //q10 is R channels
+
+ sub r2,r2,#16
+
+ vshl.u8 q2, q12, #3 // b << (8-SK_B16_BITS)
+ vshr.u8 q12, q12, #2 // b >> (2* SK_B16_BITS -8 )
+ vorr q12, q12, q2 //B channels
+
+ cmp r2,#16
+ vshl.u8 q3, q11, #2 // b << (8-SK_G16_BITS)
+ vshr.u8 q11, q11, #4 // b >> (2* SK_G16_BITS -8 )
+ vorr q11, q11, q3 //G channels
+
+ vshl.u8 q8, q10, #3 // b << (8-SK_R16_BITS)
+ vshr.u8 q10, q10, #2 // b >> (2* SK_R16_BITS -8 )
+ vorr q10, q10, q8 //R channels
+
+ vmov.u8 q13, #0xff
+
+ vst4.u8 {d20, d22, d24, d26}, [r0]!
+ vst4.u8 {d21, d23, d25, d27}, [r0]!
+ bge .Lneon_loop
+
+ //vpop {Q4-Q7}
+
+.Lless_than_16:
+
+
+ cmp r2, #0 // 0x0
+ ble .Lend
+
+ push {r4, r5, r6, r7, r8}
+
+
+ lsl r2, r2, #1
+ mov r3, #0 // 0x0
+
+.Lloop:
+
+ ldrh r6, [r1, r3]
+
+ and r5, r6, #31 // 0x1f //r5 is B
+
+ ubfx r4, r6, #5, #6 // r4 is G
+
+ lsr ip, r6, #11 //ip is R
+
+
+ lsl r8, r5, #3
+ lsl r6, r4, #2
+ lsr r7, ip, #2
+
+ orr r5, r8, r5, lsr #2
+ orr ip, r7, ip, lsl #3
+ orr r4, r6, r4, lsr #4
+ orr ip, ip, #-16777216 // 0xff000000
+ orr r5, ip, r5, lsl #16
+ orr r4, r5, r4, lsl #8
+ str r4, [r0, r3, lsl #1]
+ add r3, r3, #2 // 0x2
+
+ cmp r3, r2
+ bne .Lloop
+
+ pop {r4, r5, r6, r7, r8}
+.Lend:
+ bx lr
+
diff --git a/src/opts/S32A_Blend_BlitRow32_arm.S b/src/opts/S32A_Blend_BlitRow32_arm.S
new file mode 100644
index 0000000..cd96f90
--- /dev/null
+++ b/src/opts/S32A_Blend_BlitRow32_arm.S
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2005-2008, The Android Open Source Project
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ .text
+
+ .global S32A_Blend_BlitRow32_arm_neon
+ .func S32A_Blend_BlitRow32_arm_neon
+
+S32A_Blend_BlitRow32_arm_neon:
+
+ // calculate src_scale = aa + 1
+ add r3, r3, #1
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+
+ cmp r2,#24
+ blt .Lslow_path
+
+ push {r4, r5}
+ vpush {q4-q7}
+ vmov.u16 q14,#0x100
+
+ // store src_scale in q4
+ vdup.u16 q4, r3
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+ add r4, r0, #32 // minus 16 to pretend the last round
+ mov r5, #64
+ sub r2,r2,#8
+
+.Lloop:
+ pld [r1, #256]
+ pld [r0, #256]
+ subs r2, r2, #16
+ cmp r2,#16
+
+ // expand destination from 8-bit to 16-bit
+ vmovl.u8 q6, d4
+ vmovl.u8 q7, d5
+ vmovl.u8 q8, d6
+ vmovl.u8 q9, d7
+
+ // expand source from 8-bit to 16-bit
+ vmovl.u8 q13, d3
+ vmovl.u8 q10, d0
+ vmovl.u8 q11, d1
+ vmovl.u8 q12, d2
+
+ //update source ptr but not dst ptr
+ // calculate destination scale
+ vmul.u16 q5, q13, q4
+ vshr.u16 q5, q5, #8
+ vsub.u16 q5, q14, q5
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+ // multiply destination ARGB components with dst_scale
+ vmul.u16 q6, q6, q5
+ vmul.u16 q7, q7, q5
+ vmul.u16 q8, q8, q5
+ vmul.u16 q9, q9, q5
+
+
+ vld4.8 {d4, d5, d6, d7}, [r4] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ // multiply source ARGB components with src_scale
+ vmul.u16 q10, q10, q4
+ vmul.u16 q11, q11, q4
+ vmul.u16 q12, q12, q4
+ vmul.u16 q13, q13, q4
+
+
+ // add processed src and dest pixels and extract high bytes
+ vqadd.u8 q10, q6, q10
+ vqadd.u8 q11, q7, q11
+ vqadd.u8 q12, q8, q12
+ vqadd.u8 q13, q9, q13
+
+ vshrn.u16 d20, q10, #8
+ vshrn.u16 d21, q11, #8
+ vshrn.u16 d22, q12, #8
+ vshrn.u16 d23, q13, #8
+
+ vst4.8 {d20, d21, d22, d23}, [r0], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+ // expand destination from 8-bit to 16-bit
+ vmovl.u8 q6, d4
+ vmovl.u8 q7, d5
+ vmovl.u8 q8, d6
+ vmovl.u8 q9, d7
+
+ // expand source from 8-bit to 16-bit
+ vmovl.u8 q13, d3
+ vmovl.u8 q10, d0
+ vmovl.u8 q11, d1
+ vmovl.u8 q12, d2
+
+ // calculate destination scale
+ vmul.u16 q5, q13, q4
+ vshr.u16 q5, q5, #8
+ vsub.u16 q5, q14, q5
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+
+ // multiply destination ARGB components with dst_scale
+ vmul.u16 q6, q6, q5
+ vmul.u16 q7, q7, q5
+ vmul.u16 q8, q8, q5
+ vmul.u16 q9, q9, q5
+
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ // multiply source ARGB components with src_scale
+ vmul.u16 q10, q10, q4
+ vmul.u16 q11, q11, q4
+ vmul.u16 q12, q12, q4
+ vmul.u16 q13, q13, q4
+
+
+ // add processed src and dest pixels and extract high bytes
+ vqadd.u8 q10, q6, q10
+ vqadd.u8 q11, q7, q11
+ vqadd.u8 q12, q8, q12
+ vqadd.u8 q13, q9, q13
+
+ vshrn.u16 d20, q10, #8
+ vshrn.u16 d21, q11, #8
+ vshrn.u16 d22, q12, #8
+ vshrn.u16 d23, q13, #8
+
+ vst4.8 {d20, d21, d22, d23}, [r4], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+ bge .Lloop
+
+//There are 8 words left unprocessed from previous round
+ // expand destination from 8-bit to 16-bit
+ vmovl.u8 q6, d4
+ vmovl.u8 q7, d5
+ vmovl.u8 q8, d6
+ vmovl.u8 q9, d7
+
+ // expand source from 8-bit to 16-bit
+ vmovl.u8 q13, d3
+ vmovl.u8 q10, d0
+ vmovl.u8 q11, d1
+ vmovl.u8 q12, d2
+
+ // calculate destination scale
+ vmul.u16 q5, q13, q4
+ vshr.u16 q5, q5, #8
+ vsub.u16 q5, q14, q5
+
+ // multiply destination ARGB components with dst_scale
+ vmul.u16 q6, q6, q5
+ vmul.u16 q7, q7, q5
+ vmul.u16 q8, q8, q5
+ vmul.u16 q9, q9, q5
+
+ // multiply source ARGB components with src_scale
+ vmul.u16 q10, q10, q4
+ vmul.u16 q11, q11, q4
+ vmul.u16 q12, q12, q4
+ vmul.u16 q13, q13, q4
+
+ // add processed src and dest pixels and extract high bytes
+ vqadd.u8 q10, q6, q10
+ vqadd.u8 q11, q7, q11
+ vqadd.u8 q12, q8, q12
+ vqadd.u8 q13, q9, q13
+
+ vshrn.u16 d20, q10, #8
+ vshrn.u16 d21, q11, #8
+ vshrn.u16 d22, q12, #8
+ vshrn.u16 d23, q13, #8
+
+ vst4.8 {d20, d21, d22, d23}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+.Lless_than_16:
+ cmp r2,#8
+ blt .Lless_than_8
+
+ sub r2,r2,#8
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ // expand destination from 8-bit to 16-bit
+ vmovl.u8 q6, d4
+ vmovl.u8 q7, d5
+ vmovl.u8 q8, d6
+ vmovl.u8 q9, d7
+
+ // expand source from 8-bit to 16-bit
+ vmovl.u8 q13, d3
+ vmovl.u8 q10, d0
+ vmovl.u8 q11, d1
+ vmovl.u8 q12, d2
+
+ // calculate destination scale
+ vmul.u16 q5, q13, q4
+ vshr.u16 q5, q5, #8
+ vsub.u16 q5, q14, q5
+
+ // multiply destination ARGB components with dst_scale
+ vmul.u16 q6, q6, q5
+ vmul.u16 q7, q7, q5
+ vmul.u16 q8, q8, q5
+ vmul.u16 q9, q9, q5
+
+ // multiply source ARGB components with src_scale
+ vmul.u16 q10, q10, q4
+ vmul.u16 q11, q11, q4
+ vmul.u16 q12, q12, q4
+ vmul.u16 q13, q13, q4
+
+ // add processed src and dest pixels and extract high bytes
+ vqadd.u8 q10, q6, q10
+ vqadd.u8 q11, q7, q11
+ vqadd.u8 q12, q8, q12
+ vqadd.u8 q13, q9, q13
+
+ vshrn.u16 d4, q10, #8
+ vshrn.u16 d5, q11, #8
+ vshrn.u16 d6, q12, #8
+ vshrn.u16 d7, q13, #8
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+.Lless_than_8:
+ vpop {q4-q7}
+ pop {r4, r5}
+
+.Lslow_path:
+ adds r2, #0
+ bxeq lr
+#endif
+
+/*
+ * r0 - dst
+ * r1 - src
+ * r2 - count
+ * r3 - alpha
+ */
+ push {r4-r11, lr}
+
+ mov r10, #0xFF
+ orr r10, r10, r10, lsl #16 //mask = r10 = 0x00FF00FF
+
+ subs r2, r2, #2
+ blt .Lblitrow32_single_loop
+
+.Lblitrow32_double_loop:
+ ldm r0, {r4, r5}
+ ldm r1!, {r6, r7}
+
+ /* First iteration */
+ lsr lr, r6, #24 //extract src_alpha
+
+ // calculate dst_scale = 256 - ((src_alpha*src_scale)>>8)
+ mul lr, r3, lr
+ lsr lr, #8
+ rsb lr, lr, #256
+
+ // src processing
+ and r8, r6, r10 //rb = (src & mask)
+ and r9, r10, r6, lsr #8 //ag = (src>>8) & mask
+
+ mul r11, r8, r3 //RB = rb * src_scale
+ mul r6, r9, r3 //AG = ag * src_scale
+
+ // combine RB and AG
+ and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask
+ and r6, r6, r10, lsl #8 //r9 = AG & ~mask
+
+ orr r6, r6, r11
+
+ // dst processing
+ and r8, r4, r10 //rb = (dst & mask)
+ and r9, r10, r4, lsr #8 //ag = (dst>>8) & mask
+
+ mul r11, r8, lr //RB = rb * dst_scale
+ mul r4, r9, lr //AG = ag * dst_scale
+
+ // combine RB and AG
+ and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask
+ and r4, r4, r10, lsl #8 //r9 = AG & ~mask
+
+ orr r4, r4, r11
+
+ /* Second iteration */
+ lsr lr, r7, #24 //extract src_alpha
+
+ // calculate dst_scale = 256 - ((src_alpha*src_scale)>>8)
+ mul lr, r3, lr
+ lsr lr, #8
+ rsb lr, lr, #256
+
+ // src processing
+ and r8, r7, r10 //rb = (src & mask)
+ and r9, r10, r7, lsr #8 //ag = (src>>8) & mask
+
+ mul r11, r8, r3 //RB = rb * src_scale
+ mul r7, r9, r3 //AG = ag * src_scale
+
+ // combine RB and AG
+ and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask
+ and r7, r7, r10, lsl #8 //r9 = AG & ~mask
+
+ orr r7, r7, r11
+
+ // dst processing
+ and r8, r5, r10 //rb = (dst & mask)
+ and r9, r10, r5, lsr #8 //ag = (dst>>8) & mask
+
+ mul r11, r8, lr //RB = rb * dst_scale
+ mul r5, r9, lr //AG = ag * dst_scale
+
+ // combine RB and AG
+ and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask
+ and r5, r5, r10, lsl #8 //r9 = AG & ~mask
+
+ orr r5, r5, r11
+
+
+ // add processed src and dst
+ add r6, r6, r4
+ add r7, r7, r5
+
+ subs r2, r2, #2
+ stm r0!, {r6, r7}
+
+ bge .Lblitrow32_double_loop
+
+.Lblitrow32_single_loop:
+ adds r2, #1
+ blo .Lexit
+
+ ldr r4, [r0]
+ ldr r6, [r1], #4
+
+ lsr lr, r6, #24 //extract src_alpha
+
+ // calculate dst_scale = 256 - ((src_alpha*src_scale)>>8)
+ mul lr, r3, lr
+ lsr lr, #8
+ rsb lr, lr, #256
+
+ // src processing
+ and r8, r6, r10 //rb = (src & mask)
+ and r9, r10, r6, lsr #8 //ag = (src>>8) & mask
+
+ mul r11, r8, r3 //RB = rb * src_scale
+ mul r6, r9, r3 //AG = ag * src_scale
+
+ // combine RB and AG
+ and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask
+ and r6, r6, r10, lsl #8 //r9 = AG & ~mask
+
+ orr r6, r6, r11
+
+ // dst processing
+ and r8, r4, r10 //rb = (dst & mask)
+ and r9, r10, r4, lsr #8 //ag = (dst>>8) & mask
+
+ mul r11, r8, lr //RB = rb * dst_scale
+ mul r4, r9, lr //AG = ag * dst_scale
+
+ // combine RB and AG
+ and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask
+ and r4, r4, r10, lsl #8 //r9 = AG & ~mask
+
+ orr r4, r4, r11
+
+ add r6, r6, r4 //add processed src and dst
+
+ str r6, [r0], #4
+
+.Lexit:
+ pop {r4-r11, lr}
+ bx lr
diff --git a/src/opts/S32A_D565_Opaque_arm.S b/src/opts/S32A_D565_Opaque_arm.S
new file mode 100644
index 0000000..9576521
--- /dev/null
+++ b/src/opts/S32A_D565_Opaque_arm.S
@@ -0,0 +1,325 @@
+/*
+ * Copyright 2006, The Android Open Source Project
+ * Copyright (c) 2009, Code Aurora Forum.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * This file is derived from libpixelflinger version of BLIT routine.
+ * Algorithm used for BLIT operation here is equivalent to the one in
+ * C function, S32A_D565_Opaque. Use neon instructions to process 16 pixels
+ * at-a-time on armv7. If the number of pixels is less than 16 and/or the
+ * architecture is armv6 and below, use regular arm instructions. Regular
+ * arm code combines two 16-bit writes into one 32-bit write to destination,
+ * uses destination and source pre-loads, and unrolls the main loop thrice.
+ */
+ .text
+ .align
+
+ .global S32A_D565_Opaque_arm
+
+// uses r6, r7, r8, r9, r10, lr
+
+.macro pixel, DREG, SRC, FB, OFFSET
+
+ // SRC = AABBGGRR
+ subs r7, r10, \SRC, lsr #24 // sAA = 255 - sAA
+ beq 1f
+
+.if \OFFSET
+
+ // red
+ mov lr, \DREG, lsr #(\OFFSET + 6 + 5)
+ smlabb lr, r7, lr, r8
+ and r6, \SRC, r10
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ lsr lr, #3
+ orr \FB, lr, lsl #(\OFFSET + 11)
+
+ // green
+ and r6, \DREG, #(0x3F<<(\OFFSET + 5))
+ lsr r6, #5
+ smlabt r6, r7, r6, r9
+ and lr, r10, \SRC, lsr #(8)
+ add r6, r6, r6, lsr #6
+ add r6, lr, r6, lsr #6
+ lsr r6, #2
+ orr \FB, \FB, r6, lsl #(\OFFSET + 5)
+
+ // blue
+ and lr, \DREG, #(0x1F << \OFFSET)
+ smlabt lr, r7, lr, r8
+ and r6, r10, \SRC, lsr #(8+8)
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ lsr lr, #3
+ orr \FB, \FB, lr, lsl #\OFFSET
+
+.else
+
+ // red
+ mov lr, \DREG, lsr #(6+5)
+ and lr, lr, #0x1F
+ smlabb lr, r7, lr, r8
+ and r6, \SRC, r10
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ lsr lr, #3
+ mov \FB, lr, lsl #11
+
+ // green
+ and r6, \DREG, #(0x3F<<5)
+ lsr r6, #5
+ smlabb r6, r7, r6, r9
+ and lr, r10, \SRC, lsr #(8)
+ add r6, r6, r6, lsr #6
+ add r6, lr, r6, lsr #6
+ lsr r6, #2
+ orr \FB, \FB, r6, lsl #5
+
+ // blue
+ and lr, \DREG, #0x1F
+ smlabb lr, r7, lr, r8
+ and r6, r10, \SRC, lsr #(8+8)
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ orr \FB, \FB, lr, lsr #3
+
+.endif
+ b 2f
+
+ /*
+ * When alpha = 255, down scale the source RGB pixel (24 bits)
+ * to 16 bits(RGB565)
+ */
+1:
+ lsl r6, \SRC, #8
+ lsr lr, \SRC, #5
+ and r7, r6, #0xf800
+ and lr, lr, #0x7e0
+ orr lr, lr, r7
+
+.if \OFFSET
+ orr lr, lr, r6, lsr #27
+ orr \FB, \FB, lr, lsl #(\OFFSET)
+.else
+ orr \FB, lr, r6, lsr #27
+.endif
+
+2:
+.endm
+
+
+// r0: dst ptr
+// r1: src ptr
+// r2: count
+// r3: d
+// r4: s0
+// r5: s1
+// r6: pixel
+// r7: pixel
+// r8: 0x10
+// r9: 0x20
+// r10: 0xFF
+// r11: free
+// r12: scratch
+// r14: free
+
+S32A_D565_Opaque_arm:
+ stmfd sp!, {r4-r10, lr}
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ subs r2, r2, #16
+
+ blo blit_less_than_16_left
+
+ vmov.u16 q12, #0x80
+ vmov.u8 q13, #0xf8
+
+blit_neon_loop:
+ /*
+ * Load 64 bytes from source and 32 bytes from destination
+ * note that source pixels are 4 bytes wide and
+ * destination pixels are 2 bytes wide.
+ */
+ vld4.8 {d2, d4, d6, d8}, [r1]!
+ vld4.8 {d3, d5, d7, d9}, [r1]!
+
+ vand.8 d10, d8, d9
+ vmov r3, r4, d10
+
+ cmp r3, #0xffffffff
+ cmpeq r4, #0xffffffff
+ bne blit_alpha_not_255
+
+ // alpha equals 255 case
+
+ vshl.u8 q0, q2, #3
+
+ subs r2, r2, #16
+
+ vsri.u8 q1, q2, #5
+ vsri.u8 q0, q3, #3
+
+ // store the rgb destination values back to memory
+ vst2.8 {d0, d2}, [r0]!
+ vst2.8 {d1, d3}, [r0]!
+
+ blo blit_less_than_16_left
+ b blit_neon_loop
+
+blit_alpha_not_255:
+ // alpha = 255 - alpha
+ vmvn.u8 q0, q4
+
+ vld2.8 {q5, q6}, [r0]
+
+ vshl.u8 q7, q6, #3
+
+ subs r2, r2, #16
+
+ vand.u8 q6, q6, q13
+
+ vmov.16 q8, q12
+ vmov.16 q9, q12
+
+ vsri.u8 q7, q5, #5
+ vshl.u8 q5, q5, #3
+
+ vmlal.u8 q8, d0, d12
+ vmlal.u8 q9, d1, d13
+
+ vshl.u8 q7, q7, #2
+
+ vshr.u16 q10, q8, #5
+ vshr.u16 q11, q9, #5
+ vaddhn.u16 d12, q8, q10
+ vaddhn.u16 d13, q9, q11
+
+ vmov.16 q8, q12
+ vmov.16 q9, q12
+ vmlal.u8 q8, d0, d14
+ vmlal.u8 q9, d1, d15
+
+ vqadd.u8 q6, q6, q1
+
+ vshr.u16 q10, q8, #6
+ vshr.u16 q11, q9, #6
+ vaddhn.u16 d14, q8, q10
+ vaddhn.u16 d15, q9, q11
+
+ vmov.16 q8, q12
+ vmov.16 q9, q12
+ vmlal.u8 q8, d0, d10
+ vmlal.u8 q9, d1, d11
+
+ vqadd.u8 q7, q7, q2
+
+ vshl.u8 q5, q7, #3
+
+ vshr.u16 q10, q8, #5
+ vshr.u16 q11, q9, #5
+
+ vsri.u8 q6, q7, #5
+
+ vaddhn.u16 d16, q8, q10
+ vaddhn.u16 d17, q9, q11
+ vqadd.u8 q8, q8, q3
+
+ vsri.u8 q5, q8, #3
+
+ // store the rgb destination values back to memory
+ vst2.8 {d10, d12}, [r0]!
+ vst2.8 {d11, d13}, [r0]!
+
+ blo blit_less_than_16_left
+ b blit_neon_loop
+#endif
+
+blit_less_than_16_left:
+ pld [r1]
+
+ mov r8, #0x10
+ mov r9, #0x20
+ mov r10, #0xFF
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ adds r2, r2, #14
+#else
+ subs r2, r2, #2
+#endif
+
+ pld [r0]
+ blo 9f
+
+ // The main loop is unrolled thrice and process 6 pixels
+8: ldmia r1!, {r4, r5}
+ // stream the source
+ pld [r1, #32]
+ add r0, r0, #4
+ // it's all zero, skip this pixel
+ orrs r3, r4, r5
+ beq 7f
+
+ // load the destination
+ ldr r3, [r0, #-4]
+ // stream the destination
+ pld [r0, #32]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ // effectively, we're getting write-combining by virtue of the
+ // cpu's write-back cache.
+ str r12, [r0, #-4]
+
+ // 2nd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ orrs r3, r4, r5
+ beq 7f
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+ // 3rd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ orrs r3, r4, r5
+ beq 7f
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+7: subs r2, r2, #2
+ blo 9f
+ b 8b
+
+9: adds r2, r2, #1
+ ldmlofd sp!, {r4-r10, lr} // return
+ bxlo lr
+
+ // last pixel left
+ ldr r4, [r1], #4
+ ldrh r3, [r0]
+ pixel r3, r4, r12, 0
+ strh r12, [r0], #2
+ ldmfd sp!, {r4-r10, lr} // return
+ bx lr
diff --git a/src/opts/S32A_Opaque_BlitRow32_arm.S b/src/opts/S32A_Opaque_BlitRow32_arm.S
new file mode 100644
index 0000000..0ecfa1d
--- /dev/null
+++ b/src/opts/S32A_Opaque_BlitRow32_arm.S
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2005-2008, The Android Open Source Project
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ .text
+
+ .global S32A_Opaque_BlitRow32_arm
+ .func S32A_Opaque_BlitRow32_arm
+
+S32A_Opaque_BlitRow32_arm:
+
+ push {r4-r11}
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+
+ cmp r2,#24
+ blt .Lless_than_24
+
+ vpush {Q4-Q7}
+
+ vmov.i16 q14,#0x100 //Q14.16 = 256
+//prefix
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+ add r3, r0, #32 // minus 16 to pretend the last round
+ mov r5, #64
+ sub r2,r2,#8
+.Lloop:
+ pld [r1, #256]
+ pld [r0, #256]
+ sub r2,r2,#16
+ vsubw.u8 q4,q14,d3 //Q4.16 = 256-d3
+
+ //It has to be 24 since we pre-load 8 word for the next rounds
+ cmp r2,#16
+
+ vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+
+ vld4.8 {d20, d21, d22, d23}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vld4.8 {d24, d25, d26, d27}, [r3] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+ //add r0, r0, r5
+
+ //The next 4 words
+ vsubW.u8 q4,q14,d23 //Q4.16 = 256-d3
+
+ vmovl.u8 q6,d24 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d25 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d26 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d27 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+ vshrn.i16 d24,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d25,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d26,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d27,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d24,d24,d20 //d4 = d4+d0
+ vadd.i8 d25,d25,d21 //d5 = d5+d1
+ vadd.i8 d26,d26,d22 //d6 = d6+d2
+ vadd.i8 d27,d27,d23 //d7 = d7+d3
+
+ vst4.8 {d24, d25, d26, d27}, [r3], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+ //add r3, r3, r5
+
+ bge .Lloop
+
+//There are 8 words left unprocessed from previous round
+ vsubw.u8 q4,q14,d3 //Q4.16 = 256-d3
+
+ cmp r2,#8
+
+ vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+.Lless_than_16:
+ cmp r2,#8
+ blt .Lless_than_8
+
+ sub r2,r2,#8
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ vsubw.u8 q4,q14,d3 //Q4.16 = 256-d3
+
+ vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+.Lless_than_8:
+ vpop {Q4-Q7}
+
+.Lless_than_4:
+ cmp r2, #1
+ bmi .Lexit
+ b .Lresidual_loop
+
+.Lless_than_24:
+ cmp r2,#8
+ blt .Lless_than_4
+
+.Lloop_8:
+ sub r2,r2,#8
+ // We already read the 8 words from the previous pipe line
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ vmov.i16 q10,#0x100 //Q4.16 = 256
+ vsubW.u8 q10,q10,d3 //Q4.16 = 256-d3
+
+ cmp r2,#8
+
+ vmovl.u8 q12,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q13,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q12,q12,q10 //Q6 = Q6 * Q4
+ vmul.i16 q13,q13,q10 //Q7 = Q7 * Q4
+ vmul.i16 q8,q8,q10 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q10 //Q9 = Q9 * Q4
+
+ vshrn.i16 d4,q12,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q13,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+ bge .Lloop_8
+ b .Lless_than_4
+
+#endif
+
+/*
+ * r0 - dst
+ * r1 - src
+ * r2 - count
+ */
+.Lresidual_loop:
+ mov r10, #0xFF
+ orr r10, r10, r10, lsl #16 //mask = r10 = 0x00FF00FF
+
+ subs r2, r2, #2
+ blt .Lblitrow32_single_loop
+
+.Lblitrow32_double_loop:
+ ldm r0, {r3, r4}
+ ldm r1!, {r5, r6}
+
+ orrs r9, r3, r4
+ beq .Lblitrow32_loop_cond
+
+ // First iteration
+ lsr r7, r5, #24 //extract alpha
+ and r8, r3, r10 //rb = (dst & mask)
+ rsb r7, r7, #256 //r5 = scale = (255-alpha)+1
+ and r9, r10, r3, lsr #8 //ag = (dst>>8) & mask
+
+ mul r11, r8, r7 //RB = rb * scale
+ mul r3, r9, r7 //AG = ag * scale
+
+ // combine RB and AG
+ and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask
+ and r3, r3, r10, lsl #8 //r9 = AG & ~mask
+
+ lsr r7, r6, #24 //extract alpha for second iteration
+ orr r3, r3, r11
+
+ // Second iteration
+ and r8, r4, r10 //rb = (dst & mask)
+ rsb r7, r7, #256 //r5 = scale = (255-alpha)+1
+ and r9, r10, r4, lsr #8 //ag = (dst>>8) & mask
+
+ mul r11, r8, r7 //RB = rb * scale
+ mul r4, r9, r7 //AG = ag * scale
+
+ // combine RB and AG
+ and r11, r10, r11, lsr #8 //r8 = (RB>>8) & mask
+ and r4, r4, r10, lsl #8 //r9 = AG & ~mask
+ orr r4, r4, r11
+
+ // add src to combined value
+ add r5, r5, r3
+ add r6, r6, r4
+
+.Lblitrow32_loop_cond:
+ subs r2, r2, #2
+ stm r0!, {r5, r6}
+
+ bge .Lblitrow32_double_loop
+
+.Lblitrow32_single_loop:
+ adds r2, #1
+ blo .Lexit
+
+ ldr r3, [r0]
+ ldr r5, [r1], #4
+
+ cmp r3, #0
+ beq .Lblitrow32_single_store
+
+ lsr r7, r5, #24 //extract alpha
+ and r8, r3, r10 //rb = (dst & mask)
+ rsb r7, r7, #256 //r5 = scale = (255-alpha)+1
+ and r9, r10, r3, lsr #8 //ag = (dst>>8) & mask
+
+ mul r8, r8, r7 //RB = rb * scale
+ mul r9, r9, r7 //AG = ag * scale
+
+ // combine RB and AG
+ and r8, r10, r8, lsr #8 //r8 = (RB>>8) & mask
+ and r9, r9, r10, lsl #8 //r9 = AG & ~mask
+ orr r3, r8, r9
+
+ add r5, r5, r3 //add src to combined value
+
+.Lblitrow32_single_store:
+ str r5, [r0], #4
+
+.Lexit:
+ pop {r4-r11}
+ bx lr
diff --git a/src/opts/S32_Opaque_D32_nofilter_DX_gether_arm.S b/src/opts/S32_Opaque_D32_nofilter_DX_gether_arm.S
new file mode 100644
index 0000000..3467432
--- /dev/null
+++ b/src/opts/S32_Opaque_D32_nofilter_DX_gether_arm.S
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2005-2008, The Android Open Source Project
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ .text
+ .global S32_Opaque_D32_nofilter_DX_gether
+ .func S32_Opaque_D32_nofilter_DX_gether
+S32_Opaque_D32_nofilter_DX_gether:
+ push {r0-r11,lr}
+ asr r0,r2,#3
+ sub sp,sp,#4 //23
+ cmp r0,#0
+ str r0,[sp,#0] //r0 = count >> 3
+ ble .L1_140
+ ldr r4,[sp,#4] //r4 = r0 (dst)
+ mov r0,r3
+ add r12,r3,#4
+ asr r8,r2,#3
+.L1_52:
+ ldm r3!, {r0,r6,r9,r11}
+ lsr r5,r0,#16 //30
+ ldr r5,[r1,r5,lsl #2] //30
+ lsr r7,r6,#16 //32
+ ldr r7,[r1,r7,lsl #2] //31
+ uxth r0,r0 //34
+ ldr r0,[r1,r0,lsl #2] //34
+ uxth r6,r6 //31
+ ldr r6,[r1,r6,lsl #2] //32
+ //stm r4!, {r0,r5,r6,r7} ;35
+ lsr r10,r9,#16 //30
+ ldr r10,[r1,r10,lsl #2] //30
+ lsr lr,r11,#16 //32
+ ldr lr,[r1,lr,lsl #2] //31
+ uxth r9,r9 //34
+ ldr r9,[r1,r9,lsl #2] //34
+ uxth r11,r11 //31
+ ldr r11,[r1,r11,lsl #2] //32
+ subs r8,r8,#1
+ stm r4!, {r0,r5,r6,r7,r9,r10,r11,lr} //35
+
+ bne .L1_52
+
+ ldr r0,[sp,#0] // count >> 3
+ mov r12,r0
+ ldr r0,[sp,#4] //r0 = dst
+ add r0,r0,r12,lsl #5 //dst += count >>3 << 5
+ str r0,[sp,#4] //save r0 into stack again
+.L1_140:
+//;;39 const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy);
+//;;40 for (i = (count & 7); i > 0; --i) {
+ tst r2,#7
+ beq .L1_184
+ ldr r0,[sp,#4] //r0 = currnt dst
+ and r2,r2,#7
+.L1_156:
+//;;41 //SkASSERT(*xx < (unsigned)s.fBitmap->width());
+//;;42 src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
+ ldrh r4,[r3],#2
+ add r12,r0,#4
+//;;43 }
+ subs r2,r2,#1
+ ldr r4,[r1,r4,lsl #2] //42
+ str r4,[r0,#0] //42
+ mov r0,r12 //42
+ bne .L1_156
+.L1_184:
+//;;44 }
+ add sp,sp,#0x14
+ pop {r4-r11,pc}
+
+.endfunc
+.size S32_Opaque_D32_nofilter_DX_gether, .-S32_Opaque_D32_nofilter_DX_gether
diff --git a/src/opts/SkBitmapProcState_opts_arm.cpp b/src/opts/SkBitmapProcState_opts_arm.cpp
index 20d62e1..f7b89a9 100644
--- a/src/opts/SkBitmapProcState_opts_arm.cpp
+++ b/src/opts/SkBitmapProcState_opts_arm.cpp
@@ -11,6 +11,11 @@
#include "SkColorPriv.h"
#include "SkUtils.h"
+#if defined(__ARM_HAVE_NEON)
+#include <arm_neon.h>
+#endif
+
+
#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
void SI8_D16_nofilter_DX_arm(
const SkBitmapProcState& s,
@@ -184,11 +189,201 @@ void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
}
#endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
+
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+void S16_opaque_D32_nofilter_DX_neon_asm(const SkBitmapProcState& s,
+ const uint32_t* __restrict__ xy,
+ int count, uint32_t* __restrict__ colors) {
+
+ const uint16_t* __restrict__ srcAddr = (const uint16_t*)s.fBitmap->getPixels();
+
+ uint16_t* index;
+ uint16_t src;
+ int i;
+
+ srcAddr = (const uint16_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes());
+
+ const uint16_t* __restrict__ xx = (const uint16_t*)(++xy);
+
+ if (1 == s.fBitmap->width()) {
+
+ src = srcAddr[0];
+ uint32_t dstValue = SkPixel16ToPixel32(src);
+ sk_memset32(colors, dstValue, count);
+ } else if ((xx[count-1] - xx[0]) == (count-1)) {
+ // No scaling
+ const uint16_t* src_data = (const uint16_t*)(srcAddr + xx[0]);
+ asm volatile (
+ "subs %[count], %[count], #8 \n\t" // count -= 8, set flag
+ "blt 2f \n\t" // if count < 0, branch to label 2
+ "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use.
+ "1: \n\t" // 8 loop
+ // Handle 8 pixels in one loop.
+
+ "vld1.u16 {q0}, [%[src_data]]! \n\t" // load eight src 565 pixels
+
+ "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2
+ "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3
+ "vmov.u16 q1, q8 \n\t" // copy alpha from q8
+ "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1
+ "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1
+ "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2
+ "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2
+ "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2
+ "vzip.16 q2, q1 \n\t" // interleave q1 and q2
+ "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst
+ "subs %[count], %[count], #8 \n\t" // count -= 8, set flag
+ "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q1 to dst
+
+ "bge 1b \n\t" // loop if count >= 0
+ "2: \n\t" // exit of 8 loop
+
+ "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed.
+ "blt 3f \n\t" // if count < 0, branch to label 3
+
+ // Handle 4 pixels at once
+
+ "vld1.u16 {d0}, [%[src_data]]! \n\t" // load eight src 565 pixels
+
+ "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2
+ "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1
+ "vmov.u16 d3, d16 \n\t" // copy alpha from d16
+ "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3
+ "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3
+ "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2
+ "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2
+ "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2
+ "vzip.16 d2, d3 \n\t" // interleave d2 and d3
+ "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst
+
+ "3: \n\t" // end
+ : [src_data] "+r" (src_data), [colors] "+r" (colors), [count] "+r" (count)
+ :
+ : "cc", "memory","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17"
+ );
+
+ for (i = (count & 3); i > 0; --i) {
+ *colors++ = SkPixel16ToPixel32(*src_data++);
+ }
+
+ } else {
+ // Scaling case
+ uint16_t data[8];
+
+ asm volatile (
+ "subs %[count], %[count], #8 \n\t" // count -= 8, set flag
+ "blt 2f \n\t" // if count < 0, branch to label 2
+ "vmov.u16 q8, #0xFF00 \n\t" // Load alpha value into q8 for later use.
+ "1: \n\t" // 8 loop
+ // Handle 8 pixels in one loop.
+ "ldmia %[xx]!, {r4, r5, r6, r7} \n\t" // load ptrs to pixels 0-7
+
+ "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer
+ "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer
+ "mov r6, r6, lsl #1 \n\t" // <<1 because of 16 bit pointer
+ "mov r7, r7, lsl #1 \n\t" // <<1 because of 16 bit pointer
+
+ "uxth r8, r4 \n\t" // extract ptr 0
+ "mov r4, r4, lsr #16 \n\t" // extract ptr 1
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image
+ "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image
+ "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register
+
+ "uxth r8, r5 \n\t" // extract ptr 2
+ "mov r5, r5, lsr #16 \n\t" // extract ptr 3
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image
+ "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image
+ "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register
+
+ "uxth r8, r6 \n\t" // extract ptr 4
+ "mov r6, r6, lsr #16 \n\t" // extract ptr 5
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
+ "ldrh r6, [%[srcAddr], r6] \n\t" // load pixel 5 from image
+ "pkhbt r6, r8, r6, lsl #16 \n\t" // combine pixel 4 and 5 in one register
+
+ "uxth r8, r7 \n\t" // extract ptr 6
+ "mov r7, r7, lsr #16 \n\t" // extract ptr 7
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 6 from image
+ "ldrh r7, [%[srcAddr], r7] \n\t" // load pixel 7 from image
+ "pkhbt r7, r8, r7, lsl #16 \n\t" // combine pixel 6 and 7 in one register
+
+ "stmia %[data], {r4, r5, r6, r7} \n\t" // store 8 src pixels
+
+ "vld1.u16 {q0}, [%[data]] \n\t" // load eight src 565 pixels
+
+ "vshl.u16 q2, q0, #5 \n\t" // put green in the 6 high bits of q2
+ "vshl.u16 q3, q0, #11 \n\t" // put blue in the 5 high bits of q3
+ "vmov.u16 q1, q8 \n\t" // copy alpha from q8
+ "vsri.u16 q1, q3, #8 \n\t" // put blue below alpha in q1
+ "vsri.u16 q1, q3, #13 \n\t" // put 3 MSB blue below blue in q1
+ "vsri.u16 q2, q2, #6 \n\t" // put 2 MSB green below green in q2
+ "vsri.u16 q2, q0, #8 \n\t" // put red below green in q2
+ "vsri.u16 q2, q0, #13 \n\t" // put 3 MSB red below red in q2
+ "vzip.16 q2, q1 \n\t" // interleave q1 and q2
+ "vst1.16 {d4, d5}, [%[colors]]! \n\t" // store q1 to dst
+ "subs %[count], %[count], #8 \n\t" // count -= 8, set flag
+ "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store q2 to dst
+
+ "bge 1b \n\t" // loop if count >= 0
+ "2: \n\t" // exit of 8 loop
+
+ "adds %[count], %[count], #4 \n\t" // add 4 to count to see if a 4 loop is needed.
+ "blt 3f \n\t" // if count < 0, branch to label 3
+
+ // Handle 4 pixels at once
+ "ldmia %[xx]!, {r4, r5} \n\t" // load ptrs to pixels 0-3
+
+ "mov r4, r4, lsl #1 \n\t" // <<1 because of 16 bit pointer
+ "mov r5, r5, lsl #1 \n\t" // <<1 because of 16 bit pointer
+
+ "uxth r8, r4 \n\t" // extract ptr 0
+ "mov r4, r4, lsr #16 \n\t" // extract ptr 1
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 0 from image
+ "ldrh r4, [%[srcAddr], r4] \n\t" // load pixel 1 from image
+ "pkhbt r4, r8, r4, lsl #16 \n\t" // combine pixel 0 and 1 in one register
+
+ "uxth r8, r5 \n\t" // extract ptr 2
+ "mov r5, r5, lsr #16 \n\t" // extract ptr 3
+ "ldrh r8, [%[srcAddr], r8] \n\t" // load pixel 2 from image
+ "ldrh r5, [%[srcAddr], r5] \n\t" // load pixel 3 from image
+ "pkhbt r5, r8, r5, lsl #16 \n\t" // combine pixel 2 and 3 in one register
+
+ "stmia %[data], {r4, r5} \n\t" // store 4 src pixels
+
+ "vld1.u16 {d0}, [%[data]] \n\t" // load eight src 565 pixels
+
+ "vshl.u16 d2, d0, #5 \n\t" // put green in the 6 high bits of d2
+ "vshl.u16 d1, d0, #11 \n\t" // put blue in the 5 high bits of d1
+ "vmov.u16 d3, d16 \n\t" // copy alpha from d16
+ "vsri.u16 d3, d1, #8 \n\t" // put blue below alpha in d3
+ "vsri.u16 d3, d1, #13 \n\t" // put 3 MSB blue below blue in d3
+ "vsri.u16 d2, d2, #6 \n\t" // put 2 MSB green below green in d2
+ "vsri.u16 d2, d0, #8 \n\t" // put red below green in d2
+ "vsri.u16 d2, d0, #13 \n\t" // put 3 MSB red below red in d2
+ "vzip.16 d2, d3 \n\t" // interleave d2 and d3
+ "vst1.16 {d2, d3}, [%[colors]]! \n\t" // store d2 and d3 to dst
+
+ "3: \n\t" // End
+ : [xx] "+r" (xx), [colors] "+r" (colors), [count] "+r" (count)
+ : [data] "r" (data), [srcAddr] "r" (srcAddr)
+ : "cc", "memory","r4","r5","r6","r7","r8","d0","d1","d2","d3","d4","d5","d6","d7","d16","d17"
+ );
+
+ for (i = (count & 3); i > 0; --i) {
+ src = srcAddr[*xx++]; *colors++ = SkPixel16ToPixel32(src);
+ }
+ }
+}
+#endif
+
+
///////////////////////////////////////////////////////////////////////////////
/* If we replace a sampleproc, then we null-out the associated shaderproc,
otherwise the shader won't even look at the matrix/sampler
*/
+
+
void SkBitmapProcState::platformProcs() {
bool doFilter = fDoFilter;
bool isOpaque = 256 == fAlphaScale;
@@ -214,6 +409,15 @@ void SkBitmapProcState::platformProcs() {
}
#endif
break;
+ case SkBitmap::kRGB_565_Config:
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+ if (justDx && !doFilter) {
+ if (isOpaque) {
+ fSampleProc32 = S16_opaque_D32_nofilter_DX_neon_asm;
+ }
+ }
+#endif
+ break;
default:
break;
}
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index dd8e406..361acbe 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -1,20 +1,33 @@
/*
- * Copyright 2009 The Android Open Source Project
+ * Copyright 2012 The Android Open Source Project
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
-#include "SkBlitRow.h"
#include "SkBlitMask.h"
+#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
+#include "SkUtils.h"
+
+#include "SkCachePreload_arm.h"
#if defined(__ARM_HAVE_NEON)
#include <arm_neon.h>
#endif
+extern "C" void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count,
+ U8CPU alpha);
+
+extern "C" void S32A_Blend_BlitRow32_arm_neon(SkPMColor* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count,
+ U8CPU alpha);
+
#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
@@ -29,6 +42,10 @@ static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
"vmov.u8 d31, #1<<7 \n\t"
"vld1.16 {q12}, [%[dst]] \n\t"
"vld4.8 {d0-d3}, [%[src]] \n\t"
+ // Thumb does not support the standard ARM conditional
+ // instructions but instead requires the 'it' instruction
+ // to signal conditional execution
+ "it eq \n\t"
"moveq ip, #8 \n\t"
"mov %[keep_dst], %[dst] \n\t"
@@ -470,15 +487,19 @@ static void S32A_D565_Opaque_v7(uint16_t* SK_RESTRICT dst,
: "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
);
}
-#define S32A_D565_Opaque_PROC S32A_D565_Opaque_v7
#define S32A_D565_Blend_PROC NULL
#define S32_D565_Blend_Dither_PROC NULL
#else
-#define S32A_D565_Opaque_PROC NULL
#define S32A_D565_Blend_PROC NULL
#define S32_D565_Blend_Dither_PROC NULL
#endif
+/*
+ * Use neon version of BLIT assembly code from S32A_D565_Opaque_arm.S, where we process
+ * 16 pixels at-a-time and also optimize for alpha=255 case.
+ */
+#define S32A_D565_Opaque_PROC NULL
+
/* Don't have a special version that assumes each src is opaque, but our S32A
is still faster than the default, so use it here
*/
@@ -663,6 +684,10 @@ TAIL:
#elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+/*
+ * User S32A_Opaque_BlitRow32 function from S32A_Opaque_BlitRow32.S
+ */
+#if 0
static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
@@ -786,6 +811,13 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
}
#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon
+#endif
+
+/*
+ * Use asm version of BlitRow function. Neon instructions are
+ * used for armv7 targets.
+ */
+#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm
#elif defined (__ARM_ARCH__) /* #if defined(__ARM_HAVE_NEON) && defined... */
@@ -1799,6 +1831,105 @@ static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
#define S32_D565_Opaque_Dither_PROC NULL
#endif
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+static void Color32_neon(SkPMColor* dst, const SkPMColor* src, int count,
+ SkPMColor color) {
+ if (count <= 0) {
+ return;
+ }
+
+ if (0 == color) {
+ if (src != dst) {
+ memcpy(dst, src, count * sizeof(SkPMColor));
+ }
+ return;
+ }
+
+ unsigned colorA = SkGetPackedA32(color);
+ if (255 == colorA) {
+ sk_memset32(dst, color, count);
+ } else {
+ unsigned scale = 256 - SkAlpha255To256(colorA);
+
+ if (count >= 8) {
+ // at the end of this assembly, count will have been decremented
+ // to a negative value. That is, if count mod 8 = x, it will be
+ // -8 +x coming out.
+ asm volatile (
+ PLD128(src, 0)
+
+ "vdup.32 q0, %[color] \n\t"
+
+ PLD128(src, 128)
+
+ // scale numerical interval [0-255], so load as 8 bits
+ "vdup.8 d2, %[scale] \n\t"
+
+ PLD128(src, 256)
+
+ "subs %[count], %[count], #8 \n\t"
+
+ PLD128(src, 384)
+
+ "Loop_Color32: \n\t"
+
+ // load src color, 8 pixels, 4 64 bit registers
+ // (and increment src).
+ "vld1.32 {d4-d7}, [%[src]]! \n\t"
+
+ PLD128(src, 384)
+
+ // multiply long by scale, 64 bits at a time,
+ // destination into a 128 bit register.
+ "vmull.u8 q4, d4, d2 \n\t"
+ "vmull.u8 q5, d5, d2 \n\t"
+ "vmull.u8 q6, d6, d2 \n\t"
+ "vmull.u8 q7, d7, d2 \n\t"
+
+ // shift the 128 bit registers, containing the 16
+ // bit scaled values back to 8 bits, narrowing the
+ // results to 64 bit registers.
+ "vshrn.i16 d8, q4, #8 \n\t"
+ "vshrn.i16 d9, q5, #8 \n\t"
+ "vshrn.i16 d10, q6, #8 \n\t"
+ "vshrn.i16 d11, q7, #8 \n\t"
+
+ // adding back the color, using 128 bit registers.
+ "vadd.i8 q6, q4, q0 \n\t"
+ "vadd.i8 q7, q5, q0 \n\t"
+
+ // store back the 8 calculated pixels (2 128 bit
+ // registers), and increment dst.
+ "vst1.32 {d12-d15}, [%[dst]]! \n\t"
+
+ "subs %[count], %[count], #8 \n\t"
+ "bge Loop_Color32 \n\t"
+ : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
+ : [color] "r" (color), [scale] "r" (scale)
+ : "cc", "memory",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
+ );
+ // At this point, if we went through the inline assembly, count is
+ // a negative value:
+ // if the value is -8, there is no pixel left to process.
+ // if the value is -7, there is one pixel left to process
+ // ...
+ // And'ing it with 7 will give us the number of pixels
+ // left to process.
+ count = count & 0x7;
+ }
+
+ while (count > 0) {
+ *dst = color + SkAlphaMulQ(*src, scale);
+ src += 1;
+ dst += 1;
+ count--;
+ }
+ }
+}
+#endif
+
///////////////////////////////////////////////////////////////////////////////
static const SkBlitRow::Proc platform_565_procs[] = {
@@ -1848,12 +1979,15 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
return platform_32_procs[flags];
}
+///////////////////////////////////////////////////////////////////////////////
SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
+ return Color32_neon;
+#else
return NULL;
+#endif
}
-///////////////////////////////////////////////////////////////////////////////
-
SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
SkMask::Format maskFormat,
SkColor color) {
diff --git a/src/opts/SkCachePreload_arm.h b/src/opts/SkCachePreload_arm.h
new file mode 100644
index 0000000..cff8c2a
--- /dev/null
+++ b/src/opts/SkCachePreload_arm.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2012 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#ifndef SkCachePreload_arm_DEFINED
+#define SkCachePreload_arm_DEFINED
+
+// This file defines macros for preload instructions for ARM. These macros
+// are designed to be embedded inside GNU inline assembly.
+// For the use of these macros, __ARM_USE_PLD needs to be enabled. The cache
+// line size also needs to be known (and needs to be contained inside
+// __ARM_CACHE_LINE_SIZE).
+#if defined(__ARM_USE_PLD)
+
+#define PLD(x, n) "pld [%["#x"], #("#n")]\n\t"
+
+#if __ARM_CACHE_LINE_SIZE == 32
+ #define PLD64(x, n) PLD(x, n) PLD(x, (n) + 32)
+#elif __ARM_CACHE_LINE_SIZE == 64
+ #define PLD64(x, n) PLD(x, n)
+#else
+ #error "unknown __ARM_CACHE_LINE_SIZE."
+#endif
+#else
+ // PLD is disabled, all macros become empty.
+ #define PLD(x, n)
+ #define PLD64(x, n)
+#endif
+
+#define PLD128(x, n) PLD64(x, n) PLD64(x, (n) + 64)
+
+#endif // SkCachePreload_arm_DEFINED
diff --git a/src/ports/SkFontHost_FreeType.cpp b/src/ports/SkFontHost_FreeType.cpp
index 6c03bf9..ea24a6e 100644
--- a/src/ports/SkFontHost_FreeType.cpp
+++ b/src/ports/SkFontHost_FreeType.cpp
@@ -12,6 +12,7 @@
#include "SkColorPriv.h"
#include "SkDescriptor.h"
#include "SkFDot6.h"
+#include "SkFloatingPoint.h"
#include "SkFontHost.h"
#include "SkMask.h"
#include "SkAdvancedTypefaceMetrics.h"
@@ -55,6 +56,7 @@
//#define DUMP_STRIKE_CREATION
//#define SK_GAMMA_APPLY_TO_A8
+//#define SK_GAMMA_SRGB
#ifndef SK_GAMMA_CONTRAST
#define SK_GAMMA_CONTRAST 0x66
@@ -130,8 +132,8 @@ InitFreetype() {
// Setup LCD filtering. This reduces colour fringes for LCD rendered
// glyphs.
#ifdef FT_LCD_FILTER_H
-// err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_DEFAULT);
- err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_LIGHT);
+ err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_DEFAULT);
+// err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_LIGHT);
gLCDSupport = err == 0;
if (gLCDSupport) {
gLCDExtra = 2; //DEFAULT and LIGHT add one pixel to each side.
@@ -640,6 +642,13 @@ static bool isAxisAligned(const SkScalerContext::Rec& rec) {
}
void SkFontHost::FilterRec(SkScalerContext::Rec* rec) {
+ //BOGUS: http://code.google.com/p/chromium/issues/detail?id=121119
+ //Cap the requested size as larger sizes give bogus values.
+ //Remove when http://code.google.com/p/skia/issues/detail?id=554 is fixed.
+ if (rec->fTextSize > SkIntToScalar(1 << 14)) {
+ rec->fTextSize = SkIntToScalar(1 << 14);
+ }
+
if (!gLCDSupportValid) {
InitFreetype();
FT_Done_FreeType(gFTLibrary);
@@ -656,7 +665,7 @@ void SkFontHost::FilterRec(SkScalerContext::Rec* rec) {
// collapse full->normal hinting if we're not doing LCD
h = SkPaint::kNormal_Hinting;
}
- if ((rec->fFlags & SkScalerContext::kSubpixelPositioning_Flag) || isLCD(*rec)) {
+ if ((rec->fFlags & SkScalerContext::kSubpixelPositioning_Flag)) {
if (SkPaint::kNo_Hinting != h) {
h = SkPaint::kSlight_Hinting;
}
@@ -781,7 +790,7 @@ SkScalerContext_FreeType::SkScalerContext_FreeType(const SkDescriptor* desc)
fUseVertMetrics = false;
{
FT_Int32 loadFlags = FT_LOAD_DEFAULT;
- bool linearMetrics = false;
+ bool linearMetrics = SkToBool(fRec.fFlags & SkScalerContext::kSubpixelPositioning_Flag);
if (SkMask::kBW_Format == fRec.fMaskFormat) {
// See http://code.google.com/p/chromium/issues/detail?id=43252#c24
@@ -798,7 +807,6 @@ SkScalerContext_FreeType::SkScalerContext_FreeType(const SkDescriptor* desc)
break;
case SkPaint::kSlight_Hinting:
loadFlags = FT_LOAD_TARGET_LIGHT; // This implies FORCE_AUTOHINT
- linearMetrics = true;
break;
case SkPaint::kNormal_Hinting:
if (fRec.fFlags & SkScalerContext::kAutohinting_Flag)
@@ -1115,16 +1123,17 @@ void SkScalerContext_FreeType::generateMetrics(SkGlyph* glyph) {
goto ERROR;
}
- if ((fRec.fFlags & SkScalerContext::kSubpixelPositioning_Flag) == 0) {
+ if (fDoLinearMetrics) {
+ glyph->fAdvanceX = SkFixedMul(fMatrix22.xx, fFace->glyph->linearHoriAdvance);
+ glyph->fAdvanceY = -SkFixedMul(fMatrix22.yx, fFace->glyph->linearHoriAdvance);
+ } else {
glyph->fAdvanceX = SkFDot6ToFixed(fFace->glyph->advance.x);
glyph->fAdvanceY = -SkFDot6ToFixed(fFace->glyph->advance.y);
+
if (fRec.fFlags & kDevKernText_Flag) {
glyph->fRsbDelta = SkToS8(fFace->glyph->rsb_delta);
glyph->fLsbDelta = SkToS8(fFace->glyph->lsb_delta);
}
- } else {
- glyph->fAdvanceX = SkFixedMul(fMatrix22.xx, fFace->glyph->linearHoriAdvance);
- glyph->fAdvanceY = -SkFixedMul(fMatrix22.yx, fFace->glyph->linearHoriAdvance);
}
if (fUseVertMetrics) {
@@ -1200,49 +1209,71 @@ void SkScalerContext_FreeType::generateMetrics(SkGlyph* glyph) {
///////////////////////////////////////////////////////////////////////////////
-static int apply_contrast(int srca, int contrast) {
- return srca + (((255 - srca) * contrast * srca) / (255*255));
+#ifdef SK_USE_COLOR_LUMINANCE
+
+static float apply_contrast(float srca, float contrast) {
+ return srca + ((1.0f - srca) * contrast * srca);
}
-static void build_power_table(uint8_t table[], float ee) {
- for (int i = 0; i < 256; i++) {
- float x = i / 255.f;
- x = powf(x, ee);
- int xx = SkScalarRoundToInt(SkFloatToScalar(x * 255));
- table[i] = SkToU8(xx);
+#ifdef SK_GAMMA_SRGB
+static float lin(float per) {
+ if (per <= 0.04045f) {
+ return per / 12.92f;
}
+ return powf((per + 0.055f) / 1.055, 2.4f);
}
-
-static void build_gamma_table(uint8_t table[256], int src, int dst) {
- static bool gInit;
- static uint8_t powTable[256], invPowTable[256];
- if (!gInit) {
- const float g = SK_GAMMA_EXPONENT;
- build_power_table(powTable, g);
- build_power_table(invPowTable, 1/g);
- gInit = true;
+static float per(float lin) {
+ if (lin <= 0.0031308f) {
+ return lin * 12.92f;
}
+ return 1.055f * powf(lin, 1.0f / 2.4f) - 0.055f;
+}
+#else //SK_GAMMA_SRGB
+static float lin(float per) {
+ const float g = SK_GAMMA_EXPONENT;
+ return powf(per, g);
+}
+static float per(float lin) {
+ const float g = SK_GAMMA_EXPONENT;
+ return powf(lin, 1.0f / g);
+}
+#endif //SK_GAMMA_SRGB
- const int linSrc = powTable[src];
- const int linDst = powTable[dst];
- // have our contrast value taper off to 0 as the src luminance becomes white
- const int contrast = SK_GAMMA_CONTRAST * (255 - linSrc) / 255;
-
- for (int i = 0; i < 256; ++i) {
- int srca = apply_contrast(i, contrast);
- SkASSERT((unsigned)srca <= 255);
- int dsta = 255 - srca;
-
- //Calculate the output we want.
- int linOut = (linSrc * srca + dsta * linDst) / 255;
- SkASSERT((unsigned)linOut <= 255);
- int out = invPowTable[linOut];
-
- //Undo what the blit blend will do.
- int result = ((255 * out) - (255 * dst)) / (src - dst);
- SkASSERT((unsigned)result <= 255);
+static void build_gamma_table(uint8_t table[256], int srcI) {
+ const float src = (float)srcI / 255.0f;
+ const float linSrc = lin(src);
+ const float linDst = 1.0f - linSrc;
+ const float dst = per(linDst);
- table[i] = result;
+ // have our contrast value taper off to 0 as the src luminance becomes white
+ const float contrast = SK_GAMMA_CONTRAST / 255.0f * linDst;
+ const float step = 1.0f / 256.0f;
+
+ //Remove discontinuity and instability when src is close to dst.
+ if (fabs(src - dst) < 0.01f) {
+ float rawSrca = 0.0f;
+ for (int i = 0; i < 256; ++i, rawSrca += step) {
+ float srca = apply_contrast(rawSrca, contrast);
+ table[i] = sk_float_round2int(255.0f * srca);
+ }
+ } else {
+ float rawSrca = 0.0f;
+ for (int i = 0; i < 256; ++i, rawSrca += step) {
+ float srca = apply_contrast(rawSrca, contrast);
+ SkASSERT(srca <= 1.0f);
+ float dsta = 1 - srca;
+
+ //Calculate the output we want.
+ float linOut = (linSrc * srca + dsta * linDst);
+ SkASSERT(linOut <= 1.0f);
+ float out = per(linOut);
+
+ //Undo what the blit blend will do.
+ float result = (out - dst) / (src - dst);
+ SkASSERT(sk_float_round2int(255.0f * result) <= 255);
+
+ table[i] = sk_float_round2int(255.0f * result);
+ }
}
}
@@ -1250,10 +1281,10 @@ static const uint8_t* getGammaTable(U8CPU luminance) {
static uint8_t gGammaTables[4][256];
static bool gInited;
if (!gInited) {
- build_gamma_table(gGammaTables[0], 0x00, 0xFF);
- build_gamma_table(gGammaTables[1], 0x66, 0x99);
- build_gamma_table(gGammaTables[2], 0x99, 0x66);
- build_gamma_table(gGammaTables[3], 0xFF, 0x00);
+ build_gamma_table(gGammaTables[0], 0x00);
+ build_gamma_table(gGammaTables[1], 0x55);
+ build_gamma_table(gGammaTables[2], 0xAA);
+ build_gamma_table(gGammaTables[3], 0xFF);
gInited = true;
}
@@ -1261,7 +1292,7 @@ static const uint8_t* getGammaTable(U8CPU luminance) {
return gGammaTables[luminance >> 6];
}
-#ifndef SK_USE_COLOR_LUMINANCE
+#else //SK_USE_COLOR_LUMINANCE
static const uint8_t* getIdentityTable() {
static bool gOnce;
static uint8_t gIdentityTable[256];
@@ -1273,7 +1304,7 @@ static const uint8_t* getIdentityTable() {
}
return gIdentityTable;
}
-#endif
+#endif //SK_USE_COLOR_LUMINANCE
static uint16_t packTriple(unsigned r, unsigned g, unsigned b) {
return SkPackRGB16(r >> 3, g >> 2, b >> 3);
diff --git a/src/ports/SkFontHost_linux.cpp b/src/ports/SkFontHost_linux.cpp
index be99576..64fa2a3 100644
--- a/src/ports/SkFontHost_linux.cpp
+++ b/src/ports/SkFontHost_linux.cpp
@@ -464,53 +464,43 @@ static void load_system_fonts() {
///////////////////////////////////////////////////////////////////////////////
void SkFontHost::Serialize(const SkTypeface* face, SkWStream* stream) {
-#if 0
- const char* name = ((FamilyTypeface*)face)->getUniqueString();
-
- stream->write8((uint8_t)face->getStyle());
-
- if (NULL == name || 0 == *name) {
- stream->writePackedUInt(0);
- // SkDebugf("--- fonthost serialize null\n");
- } else {
- uint32_t len = strlen(name);
- stream->writePackedUInt(len);
- stream->write(name, len);
- // SkDebugf("--- fonthost serialize <%s> %d\n", name, face->getStyle());
- }
-#endif
- sk_throw();
+ SkStream* fontStream = ((FamilyTypeface*)face)->openStream();
+
+ // store the length of the custom font
+ uint32_t len = fontStream->getLength();
+ stream->write32(len);
+
+ // store the entire font in the serialized stream
+ void* fontData = malloc(len);
+
+ fontStream->read(fontData, len);
+ stream->write(fontData, len);
+
+ fontStream->unref();
+ free(fontData);
+
+
+// sk_throw();
}
SkTypeface* SkFontHost::Deserialize(SkStream* stream) {
-#if 0
load_system_fonts();
-
- int style = stream->readU8();
-
- int len = stream->readPackedUInt();
- if (len > 0) {
- SkString str;
- str.resize(len);
- stream->read(str.writable_str(), len);
-
- const FontInitRec* rec = gSystemFonts;
- for (size_t i = 0; i < SK_ARRAY_COUNT(gSystemFonts); i++) {
- if (strcmp(rec[i].fFileName, str.c_str()) == 0) {
- // backup until we hit the fNames
- for (int j = i; j >= 0; --j) {
- if (rec[j].fNames != NULL) {
- return SkFontHost::CreateTypeface(NULL, rec[j].fNames[0], NULL, 0,
- (SkTypeface::Style)style);
- }
- }
- }
- }
- }
- return SkFontHost::CreateTypeface(NULL, NULL, NULL, 0, (SkTypeface::Style)style);
-#endif
- sk_throw();
- return NULL;
+
+ // read the length of the custom font from the stream
+ uint32_t len = stream->readU32();
+
+ // generate a new stream to store the custom typeface
+ SkMemoryStream* fontStream = new SkMemoryStream(len);
+ stream->read((void*)fontStream->getMemoryBase(), len);
+
+ SkTypeface* face = CreateTypefaceFromStream(fontStream);
+
+ fontStream->unref();
+
+ return face;
+
+// sk_throw();
+// return NULL;
}
///////////////////////////////////////////////////////////////////////////////