summaryrefslogtreecommitdiffstats
path: root/libc/arch-arm/bionic/memcpy_8650A.S
blob: 69b4885da6c9249acd25e8a0f795f8af01b60dfd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Copyright (c) 2010, Code Aurora Forum. All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

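/***************************************************************************
  Neon memcpy: Copies n bytes from src to dest. Blocks of 16 bytes or more
  are moved through Neon registers; the remaining tail (fewer than 16
  bytes) is moved through core registers.
     Inputs:
        dest: The destination buffer (r0)
        src: The source buffer (r1)
        n: The size of the buffer to transfer, in bytes (r2)
     Outputs:
        r0: The original dest pointer
***************************************************************************/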

/*
 * Prefetch tuning for the 128-byte copy loop:
 *   PLDOFFS - how many PLDSIZE-byte blocks ahead of the current source
 *             position the loop's pld instruction targets.
 *   PLDSIZE - prefetch stride in bytes; the loop body is written for a
 *             value of 128 only.
 */
#define PLDOFFS	(3)	/* For 8650a (set to 6 for 8660) */
#define PLDSIZE	(128)	/* L2 cache line size */

	.code 32
	.align 5
	.globl memcpy
	/* Mark the symbol as a function so the linker can interwork calls to it. */
	.type memcpy, %function
	.func

/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * In:    r0 = dest, r1 = src, r2 = n (byte count, treated as unsigned)
 * Out:   r0 = dest (saved on the stack at entry, restored at exit)
 * Clobb: r1, r2, r3, r12, q0-q3, q8-q11, condition flags
 * Stack: one word (the saved r0); leaf function, makes no calls.
 *
 * Strategy: 128-byte Neon blocks (with pld prefetch when more than PLDOFFS
 * blocks are pending), then 32-byte Neon blocks, then one 16-byte Neon
 * block, then an 8/4/2/1-byte tail selected by the low four bits of n.
 *
 * NOTE(review): no alignment checks or alignment hints are used; the tail
 * issues ldr/ldrh/str/strh on whatever addresses remain. This presumably
 * relies on unaligned accesses being permitted on the target - confirm.
 */
memcpy:
	push		{r0}			/* keep dest for the return value */
	/* n is a size_t: dispatch with unsigned conditions so that counts */
	/* of 0x80000000 and above are not mistaken for "fewer than 4". */
	cmp		r2, #4
	blo		.Lneon_lt4
	cmp		r2, #16
	blo		.Lneon_lt16
	cmp		r2, #32
	blo		.Lneon_16
	cmp		r2, #128
	blo		.Lneon_copy_32_a
	/* Copy blocks of 128 bytes at a time. */
	/* Code below is optimized for PLDSIZE=128 only */
	mov		r12, r2, lsr #7		/* r12 = pending 128-byte blocks, >= 1 */
	cmp		r12, #PLDOFFS
	bls		.Lneon_copy_128_loop_nopld	/* too few blocks to prefetch ahead */
	sub		r12, #PLDOFFS		/* last PLDOFFS blocks run without pld */
	pld		[r1, #(PLDOFFS-1)*PLDSIZE]
.Lneon_copy_128_loop_outer:
	pld		[r1, #(PLDOFFS*PLDSIZE)]
	vld1.32		{q0, q1}, [r1]!
	vld1.32		{q2, q3}, [r1]!
	vld1.32		{q8, q9}, [r1]!
	vld1.32		{q10, q11}, [r1]!
	subs		r12, r12, #1		/* flags consumed by the bne below */
	vst1.32		{q0, q1}, [r0]!
	vst1.32		{q2, q3}, [r0]!
	vst1.32		{q8, q9}, [r0]!
	vst1.32		{q10, q11}, [r0]!
	bne		.Lneon_copy_128_loop_outer
	mov		r12, #PLDOFFS		/* blocks left for the no-prefetch loop */
.Lneon_copy_128_loop_nopld:
	vld1.32		{q0, q1}, [r1]!
	vld1.32		{q2, q3}, [r1]!
	vld1.32		{q8, q9}, [r1]!
	vld1.32		{q10, q11}, [r1]!
	subs		r12, r12, #1
	vst1.32		{q0, q1}, [r0]!
	vst1.32		{q2, q3}, [r0]!
	vst1.32		{q8, q9}, [r0]!
	vst1.32		{q10, q11}, [r0]!
	bne		.Lneon_copy_128_loop_nopld
	ands		r2, r2, #0x7f		/* r2 = bytes left after the 128-byte blocks */
	beq		.Lneon_exit
	cmp		r2, #32
	blo		.Lneon_16
	nop
	/* Copy blocks of 32 bytes at a time; r2 >= 32 on every path here. */
.Lneon_copy_32_a:
	mov		r12, r2, lsr #5		/* r12 = pending 32-byte blocks, >= 1 */
.Lneon_copy_32_loop_a:
	vld1.32		{q0,q1}, [r1]!
	subs		r12, r12, #1
	vst1.32		{q0,q1}, [r0]!
	bne		.Lneon_copy_32_loop_a
	ands		r2, r2, #0x1f		/* r2 = bytes left after the 32-byte blocks */
	beq		.Lneon_exit
	/* 1..31 bytes remain: copy one 16-byte block if there is room. */
.Lneon_16:
	subs		r2, r2, #16
	blo		.Lneon_lt16		/* borrow: fewer than 16 bytes remained */
	vld1.32		{q8}, [r1]!
	vst1.32		{q8}, [r0]!
	beq		.Lneon_exit		/* Z still from the subs: exactly 16 remained */
	/* Fewer than 16 bytes remain. r2 may have wrapped below zero above, */
	/* but its low four bits still hold the remaining count. */
.Lneon_lt16:
	movs		r12, r2, lsl #29	/* C = bit 3 (8 bytes), N = bit 2 (4 bytes) */
	bcc		.Lneon_skip8
	ldr		r3, [r1], #4
	ldr		r12, [r1], #4
	str		r3, [r0], #4
	str		r12, [r0], #4
.Lneon_skip8:
	bpl		.Lneon_lt4		/* the ldr/str above leave the flags intact */
	ldr		r3, [r1], #4
	str		r3, [r0], #4
.Lneon_lt4:
	movs		r2, r2, lsl #31		/* C = bit 1 (2 bytes), N = bit 0 (1 byte) */
	bcc		.Lneon_lt2
	ldrh		r3, [r1], #2
	strh		r3, [r0], #2
.Lneon_lt2:
	bpl		.Lneon_exit
	ldrb		r12, [r1]
	strb		r12, [r0]
.Lneon_exit:
	pop		{r0}			/* return value = original dest */
	bx		lr
	.size memcpy, .-memcpy

	.endfunc
	.end