/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of The Linux Foundation nor the names of its contributors may
 *       be used to endorse or promote products derived from this software
 *       without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
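
/*
 * Kryo-tuned memcpy body: defines kryo_bb_memcpy, presumably pulled in
 * by an including wrapper that publishes it as memcpy (the usual bionic
 * memcpy_base.S arrangement).  The register interface follows the
 * AAPCS64 memcpy contract: x0 = dst, x1 = src, x2 = byte count; the
 * original dst is returned in x0.
 *
 * Strategy: copies under 128 bytes branch straight to the bit-tested
 * tail.  Larger copies first 128-byte align the source, then move
 * 128 bytes per iteration, with a prefetch scheme picked by copy size.
 */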

#ifdef PLDOFFS
#undef PLDOFFS
#endif
#define PLDOFFS		(16)

#ifdef PLDTHRESH
#undef PLDTHRESH
#endif
#define PLDTHRESH (PLDOFFS)

#ifdef BBTHRESH
#undef BBTHRESH
#endif
#define BBTHRESH (2048/128)

#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

#ifdef PLDSIZE
#undef PLDSIZE
#endif
#define PLDSIZE	(128)
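
/*
 * Summary of the tuning knobs above:
 *   PLDSIZE   - bytes prefetched per loop iteration (two 64-byte lines).
 *   PLDOFFS   - prefetch distance in PLDSIZE blocks, i.e. 2 KiB ahead.
 *   PLDTHRESH - copies of this many 128-byte blocks or fewer skip
 *               prefetching entirely.
 *   BBTHRESH  - 2048/128 = 16 blocks; at or below this the simple
 *               prime-pump startup is used instead of the windowed
 *               double-prefetch startup.
 */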

kryo_bb_memcpy:
	mov	x11, x0			// preserve dst; restored into x0 at exit
	cmp	x2, #4
	blo	kryo_bb_lt4
	cmp	x2, #16
	blo	kryo_bb_lt16
	cmp	x2, #32
	blo	kryo_bb_16
	cmp	x2, #64
	blo	kryo_bb_copy_32_a
	cmp	x2, #128
	blo	kryo_bb_copy_64_a

	// at least 128 bytes remain, so at most 127 are needed to
	// 128-byte align the source
	neg	x3, x1			// calculate count to get SOURCE aligned
	ands	x3, x3, #0x7F
	b.eq	kryo_bb_source_aligned	// already aligned
	// alignment fixup, small to large (favorable alignment)
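	// x3 is the byte count to the next 128-byte source boundary; each
	// tbz below tests one bit of x3 and copies that power-of-two chunk
	// (1, 2, 4, 8, 16, 32, then 64 bytes)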
	tbz	x3, #0, 1f
	ldrb	w5, [x1], #1
	strb	w5, [x0], #1
1:	tbz	x3, #1, 2f
	ldrh	w6, [x1], #2
	strh	w6, [x0], #2
2:	tbz	x3, #2, 3f
	ldr	w8, [x1], #4
	str	w8, [x0], #4
3:	tbz	x3, #3, 4f
	ldr	x9, [x1], #8
	str	x9, [x0], #8
4:	tbz	x3, #4, 5f
	ldr	q7, [x1], #16
	str	q7, [x0], #16
5:	tbz	x3, #5, 55f
	ldp	q0, q1, [x1], #32
	stp	q0, q1, [x0], #32
55:	tbz	x3, #6, 6f
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
6:	subs	x2, x2, x3		// fixup count after alignment
	b.eq	kryo_bb_exit
	cmp	x2, #128
	blo	kryo_bb_copy_64_a
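
	// source is 128-byte aligned and at least 128 bytes remain.
	// x12 counts 128-byte blocks: short copies skip prefetch entirely;
	// otherwise a startup window, apparently derived from the dst/src
	// offset, decides how many iterations run with doubled prefetch
	// before the steady-state loop.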
kryo_bb_source_aligned:
	lsr	x12, x2, #7		// x12 = number of 128-byte blocks
	cmp	x12, #PLDTHRESH
	bls	kryo_bb_copy_128_loop_nopld	// too short to benefit from prefetch

	cmp	x12, #BBTHRESH
	bls	kryo_bb_prime_pump	// small enough for the fixed-window startup

	add	x14, x0, #0x400			// x14 = dst + 1 KiB
	add	x9,  x1, #(PLDOFFS*PLDSIZE)	// x9 = first prefetch address (src + 2 KiB)
	sub	x14, x14, x9
	lsl	x14, x14, #(21+32)		// keep the low 11 bits:
	lsr	x14, x14, #(21+32)		// x14 = (dst + 1 KiB - src - 2 KiB) mod 2 KiB
	add	x14, x14, #(PLDOFFS*PLDSIZE)	// bias into [2 KiB, 4 KiB): startup window size
	cmp	x12, x14, lsr #7		// copy shorter than the window?
	bls	kryo_bb_prime_pump

	mov	x9, #(PLDOFFS)
	lsr	x13, x14, #7		// window size in 128-byte blocks
	subs	x9, x13, x9		// x9 = window blocks beyond the first PLDOFFS
	bls	kryo_bb_prime_pump

	add	x10, x1, x14
	bic	x10, x10, #0x7F		// x10 = first PLDL1KEEP target, rounded to PLDSIZE

	sub	x12, x12, x14, lsr #7	// x12 = blocks left outside the startup window
	cmp	x9, x12			// flags survive the flag-free sub/csel below
	sub	x13, x12, x9
	csel	x12, x13, x12, LS	// x9 <= x12: main loop copies the difference
	csel	x9, x12, x9, HI		// x9 >  x12: cap the startup loop at x12 ...
	csel	x12, xzr, x12, HI	// ... and leave nothing for the main loop

	prfm	PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]	// prime the stream
	prfm	PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]	// one block early
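	// startup loop: copy 128 bytes per iteration while running both the
	// streaming prefetch (PLDOFFS blocks ahead of x1) and the distant
	// PLDL1KEEP prefetch at x10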
kryo_bb_copy_128_loop_outer_doublepld:
	prfm	PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
	prfm	PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)+64]
	subs	x9, x9, #1
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	ldp	q4, q5, [x1], #32
	ldp	q6, q7, [x1], #32
	prfm	PLDL1KEEP, [x10]
	prfm	PLDL1KEEP, [x10, #64]
	add	x10, x10, #128
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
	stp	q4, q5, [x0], #32
	stp	q6, q7, [x0], #32
	bne	kryo_bb_copy_128_loop_outer_doublepld
	cmp	x12, #0
	beq	kryo_bb_pop_before_nopld
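	// past ~448 KiB of remaining data (apparently beyond what the
	// caches cover) prfm is swapped for a real load of one word per
	// block, a more forceful way to pull far data toward the core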
	cmp	x12, #(448*1024/128)
	bls	kryo_bb_copy_128_loop_outer

kryo_bb_copy_128_loop_ddr:
	subs	x12, x12, #1
	ldr	x3, [x10], #128		// load-as-prefetch: touch one word per block
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	ldp	q4, q5, [x1], #32
	ldp	q6, q7, [x1], #32
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
	stp	q4, q5, [x0], #32
	stp	q6, q7, [x0], #32
	bne	kryo_bb_copy_128_loop_ddr
	b	kryo_bb_pop_before_nopld

kryo_bb_prime_pump:
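	// mid-size startup: fixed 2 KiB window, first PLDL1KEEP pair issued
	// one block below x10; the window's PLDOFFS blocks are copied at the
	// end by the no-prefetch loop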
	mov	x14, #(PLDOFFS*PLDSIZE)
	add	x10, x1, #(PLDOFFS*PLDSIZE)
	bic	x10, x10, #0x7F
	sub	x12, x12, #PLDOFFS
	prfm	PLDL1KEEP, [x10, #(-1*PLDSIZE)]
	prfm	PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
	cmp	x12, #(448*1024/128)
	bhi	kryo_bb_copy_128_loop_ddr

kryo_bb_copy_128_loop_outer:
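	// steady-state loop: PLDL1KEEP both 64-byte lines of the block at
	// x10 (about x14 bytes ahead of the loads), then copy 128 bytes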
	subs	x12, x12, #1
	prfm	PLDL1KEEP, [x10]
	prfm	PLDL1KEEP, [x10, #64]
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	ldp	q4, q5, [x1], #32
	ldp	q6, q7, [x1], #32
	add	x10, x10, #128
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
	stp	q4, q5, [x0], #32
	stp	q6, q7, [x0], #32
	bne	kryo_bb_copy_128_loop_outer

kryo_bb_pop_before_nopld:
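	// drain the startup window: its blocks were prefetched but never
	// copied, so run them through the no-prefetch loop below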
	lsr	x12, x14, #7		// x12 = startup window size in blocks
kryo_bb_copy_128_loop_nopld:
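	// plain 128-byte copy loop, no prefetch (short copies land here
	// directly; prefetched tails fall through from above)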
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	ldp	q4, q5, [x1], #32
	ldp	q6, q7, [x1], #32
	subs	x12, x12, #1
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
	stp	q4, q5, [x0], #32
	stp	q6, q7, [x0], #32
	bne	kryo_bb_copy_128_loop_nopld
	ands	x2, x2, #0x7f		// bytes remaining below 128
	beq	kryo_bb_exit

kryo_bb_copy_64_a:
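	// tail: fewer than 128 bytes remain; each tbz tests one bit of x2
	// and copies 64, 32, 16, 8, 4, 2, then 1 bytes, falling through
	// the labels below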
	tbz	x2, #6, kryo_bb_copy_32_a
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
kryo_bb_copy_32_a:
	tbz	x2, #5, kryo_bb_16
	ldp	q0, q1, [x1], #32
	stp	q0, q1, [x0], #32
kryo_bb_16:
	tbz	x2, #4, kryo_bb_lt16
	ldr	q7, [x1], #16
	str	q7, [x0], #16
	ands	x2, x2, #0x0f
	beq	kryo_bb_exit
kryo_bb_lt16:
	tbz	x2, #3, kryo_bb_lt8
	ldr	x3, [x1], #8
	str	x3, [x0], #8
kryo_bb_lt8:
	tbz	x2, #2, kryo_bb_lt4
	ldr	w3, [x1], #4
	str	w3, [x0], #4
kryo_bb_lt4:
	tbz	x2, #1, kryo_bb_lt2
	ldrh	w3, [x1], #2
	strh	w3, [x0], #2
kryo_bb_lt2:
	tbz	x2, #0, kryo_bb_exit
	ldrb	w3, [x1], #1
	strb	w3, [x0], #1
kryo_bb_exit:
	mov	x0, x11
	ret