/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of The Linux Foundation nor the names of its contributors may
* be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
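// Prefetch tuning knobs: PLDSIZE is the block size moved per main-loop
// iteration (128 bytes, two 64-byte cache lines); PLDOFFS is how many
// blocks ahead of the copy cursor to prefetch; PLDTHRESH is the copy size
// in blocks at or below which prefetching is skipped entirely; BBTHRESH is
// the size at or below which the simple prime-the-pump startup is used
// instead of the distance-based setup in the large-copy path.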
#ifdef PLDOFFS
#undef PLDOFFS
#endif
#define PLDOFFS (16)
#ifdef PLDTHRESH
#undef PLDTHRESH
#endif
#define PLDTHRESH (PLDOFFS)
#ifdef BBTHRESH
#undef BBTHRESH
#endif
#define BBTHRESH (2048/128)
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif
#ifdef PLDSIZE
#undef PLDSIZE
#endif
#define PLDSIZE (128)
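// kryo_bb_memcpy: memcpy tuned for Qualcomm Kryo cores, with the usual
// AAPCS64 arguments: x0 = destination, x1 = source, x2 = byte count.
// The original destination pointer is preserved in x11 and returned in x0.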
kryo_bb_memcpy:
mov x11, x0 // stash dst so it can be returned in x0 at exit
cmp x2, #4
blo kryo_bb_lt4
cmp x2, #16
blo kryo_bb_lt16
cmp x2, #32
blo kryo_bb_16
cmp x2, #64
blo kryo_bb_copy_32_a
cmp x2, #128
blo kryo_bb_copy_64_a
// count is at least 128 here, so up to 127 bytes can be spent aligning the source to a 128-byte boundary
neg x3, x1 // calculate count to get SOURCE aligned
ands x3, x3, #0x7F
b.eq kryo_bb_source_aligned // already aligned
// alignment fixup, small to large: test each bit of x3 and copy 1/2/4/8/16/32/64 bytes as needed
tbz x3, #0, 1f
ldrb w5, [x1], #1
strb w5, [x0], #1
1: tbz x3, #1, 2f
ldrh w6, [x1], #2
strh w6, [x0], #2
2: tbz x3, #2, 3f
ldr w8, [x1], #4
str w8, [x0], #4
3: tbz x3, #3, 4f
ldr x9, [x1], #8
str x9, [x0], #8
4: tbz x3, #4, 5f
ldr q7, [x1], #16
str q7, [x0], #16
5: tbz x3, #5, 55f
ldp q0, q1, [x1], #32
stp q0, q1, [x0], #32
55: tbz x3, #6, 6f
ldp q0, q1, [x1], #32
ldp q2, q3, [x1], #32
stp q0, q1, [x0], #32
stp q2, q3, [x0], #32
6: subs x2, x2, x3 // fixup count after alignment
b.eq kryo_bb_exit
cmp x2, #128
blo kryo_bb_copy_64_a
kryo_bb_source_aligned:
lsr x12, x2, #7 // x12 = number of whole 128-byte blocks
cmp x12, #PLDTHRESH
bls kryo_bb_copy_128_loop_nopld // short copy: not worth prefetching
cmp x12, #BBTHRESH
bls kryo_bb_prime_pump // modest copy: simple prefetch startup
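// Large copy: derive, from the destination/source offset reduced modulo
// 2KB, how many blocks to run with doubled prefetch streams before
// settling into the steady-state loop. On exit from this setup:
// x9 = lead-in block count, x12 = steady-state block count, x10 = cursor
// for the PLDL1KEEP prefetch stream, x14 = byte distance covered by
// prefetches that will already be in flight at the end.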
add x14, x0, #0x400
add x9, x1, #(PLDOFFS*PLDSIZE)
sub x14, x14, x9
lsl x14, x14, #(21+32) // keep only the low 11 bits,
lsr x14, x14, #(21+32) // i.e. reduce the distance modulo 2KB
add x14, x14, #(PLDOFFS*PLDSIZE)
cmp x12, x14, lsr #7
bls kryo_bb_prime_pump
mov x9, #(PLDOFFS)
lsr x13, x14, #7
subs x9, x13, x9
bls kryo_bb_prime_pump
add x10, x1, x14
bic x10, x10, #0x7F // Round to multiple of PLDSIZE
sub x12, x12, x14, lsr #7
cmp x9, x12
sub x13, x12, x9
csel x12, x13, x12, LS // lead-in fits: steady state runs x12 - x9 blocks
csel x9, x12, x9, HI // lead-in would overrun: do all x12 blocks in it
csel x12, xzr, x12, HI // ... and skip the steady-state loop entirely
prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]
prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]
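// Lead-in loop: each 128-byte block issues streaming prefetches ahead of
// the source cursor as well as PLDL1KEEP prefetches along x10.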
kryo_bb_copy_128_loop_outer_doublepld:
prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE+64)]
subs x9, x9, #1
ldp q0, q1, [x1], #32
ldp q2, q3, [x1], #32
ldp q4, q5, [x1], #32
ldp q6, q7, [x1], #32
prfm PLDL1KEEP, [x10]
prfm PLDL1KEEP, [x10, #64]
add x10, x10, #128
stp q0, q1, [x0], #32
stp q2, q3, [x0], #32
stp q4, q5, [x0], #32
stp q6, q7, [x0], #32
bne kryo_bb_copy_128_loop_outer_doublepld
cmp x12, #0
beq kryo_bb_pop_before_nopld // lead-in consumed the whole copy
cmp x12, #(448*1024/128) // more than 448KB still to go?
bls kryo_bb_copy_128_loop_outer
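// DDR loop: a copy this large presumably misses in the caches, so a plain
// load from x10 is used to pull each upcoming block in from memory
// instead of relying on the prefetcher.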
kryo_bb_copy_128_loop_ddr:
subs x12, x12, #1
ldr x3, [x10], #128 // plain load from the next block acts as a blocking prefetch
ldp q0, q1, [x1], #32
ldp q2, q3, [x1], #32
ldp q4, q5, [x1], #32
ldp q6, q7, [x1], #32
stp q0, q1, [x0], #32
stp q2, q3, [x0], #32
stp q4, q5, [x0], #32
stp q6, q7, [x0], #32
bne kryo_bb_copy_128_loop_ddr
b kryo_bb_pop_before_nopld
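// Prime the pump: for moderately sized copies, issue one pair of
// prefetches to start the stream, then fall into the steady-state loop.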
kryo_bb_prime_pump:
mov x14, #(PLDOFFS*PLDSIZE)
add x10, x1, #(PLDOFFS*PLDSIZE)
bic x10, x10, #0x7F
sub x12, x12, #PLDOFFS
prfm PLDL1KEEP, [x10, #(-1*PLDSIZE)]
prfm PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
cmp x12, #(448*1024/128)
bhi kryo_bb_copy_128_loop_ddr
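// Steady-state loop: one pair of PLDL1KEEP prefetches along the x10
// cursor per 128-byte block copied.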
kryo_bb_copy_128_loop_outer:
subs x12, x12, #1
prfm PLDL1KEEP, [x10]
prfm PLDL1KEEP, [x10, #64]
ldp q0, q1, [x1], #32
ldp q2, q3, [x1], #32
ldp q4, q5, [x1], #32
ldp q6, q7, [x1], #32
add x10, x10, #128
stp q0, q1, [x0], #32
stp q2, q3, [x0], #32
stp q4, q5, [x0], #32
stp q6, q7, [x0], #32
bne kryo_bb_copy_128_loop_outer
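// The final x14 bytes already have prefetches in flight, so copy them
// without issuing any more.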
kryo_bb_pop_before_nopld:
lsr x12, x14, #7 // blocks remaining whose prefetches were already issued
kryo_bb_copy_128_loop_nopld:
ldp q0, q1, [x1], #32
ldp q2, q3, [x1], #32
ldp q4, q5, [x1], #32
ldp q6, q7, [x1], #32
subs x12, x12, #1
stp q0, q1, [x0], #32
stp q2, q3, [x0], #32
stp q4, q5, [x0], #32
stp q6, q7, [x0], #32
bne kryo_bb_copy_128_loop_nopld
ands x2, x2, #0x7f // tail bytes left after the 128-byte loops
beq kryo_bb_exit
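// Tail: binary-weighted copy of the remaining 0-127 bytes, testing one
// bit of the count at each step.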
kryo_bb_copy_64_a:
tbz x2, #6, kryo_bb_copy_32_a
ldp q0, q1, [x1], #32
ldp q2, q3, [x1], #32
stp q0, q1, [x0], #32
stp q2, q3, [x0], #32
kryo_bb_copy_32_a:
tbz x2, #5, kryo_bb_16
ldp q0, q1, [x1], #32
stp q0, q1, [x0], #32
kryo_bb_16:
tbz x2, #4, kryo_bb_lt16
ldr q7, [x1], #16
str q7, [x0], #16
ands x2, x2, #0x0f
beq kryo_bb_exit
kryo_bb_lt16:
tbz x2, #3, kryo_bb_lt8
ldr x3, [x1], #8
str x3, [x0], #8
kryo_bb_lt8:
tbz x2, #2, kryo_bb_lt4
ldr w3, [x1], #4
str w3, [x0], #4
kryo_bb_lt4:
tbz x2, #1, kryo_bb_lt2
ldrh w3, [x1], #2
strh w3, [x0], #2
kryo_bb_lt2:
tbz x2, #0, kryo_bb_exit
ldrb w3, [x1], #1
strb w3, [x0], #1
kryo_bb_exit:
mov x0, x11 // return the original destination pointer
ret