;  libFLAC/ppc/as/lpc_asm.s

;  libFLAC - Free Lossless Audio Codec library
;  Copyright (C) 2004,2005,2006,2007  Josh Coalson
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;
;  - Redistributions of source code must retain the above copyright
;  notice, this list of conditions and the following disclaimer.
;
;  - Redistributions in binary form must reproduce the above copyright
;  notice, this list of conditions and the following disclaimer in the
;  documentation and/or other materials provided with the distribution.
;
;  - Neither the name of the Xiph.org Foundation nor the names of its
;  contributors may be used to endorse or promote products derived from
;  this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; Everything below is emitted into the code section.
.text
	; NOTE(review): presumably a power-of-two alignment (2^2 = 4 bytes, one
	; PowerPC instruction) under this assembler -- confirm
	.align 2
; Export the general entry point.
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16

; Export the reduced entry point (see its header: assumes order<=8).
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
; For each of the data_len output words this routine stores
;	*residual + (sum(coefficient * history) >> lp_quantization)
; to data[], then shifts that new word into the history.
;
; Arguments on entry:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]
;
; Register use inside the routine:
;	r0:      caller's VRSAVE value (restored on exit)
;	r4:      data + data_len (end pointer, in bytes)
;	r6:      order scaled to bytes
;	r9:      quadword-aligned scratch address below the stack pointer
;	r10:     qlp_coeff + order (end of coefficients), later a load offset
;	r11:     pointer used to load history quadwords below data[]
;	r31:     scratch / address of the unrolled-chain entry (saved, restored)
;	ctr:     entry point into the unrolled multiply chain
;	cr0:     loop and setup comparisons
;	v0-v7:   coefficients, word order reversed, last vector masked
;	v8-v15:  history (v8 holds the four words immediately before data[])
;	v16-v19: setup temporaries, then loop invariants (see L1199)
;	v20-v21: per-sample sum and temporaries

; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
; this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
; bps<=15 for mid-side coding, since that uses an extra bit)

; this should be fast; the inner loop is unrolled (it takes no more than
; 3 instructions per group of four coefficients, all of which are arithmetic),
; and all of the coefficients and all relevant history stay in registers, so
; the outer loop has only one load from memory (the residual)

; I have not yet run this through simg4, so there may be some avoidable stalls,
; and there may be a somewhat more clever way to do the outer loop

; the branch mechanism may prevent dynamic loading; I still need to examine
; this issue, and there may be a more elegant method
; (the chain entry address is built from absolute hi16/lo16 label halves)

	; save r31 at r1-4 without moving the stack pointer
	; NOTE(review): relies on the area below r1 being usable by a leaf
	; routine (red zone) -- confirm against the target ABI
	stmw r31,-4(r1)

	; r9 = (r1-28) rounded down to 16 bytes; the word at -4(r9) is used as
	; scratch to move integer values into vector registers, and lies below
	; the saved r31
	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	; SPR 256 is VRSAVE; 0xfffffc00 has its 22 high bits set (v0-v21)
	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xfffffc00)
	ori r31,r31,lo16(0xfffffc00)
	mtspr 256,r31 ; declare VRs in vrsave

	; nothing to do when data is not below the end pointer (data_len == 0);
	; bc 4,0 branches when cr0[LT] is clear
	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L1400

	; load coefficients into v0-v7 and initial history into v8-v15

	; build v18 = all-ones shifted right by (16 - (data & 0xf)) bytes; it is
	; ANDed onto the last coefficient vector loaded below
	; NOTE(review): the mask width comes from the alignment of data[], not
	; from order; presumably the caller keeps data[] offset from a quadword
	; boundary by order words -- confirm against the caller
	li r31,0xf
	and r31,r8,r31 ; r31: data & 0xf (byte offset of data within its quadword)
	li r11,16
	subf r31,r31,r11 ; r31: 16 - (data & 0xf), in bytes
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9 ; r9-4 is the last word of its quadword, so this fills element 3
	vspltisb v18,-1
	vsro v18,v18,v0 ; v18: mask vector

	; build a permute control from lvsl patterns of the constant addresses
	; 8 and 12, selecting alternate words from each
	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3 ; v2: words {0,-1,0,-1}, the select mask
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector

	add r10,r5,r6 ; r10 = qlp_coeff+order (end of coefficients)
	lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
	; both vperm sources are v17, so v0 reorders the alignment pattern by word
	vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v16,0,r11 ; v16: history alignment permutation vector

	; first coefficient vector (v0) and newest history vector (v8);
	; coefficients are read upward from qlp_coeff, history downward from data
	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v17
	lvx v8,0,r11
	addi r11,r11,-16
	lvx v9,0,r11
	vperm v8,v9,v8,v16 ; v8: the 16 bytes immediately below data
	; bc 12,0 branches when cr0[LT] is set: more coefficients remain
	cmplw cr0,r5,r10
	bc 12,0,L1101
	; v0 is the last coefficient vector: mask it and enter the chain at L1307
	vand v0,v0,v18
	addis r31,0,hi16(L1307)
	ori r31,r31,lo16(L1307)
	b L1199

	; L1101-L1107 repeat the same step for v1/v9 through v7/v15: load the next
	; coefficient vector and the next older history vector, then either
	; continue or mask the last coefficient vector and pick the chain entry
L1101:
	addi r5,r5,16
	lvx v2,0,r5
	vperm v1,v1,v2,v17
	addi r11,r11,-16
	lvx v10,0,r11
	vperm v9,v10,v9,v16
	cmplw cr0,r5,r10
	bc 12,0,L1102
	vand v1,v1,v18
	addis r31,0,hi16(L1306)
	ori r31,r31,lo16(L1306)
	b L1199

L1102:
	addi r5,r5,16
	lvx v3,0,r5
	vperm v2,v2,v3,v17
	addi r11,r11,-16
	lvx v11,0,r11
	vperm v10,v11,v10,v16
	cmplw cr0,r5,r10
	bc 12,0,L1103
	vand v2,v2,v18
	addis r31,0,hi16(L1305)
	ori r31,r31,lo16(L1305)
	b L1199

L1103:
	addi r5,r5,16
	lvx v4,0,r5
	vperm v3,v3,v4,v17
	addi r11,r11,-16
	lvx v12,0,r11
	vperm v11,v12,v11,v16
	cmplw cr0,r5,r10
	bc 12,0,L1104
	vand v3,v3,v18
	addis r31,0,hi16(L1304)
	ori r31,r31,lo16(L1304)
	b L1199

L1104:
	addi r5,r5,16
	lvx v5,0,r5
	vperm v4,v4,v5,v17
	addi r11,r11,-16
	lvx v13,0,r11
	vperm v12,v13,v12,v16
	cmplw cr0,r5,r10
	bc 12,0,L1105
	vand v4,v4,v18
	addis r31,0,hi16(L1303)
	ori r31,r31,lo16(L1303)
	b L1199

L1105:
	addi r5,r5,16
	lvx v6,0,r5
	vperm v5,v5,v6,v17
	addi r11,r11,-16
	lvx v14,0,r11
	vperm v13,v14,v13,v16
	cmplw cr0,r5,r10
	bc 12,0,L1106
	vand v5,v5,v18
	addis r31,0,hi16(L1302)
	ori r31,r31,lo16(L1302)
	b L1199

L1106:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v6,v6,v7,v17
	addi r11,r11,-16
	lvx v15,0,r11
	vperm v14,v15,v14,v16
	cmplw cr0,r5,r10
	bc 12,0,L1107
	vand v6,v6,v18
	addis r31,0,hi16(L1301)
	ori r31,r31,lo16(L1301)
	b L1199

	; eighth and last vector pair: no further length check is made here, v7
	; is always masked and the whole chain (from L1300) is used
L1107:
	addi r5,r5,16
	lvx v19,0,r5
	vperm v7,v7,v19,v17
	addi r11,r11,-16
	lvx v19,0,r11
	vperm v15,v19,v15,v16
	vand v7,v7,v18
	addis r31,0,hi16(L1300)
	ori r31,r31,lo16(L1300)

L1199:
	; ctr = address inside L1300..L1307 at which each sample's chain starts
	mtctr r31

	; set up invariant vectors
	; (v16-v18 held setup values until here and are reused from now on)
	vspltish v16,0 ; v16: zero vector

	li r10,-12
	lvsr v17,r10,r8 ; v17: result shift vector
	lvsl v18,r10,r3 ; v18: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v19,r10,r9 ; v19: lp_quantization vector

	; per-sample loop: one pass produces one output word
L1200:
	; vmulosh multiplies the low halfword of each word, giving word products
	vmulosh v20,v0,v8 ; v20: sum vector
	bcctr 20,0 ; unconditional jump to the chain entry held in ctr

	; unrolled chain: each step multiplies one coefficient vector by its
	; history vector, moves that history vector on by one word (taking the
	; new word from the next newer vector) and accumulates with saturation
L1300:
	vmulosh v21,v7,v15
	vsldoi v15,v15,v14,4 ; increment history
	vaddsws v20,v20,v21

L1301:
	vmulosh v21,v6,v14
	vsldoi v14,v14,v13,4
	vaddsws v20,v20,v21

L1302:
	vmulosh v21,v5,v13
	vsldoi v13,v13,v12,4
	vaddsws v20,v20,v21

L1303:
	vmulosh v21,v4,v12
	vsldoi v12,v12,v11,4
	vaddsws v20,v20,v21

L1304:
	vmulosh v21,v3,v11
	vsldoi v11,v11,v10,4
	vaddsws v20,v20,v21

L1305:
	vmulosh v21,v2,v10
	vsldoi v10,v10,v9,4
	vaddsws v20,v20,v21

L1306:
	vmulosh v21,v1,v9
	vsldoi v9,v9,v8,4 ; uses v8 before the new sample is inserted below
	vaddsws v20,v20,v21

L1307:
	vsumsws v20,v20,v16 ; v20[3]: sum
	vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization

	lvewx v21,0,r3 ; v21[n]: *residual
	vperm v21,v21,v21,v18 ; v21[3]: *residual
	vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4 ; increment shift vector

	vperm v21,v20,v20,v17 ; v21[n]: shift for storage
	vsldoi v17,v17,v17,12 ; increment shift vector
	stvewx v21,0,r8

	; rotate the result from element 3 to element 0, then shift it into v8
	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4 ; insert value onto history

	; advance residual and data by one word; loop while data is below the end
	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L1200

L1400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1) ; reload the r31 saved on entry
	blr

_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
; For each of the data_len output words this routine stores
;	*residual + (sum(coefficient * history) >> lp_quantization)
; to data[], then shifts that new word into the history.
;
; Arguments on entry:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]
;
; Register use inside the routine:
;	r0:    caller's VRSAVE value (restored on exit)
;	r4:    data + data_len (end pointer, in bytes)
;	r6:    order scaled to bytes
;	r9:    quadword-aligned scratch address below the stack pointer
;	r10:   qlp_coeff + order (end of coefficients), later a load offset
;	r11:   pointer used to load history quadwords below data[]
;	r31:   scratch / address of the chain entry (saved, restored)
;	ctr:   entry point into the multiply chain (L2300 or L2301)
;	cr0:   loop and setup comparisons
;	v0-v1: coefficients, word order reversed, last vector masked
;	v2-v3: history (v2 holds the four words immediately before data[])
;	v4-v7: setup temporaries, then loop invariants (see L2199)
;	v8-v9: per-sample sum and temporaries

; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
; this version assumes order<=8; it uses fewer vector registers, which should
; save time in context switches, and has less code, which may improve
; instruction caching

	; save r31 at r1-4 without moving the stack pointer
	; NOTE(review): relies on the area below r1 being usable by a leaf
	; routine (red zone) -- confirm against the target ABI
	stmw r31,-4(r1)

	; r9 = (r1-28) rounded down to 16 bytes; the word at -4(r9) is used as
	; scratch to move integer values into vector registers, and lies below
	; the saved r31
	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	; SPR 256 is VRSAVE; 0xffc00000 has its 10 high bits set (v0-v9)
	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xffc00000)
	ori r31,r31,lo16(0xffc00000)
	mtspr 256,r31 ; declare VRs in vrsave

	; nothing to do when data is not below the end pointer (data_len == 0);
	; bc 4,0 branches when cr0[LT] is clear
	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L2400

	; load coefficients into v0-v1 and initial history into v2-v3

	; build v6 = all-ones shifted right by (16 - (data & 0xf)) bytes; it is
	; ANDed onto the last coefficient vector loaded below
	; NOTE(review): the mask width comes from the alignment of data[], not
	; from order; presumably the caller keeps data[] offset from a quadword
	; boundary by order words -- confirm against the caller
	li r31,0xf
	and r31,r8,r31 ; r31: data & 0xf (byte offset of data within its quadword)
	li r11,16
	subf r31,r31,r11 ; r31: 16 - (data & 0xf), in bytes
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9 ; r9-4 is the last word of its quadword, so this fills element 3
	vspltisb v6,-1
	vsro v6,v6,v0 ; v6: mask vector

	; build a permute control from lvsl patterns of the constant addresses
	; 8 and 12, selecting alternate words from each
	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3 ; v2: words {0,-1,0,-1}, the select mask
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector

	add r10,r5,r6 ; r10 = qlp_coeff+order (end of coefficients)
	lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
	; both vperm sources are v5, so v0 reorders the alignment pattern by word
	vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11 ; v4: history alignment permutation vector

	; first coefficient vector (v0) and newest history vector (v2);
	; coefficients are read upward from qlp_coeff, history downward from data
	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4 ; v2: the 16 bytes immediately below data
	; bc 12,0 branches when cr0[LT] is set: more coefficients remain
	cmplw cr0,r5,r10
	bc 12,0,L2101
	; v0 is the last coefficient vector: mask it and enter the chain at L2301
	vand v0,v0,v6
	addis r31,0,hi16(L2301)
	ori r31,r31,lo16(L2301)
	b L2199

	; second and last vector pair: no further length check is made here, v1
	; is always masked and the chain is entered at L2300
L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6
	addis r31,0,hi16(L2300)
	ori r31,r31,lo16(L2300)

L2199:
	; ctr = L2300 or L2301, the point at which each sample's chain starts
	mtctr r31

	; set up invariant vectors
	; (v4-v6 held setup values until here and are reused from now on)
	vspltish v4,0 ; v4: zero vector

	li r10,-12
	lvsr v5,r10,r8 ; v5: result shift vector
	lvsl v6,r10,r3 ; v6: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9 ; v7: lp_quantization vector

	; per-sample loop: one pass produces one output word
L2200:
	; vmulosh multiplies the low halfword of each word, giving word products
	vmulosh v8,v0,v2 ; v8: sum vector
	bcctr 20,0 ; unconditional jump to the chain entry held in ctr

	; multiply the second coefficient vector by the older history vector,
	; move that history on by one word (taking the new word from v2) and
	; accumulate with saturation
L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4 ; uses v2 before the new sample is inserted below
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4 ; v8[3]: sum
	vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization

	lvewx v9,0,r3 ; v9[n]: *residual
	vperm v9,v9,v9,v6 ; v9[3]: *residual
	vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4 ; increment shift vector

	vperm v9,v8,v8,v5 ; v9[n]: shift for storage
	vsldoi v5,v5,v5,12 ; increment shift vector
	stvewx v9,0,r8

	; rotate the result from element 3 to element 0, then shift it into v2
	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4 ; insert value onto history

	; advance residual and data by one word; loop while data is below the end
	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1) ; reload the r31 saved on entry
	blr