Diffstat (limited to 'libFLAC/ppc/gas/lpc_asm.s')
-rw-r--r-- | libFLAC/ppc/gas/lpc_asm.s | 431
1 file changed, 431 insertions, 0 deletions
diff --git a/libFLAC/ppc/gas/lpc_asm.s b/libFLAC/ppc/gas/lpc_asm.s
new file mode 100644
index 0000000..2203570
--- /dev/null
+++ b/libFLAC/ppc/gas/lpc_asm.s
@@ -0,0 +1,431 @@
+# libFLAC - Free Lossless Audio Codec library
+# Copyright (C) 2004,2005,2006,2007 Josh Coalson
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# - Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# - Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# - Neither the name of the Xiph.org Foundation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+.text
+	.align 2
+.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
+.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16, @function
+
+.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
+.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8, @function
+
+_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
+#	r3: residual[]
+#	r4: data_len
+#	r5: qlp_coeff[]
+#	r6: order
+#	r7: lp_quantization
+#	r8: data[]
+
+# see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
+# this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
+# bps<=15 for mid-side coding, since that uses an extra bit)
+
+# these should be fast; the inner loop is unrolled (it takes no more than
+# 3*(order/4) instructions, all of which are arithmetic), and all of the
+# coefficients and all relevant history stay in registers, so the outer loop
+# has only one load from memory (the residual)
+
+# I have not yet run this through simg4, so there may be some avoidable stalls,
+# and there may be a somewhat more clever way to do the outer loop
+
+# the branch mechanism may prevent dynamic loading; I still need to examine
+# this issue, and there may be a more elegant method
+
+	stmw r31,-4(r1)
+
+	addi r9,r1,-28
+	li r31,0xf
+	andc r9,r9,r31 # for quadword-aligned stack data
+
+	slwi r6,r6,2 # adjust for word size
+	slwi r4,r4,2
+	add r4,r4,r8 # r4 = data+data_len
+
+	mfspr r0,256 # cache old vrsave
+	addis r31,0,0xffff
+	ori r31,r31,0xfc00
+	mtspr 256,r31 # declare VRs in vrsave
+
+	cmplw cr0,r8,r4 # i<data_len
+	bc 4,0,L1400
+
+	# load coefficients into v0-v7 and initial history into v8-v15
+	li r31,0xf
+	and r31,r8,r31 # r31: data%4
+	li r11,16
+	subf r31,r31,r11 # r31: 4-(data%4)
+	slwi r31,r31,3 # convert to bits for vsro
+	li r10,-4
+	stw r31,-4(r9)
+	lvewx v0,r10,r9
+	vspltisb v18,-1
+	vsro v18,v18,v0 # v18: mask vector
+
+	li r31,0x8
+	lvsl v0,0,r31
+	vsldoi v0,v0,v0,12
+	li r31,0xc
+	lvsl v1,0,r31
+	vspltisb v2,0
+	vspltisb v3,-1
+	vmrglw v2,v2,v3
+	vsel v0,v1,v0,v2 # v0: reversal permutation vector
+
+	add r10,r5,r6
+	lvsl v17,0,r5 # v17: coefficient alignment permutation vector
+	vperm v17,v17,v17,v0 # v17: reversal coefficient alignment permutation vector
+
+	mr r11,r8
+	lvsl v16,0,r11 # v16: history alignment permutation vector
+
+	lvx v0,0,r5
+	addi r5,r5,16
+	lvx v1,0,r5
+	vperm v0,v0,v1,v17
+	lvx v8,0,r11
+	addi r11,r11,-16
+	lvx v9,0,r11
+	vperm v8,v9,v8,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1101
+	vand v0,v0,v18
+	lis r31,L1307@ha
+	la r31,L1307@l(r31)
+	b L1199
+
+L1101:
+	addi r5,r5,16
+	lvx v2,0,r5
+	vperm v1,v1,v2,v17
+	addi r11,r11,-16
+	lvx v10,0,r11
+	vperm v9,v10,v9,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1102
+	vand v1,v1,v18
+	lis r31,L1306@ha
+	la r31,L1306@l(r31)
+	b L1199
+
+L1102:
+	addi r5,r5,16
+	lvx v3,0,r5
+	vperm v2,v2,v3,v17
+	addi r11,r11,-16
+	lvx v11,0,r11
+	vperm v10,v11,v10,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1103
+	vand v2,v2,v18
+	lis r31,L1305@ha
+	la r31,L1305@l(r31)
+	b L1199
+
+L1103:
+	addi r5,r5,16
+	lvx v4,0,r5
+	vperm v3,v3,v4,v17
+	addi r11,r11,-16
+	lvx v12,0,r11
+	vperm v11,v12,v11,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1104
+	vand v3,v3,v18
+	lis r31,L1304@ha
+	la r31,L1304@l(r31)
+	b L1199
+
+L1104:
+	addi r5,r5,16
+	lvx v5,0,r5
+	vperm v4,v4,v5,v17
+	addi r11,r11,-16
+	lvx v13,0,r11
+	vperm v12,v13,v12,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1105
+	vand v4,v4,v18
+	lis r31,L1303@ha
+	la r31,L1303@l(r31)
+	b L1199
+
+L1105:
+	addi r5,r5,16
+	lvx v6,0,r5
+	vperm v5,v5,v6,v17
+	addi r11,r11,-16
+	lvx v14,0,r11
+	vperm v13,v14,v13,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1106
+	vand v5,v5,v18
+	lis r31,L1302@ha
+	la r31,L1302@l(r31)
+	b L1199
+
+L1106:
+	addi r5,r5,16
+	lvx v7,0,r5
+	vperm v6,v6,v7,v17
+	addi r11,r11,-16
+	lvx v15,0,r11
+	vperm v14,v15,v14,v16
+	cmplw cr0,r5,r10
+	bc 12,0,L1107
+	vand v6,v6,v18
+	lis r31,L1301@ha
+	la r31,L1301@l(r31)
+	b L1199
+
+L1107:
+	addi r5,r5,16
+	lvx v19,0,r5
+	vperm v7,v7,v19,v17
+	addi r11,r11,-16
+	lvx v19,0,r11
+	vperm v15,v19,v15,v16
+	vand v7,v7,v18
+	lis r31,L1300@ha
+	la r31,L1300@l(r31)
+
+L1199:
+	mtctr r31
+
+	# set up invariant vectors
+	vspltish v16,0 # v16: zero vector
+
+	li r10,-12
+	lvsr v17,r10,r8 # v17: result shift vector
+	lvsl v18,r10,r3 # v18: residual shift back vector
+
+	li r10,-4
+	stw r7,-4(r9)
+	lvewx v19,r10,r9 # v19: lp_quantization vector
+
+L1200:
+	vmulosh v20,v0,v8 # v20: sum vector
+	bcctr 20,0
+
+L1300:
+	vmulosh v21,v7,v15
+	vsldoi v15,v15,v14,4 # increment history
+	vaddsws v20,v20,v21
+
+L1301:
+	vmulosh v21,v6,v14
+	vsldoi v14,v14,v13,4
+	vaddsws v20,v20,v21
+
+L1302:
+	vmulosh v21,v5,v13
+	vsldoi v13,v13,v12,4
+	vaddsws v20,v20,v21
+
+L1303:
+	vmulosh v21,v4,v12
+	vsldoi v12,v12,v11,4
+	vaddsws v20,v20,v21
+
+L1304:
+	vmulosh v21,v3,v11
+	vsldoi v11,v11,v10,4
+	vaddsws v20,v20,v21
+
+L1305:
+	vmulosh v21,v2,v10
+	vsldoi v10,v10,v9,4
+	vaddsws v20,v20,v21
+
+L1306:
+	vmulosh v21,v1,v9
+	vsldoi v9,v9,v8,4
+	vaddsws v20,v20,v21
+
+L1307:
+	vsumsws v20,v20,v16 # v20[3]: sum
+	vsraw v20,v20,v19 # v20[3]: sum >> lp_quantization
+
+	lvewx v21,0,r3 # v21[n]: *residual
+	vperm v21,v21,v21,v18 # v21[3]: *residual
+	vaddsws v20,v21,v20 # v20[3]: *residual + (sum >> lp_quantization)
+	vsldoi v18,v18,v18,4 # increment shift vector
+
+	vperm v21,v20,v20,v17 # v21[n]: shift for storage
+	vsldoi v17,v17,v17,12 # increment shift vector
+	stvewx v21,0,r8
+
+	vsldoi v20,v20,v20,12
+	vsldoi v8,v8,v20,4 # insert value onto history
+
+	addi r3,r3,4
+	addi r8,r8,4
+	cmplw cr0,r8,r4 # i<data_len
+	bc 12,0,L1200
+
+L1400:
+	mtspr 256,r0 # restore old vrsave
+	lmw r31,-4(r1)
+	blr
+
+_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
+#	r3: residual[]
+#	r4: data_len
+#	r5: qlp_coeff[]
+#	r6: order
+#	r7: lp_quantization
+#	r8: data[]
+
+# see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
+# this version assumes order<=8; it uses fewer vector registers, which should
+# save time in context switches, and has less code, which may improve
+# instruction caching
+
+	stmw r31,-4(r1)
+
+	addi r9,r1,-28
+	li r31,0xf
+	andc r9,r9,r31 # for quadword-aligned stack data
+
+	slwi r6,r6,2 # adjust for word size
+	slwi r4,r4,2
+	add r4,r4,r8 # r4 = data+data_len
+
+	mfspr r0,256 # cache old vrsave
+	addis r31,0,0xffc0
+	ori r31,r31,0x0000
+	mtspr 256,r31 # declare VRs in vrsave
+
+	cmplw cr0,r8,r4 # i<data_len
+	bc 4,0,L2400
+
+	# load coefficients into v0-v1 and initial history into v2-v3
+	li r31,0xf
+	and r31,r8,r31 # r31: data%4
+	li r11,16
+	subf r31,r31,r11 # r31: 4-(data%4)
+	slwi r31,r31,3 # convert to bits for vsro
+	li r10,-4
+	stw r31,-4(r9)
+	lvewx v0,r10,r9
+	vspltisb v6,-1
+	vsro v6,v6,v0 # v6: mask vector
+
+	li r31,0x8
+	lvsl v0,0,r31
+	vsldoi v0,v0,v0,12
+	li r31,0xc
+	lvsl v1,0,r31
+	vspltisb v2,0
+	vspltisb v3,-1
+	vmrglw v2,v2,v3
+	vsel v0,v1,v0,v2 # v0: reversal permutation vector
+
+	add r10,r5,r6
+	lvsl v5,0,r5 # v5: coefficient alignment permutation vector
+	vperm v5,v5,v5,v0 # v5: reversal coefficient alignment permutation vector
+
+	mr r11,r8
+	lvsl v4,0,r11 # v4: history alignment permutation vector
+
+	lvx v0,0,r5
+	addi r5,r5,16
+	lvx v1,0,r5
+	vperm v0,v0,v1,v5
+	lvx v2,0,r11
+	addi r11,r11,-16
+	lvx v3,0,r11
+	vperm v2,v3,v2,v4
+	cmplw cr0,r5,r10
+	bc 12,0,L2101
+	vand v0,v0,v6
+	lis r31,L2301@ha
+	la r31,L2301@l(r31)
+	b L2199
+
+L2101:
+	addi r5,r5,16
+	lvx v7,0,r5
+	vperm v1,v1,v7,v5
+	addi r11,r11,-16
+	lvx v7,0,r11
+	vperm v3,v7,v3,v4
+	vand v1,v1,v6
+	lis r31,L2300@ha
+	la r31,L2300@l(r31)
+
+L2199:
+	mtctr r31
+
+	# set up invariant vectors
+	vspltish v4,0 # v4: zero vector
+
+	li r10,-12
+	lvsr v5,r10,r8 # v5: result shift vector
+	lvsl v6,r10,r3 # v6: residual shift back vector
+
+	li r10,-4
+	stw r7,-4(r9)
+	lvewx v7,r10,r9 # v7: lp_quantization vector
+
+L2200:
+	vmulosh v8,v0,v2 # v8: sum vector
+	bcctr 20,0
+
+L2300:
+	vmulosh v9,v1,v3
+	vsldoi v3,v3,v2,4
+	vaddsws v8,v8,v9
+
+L2301:
+	vsumsws v8,v8,v4 # v8[3]: sum
+	vsraw v8,v8,v7 # v8[3]: sum >> lp_quantization
+
+	lvewx v9,0,r3 # v9[n]: *residual
+	vperm v9,v9,v9,v6 # v9[3]: *residual
+	vaddsws v8,v9,v8 # v8[3]: *residual + (sum >> lp_quantization)
+	vsldoi v6,v6,v6,4 # increment shift vector
+
+	vperm v9,v8,v8,v5 # v9[n]: shift for storage
+	vsldoi v5,v5,v5,12 # increment shift vector
+	stvewx v9,0,r8
+
+	vsldoi v8,v8,v8,12
+	vsldoi v2,v2,v8,4 # insert value onto history
+
+	addi r3,r3,4
+	addi r8,r8,4
+	cmplw cr0,r8,r4 # i<data_len
+	bc 12,0,L2200
+
+L2400:
+	mtspr 256,r0 # restore old vrsave
+	lmw r31,-4(r1)
+	blr
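
For reference, the scalar loop that both routines vectorize (src/libFLAC/lpc.c:FLAC__lpc_restore_signal(), per the header comment) has the following shape. This is a simplified sketch in plain C, not the verbatim library source: the parameter names come from the register comments above, and plain int32_t stands in for libFLAC's FLAC__int32 typedef. The bps<=16 requirement is what lets the AltiVec code form each product with vmulosh on 16-bit operands and accumulate in 32 bits.

    #include <stdint.h>

    /* Sketch of the reference loop, simplified. The caller guarantees that
     * data[] is preceded by `order` warm-up samples, so data[i - j - 1] is
     * readable even at i == 0. */
    void lpc_restore_signal_ref(const int32_t residual[], unsigned data_len,
                                const int32_t qlp_coeff[], unsigned order,
                                int lp_quantization, int32_t data[])
    {
        for (int i = 0; i < (int)data_len; i++) {
            int32_t sum = 0;                 /* 32 bits suffice when bps<=16 */
            for (unsigned j = 0; j < order; j++)
                sum += qlp_coeff[j] * data[i - 1 - (int)j];
            /* arithmetic right shift, cf. vsraw in the assembly */
            data[i] = residual[i] + (sum >> lp_quantization);
        }
    }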
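
The lvsl/lvx/vperm sequences near the top of each routine are the standard AltiVec idiom for loading from a pointer that may not be 16-byte aligned; here it is additionally combined with a reversal permutation so that coefficients and history land in opposite lane orders. A minimal C-intrinsics sketch of the plain forward form of the idiom, for illustration only (load_4_ints_unaligned is a hypothetical helper, not part of this commit):

    #include <altivec.h>
    #include <stdint.h>

    /* Load four 32-bit ints from a possibly unaligned pointer: fetch the
     * two quadwords that cover p[0..3], then permute the wanted bytes into
     * one vector. (The assembly above folds a reversal into this permute.) */
    static vector signed int load_4_ints_unaligned(const int32_t *p)
    {
        vector unsigned char align = vec_lvsl(0, p); /* alignment permute vector */
        vector signed int lo = vec_ld(0, p);         /* quadword containing p[0] */
        vector signed int hi = vec_ld(15, p);        /* the following quadword   */
        return vec_perm(lo, hi, align);              /* p[0..3] in vector lanes  */
    }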
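
The mtctr/bcctr pair implements a computed entry point: the setup code materializes one of the labels L1300-L1307 (or L2300-L2301 in the order8 variant) from `order`, and the "inner loop" is a fully unrolled multiply/accumulate chain entered partway through, so each outer iteration runs straight-line with no inner branches. A rough scalar C analogue of that structure is a fall-through switch, Duff's-device style; dot_unrolled below is hypothetical and only illustrates the dispatch shape, not the vector arithmetic:

    #include <stdint.h>

    /* Jump into the middle of an unrolled dot product based on `order`
     * (1..8); every case deliberately falls through to the next. */
    static int64_t dot_unrolled(const int32_t c[8], const int32_t h[8],
                                unsigned order)
    {
        int64_t sum = 0;
        switch (order) {
        case 8: sum += (int64_t)c[7] * h[7]; /* fall through */
        case 7: sum += (int64_t)c[6] * h[6]; /* fall through */
        case 6: sum += (int64_t)c[5] * h[5]; /* fall through */
        case 5: sum += (int64_t)c[4] * h[4]; /* fall through */
        case 4: sum += (int64_t)c[3] * h[3]; /* fall through */
        case 3: sum += (int64_t)c[2] * h[2]; /* fall through */
        case 2: sum += (int64_t)c[1] * h[1]; /* fall through */
        case 1: sum += (int64_t)c[0] * h[0];
        }
        return sum;
    }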