diff options
author | Brian Carlstrom <bdc@google.com> | 2012-03-15 15:27:15 -0700 |
---|---|---|
committer | Android Git Automerger <android-git-automerger@android.com> | 2012-03-15 15:27:15 -0700 |
commit | 7f1d63479ce92a2a4a0874b007e49f8acb13a0d9 (patch) | |
tree | 6ae4607f75c2c531a170270264b6c7d88306152a /crypto | |
parent | 21c841450af61d0a9119cdc863e93d019127bfe1 (diff) | |
parent | db166823303559663b1c209e14b326160519c51c (diff) | |
download | replicant_openssl-7f1d63479ce92a2a4a0874b007e49f8acb13a0d9.zip replicant_openssl-7f1d63479ce92a2a4a0874b007e49f8acb13a0d9.tar.gz replicant_openssl-7f1d63479ce92a2a4a0874b007e49f8acb13a0d9.tar.bz2 |
am db166823: Merge "From 67b1ae72527c9e173ace98e805e8b9c090455873 Mon Sep 17 00:00:00 2001 Subject: [MIPS] MIPS assembler pack update"
* commit 'db166823303559663b1c209e14b326160519c51c':
From 67b1ae72527c9e173ace98e805e8b9c090455873 Mon Sep 17 00:00:00 2001 Subject: [MIPS] MIPS assembler pack update
Diffstat (limited to 'crypto')
-rw-r--r-- | crypto/Android.mk | 50 | ||||
-rw-r--r-- | crypto/aes/asm/aes-mips.pl | 1611 | ||||
-rw-r--r-- | crypto/aes/asm/aes-mips.s | 1341 | ||||
-rw-r--r-- | crypto/bn/asm/bn-mips.s | 2177 | ||||
-rw-r--r-- | crypto/bn/asm/mips-mont.pl | 426 | ||||
-rw-r--r-- | crypto/bn/asm/mips-mont.s | 282 | ||||
-rw-r--r-- | crypto/bn/asm/mips.pl | 2585 | ||||
-rw-r--r-- | crypto/sha/asm/sha1-mips.pl | 354 | ||||
-rw-r--r-- | crypto/sha/asm/sha1-mips.s | 1664 | ||||
-rw-r--r-- | crypto/sha/asm/sha256-mips.s | 1999 | ||||
-rw-r--r-- | crypto/sha/asm/sha512-mips.pl | 455 |
11 files changed, 12934 insertions, 10 deletions
diff --git a/crypto/Android.mk b/crypto/Android.mk index bde68fc..8090c12 100644 --- a/crypto/Android.mk +++ b/crypto/Android.mk @@ -1,13 +1,26 @@ LOCAL_PATH:= $(call my-dir) arm_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM +mips_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM + arm_src_files := \ aes/asm/aes-armv4.s \ bn/asm/armv4-mont.s \ + bn/bn_asm.c \ sha/asm/sha1-armv4-large.s \ sha/asm/sha256-armv4.s \ sha/asm/sha512-armv4.s -non_arm_src_files := aes/aes_core.c + +mips_src_files := \ + aes/asm/aes-mips.s \ + bn/asm/bn-mips.s \ + bn/asm/mips-mont.s \ + sha/asm/sha1-mips.s \ + sha/asm/sha256-mips.s + +other_arch_src_files := \ + aes/aes_core.c \ + bn/bn_asm.c local_src_files := \ cryptlib.c \ @@ -131,7 +144,6 @@ local_src_files := \ bio/bss_null.c \ bio/bss_sock.c \ bn/bn_add.c \ - bn/bn_asm.c \ bn/bn_blind.c \ bn/bn_const.c \ bn/bn_ctx.c \ @@ -506,7 +518,7 @@ local_c_flags := -DNO_WINDOWS_BRAINDEATH include $(CLEAR_VARS) include $(LOCAL_PATH)/../android-config.mk -ifneq ($(TARGET_ARCH),x86) +ifeq ($(TARGET_ARCH),arm) LOCAL_NDK_VERSION := 5 LOCAL_SDK_VERSION := 9 endif @@ -517,8 +529,17 @@ LOCAL_C_INCLUDES += $(local_c_includes) ifeq ($(TARGET_ARCH),arm) LOCAL_SRC_FILES += $(arm_src_files) LOCAL_CFLAGS += $(arm_cflags) -else - LOCAL_SRC_FILES += $(non_arm_src_files) +endif +ifeq ($(TARGET_ARCH),mips) + ifneq (($TARGET_HAS_BIGENDIAN),true) + LOCAL_SRC_FILES += $(mips_src_files) + LOCAL_CFLAGS += $(mips_cflags) + else + LOCAL_SRC_FILES += $(other_arch_src_files) + endif +endif +ifeq ($(TARGET_ARCH),x86) + LOCAL_SRC_FILES += $(other_arch_src_files) endif LOCAL_MODULE_TAGS := optional LOCAL_MODULE:= libcrypto_static @@ -529,7 +550,7 @@ include $(BUILD_STATIC_LIBRARY) include $(CLEAR_VARS) include $(LOCAL_PATH)/../android-config.mk -ifneq ($(TARGET_ARCH),x86) +ifeq ($(TARGET_ARCH),arm) LOCAL_NDK_VERSION := 5 LOCAL_SDK_VERSION := 9 # Use the NDK prebuilt libz and libdl. @@ -544,8 +565,17 @@ LOCAL_C_INCLUDES += $(local_c_includes) ifeq ($(TARGET_ARCH),arm) LOCAL_SRC_FILES += $(arm_src_files) LOCAL_CFLAGS += $(arm_cflags) -else - LOCAL_SRC_FILES += $(non_arm_src_files) +endif +ifeq ($(TARGET_ARCH),mips) + ifneq (($TARGET_HAS_BIGENDIAN),true) + LOCAL_SRC_FILES += $(mips_src_files) + LOCAL_CFLAGS += $(mips_cflags) + else + LOCAL_SRC_FILES += $(other_arch_src_files) + endif +endif +ifeq ($(TARGET_ARCH),x86) + LOCAL_SRC_FILES += $(other_arch_src_files) endif LOCAL_MODULE_TAGS := optional LOCAL_MODULE:= libcrypto @@ -558,7 +588,7 @@ include $(LOCAL_PATH)/../android-config.mk LOCAL_SRC_FILES += $(local_src_files) LOCAL_CFLAGS += $(local_c_flags) -DPURIFY LOCAL_C_INCLUDES += $(local_c_includes) -LOCAL_SRC_FILES += $(non_arm_src_files) +LOCAL_SRC_FILES += $(other_arch_src_files) LOCAL_STATIC_LIBRARIES += libz LOCAL_LDLIBS += -ldl LOCAL_MODULE_TAGS := optional @@ -573,7 +603,7 @@ include $(LOCAL_PATH)/../android-config.mk LOCAL_SRC_FILES += $(local_src_files) LOCAL_CFLAGS += $(local_c_flags) -DPURIFY LOCAL_C_INCLUDES += $(local_c_includes) -LOCAL_SRC_FILES += $(non_arm_src_files) +LOCAL_SRC_FILES += $(other_arch_src_files) LOCAL_STATIC_LIBRARIES += libz LOCAL_LDLIBS += -ldl LOCAL_MODULE_TAGS := optional diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl new file mode 100644 index 0000000..2ce6def --- /dev/null +++ b/crypto/aes/asm/aes-mips.pl @@ -0,0 +1,1611 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# AES for MIPS + +# October 2010 +# +# Code uses 1K[+256B] S-box and on single-issue core [such as R5000] +# spends ~68 cycles per byte processed with 128-bit key. This is ~16% +# faster than gcc-generated code, which is not very impressive. But +# recall that compressed S-box requires extra processing, namely +# additional rotations. Rotations are implemented with lwl/lwr pairs, +# which is normally used for loading unaligned data. Another cool +# thing about this module is its endian neutrality, which means that +# it processes data without ever changing byte order... + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +$pf = ($flavour =~ /nubi/i) ? $t0 : $t2; +# +# <appro@openssl.org> +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; + +for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) +{ $big_endian=(unpack('L',pack('N',1))==1); } + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +my ($MSB,$LSB)=(0,3); # automatically converted to little-endian + +$code.=<<___; +.text +#ifdef OPENSSL_FIPSCANISTER +# include <openssl/fipssyms.h> +#endif + +#if !defined(__vxworks) || defined(__pic__) +.option pic2 +#endif +.set noat +___ + +{{{ +my $FRAMESIZE=16*$SZREG; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; + +my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7); +my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); +my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23)); +my ($key0,$cnt)=($gp,$fp); + +# instuction ordering is "stolen" from output from MIPSpro assembler +# invoked with -mips3 -O3 arguments... +$code.=<<___; +.align 5 +.ent _mips_AES_encrypt +_mips_AES_encrypt: + .frame $sp,0,$ra + .set reorder + lw $t0,0($key) + lw $t1,4($key) + lw $t2,8($key) + lw $t3,12($key) + lw $cnt,240($key) + $PTR_ADD $key0,$key,16 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + sub $cnt,1 + _xtr $i0,$s1,16-2 +.Loop_enc: + _xtr $i1,$s2,16-2 + _xtr $i2,$s3,16-2 + _xtr $i3,$s0,16-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t0,3($i0) # Te1[s1>>16] + lwl $t1,3($i1) # Te1[s2>>16] + lwl $t2,3($i2) # Te1[s3>>16] + lwl $t3,3($i3) # Te1[s0>>16] + lwr $t0,2($i0) # Te1[s1>>16] + lwr $t1,2($i1) # Te1[s2>>16] + lwr $t2,2($i2) # Te1[s3>>16] + lwr $t3,2($i3) # Te1[s0>>16] + + _xtr $i0,$s2,8-2 + _xtr $i1,$s3,8-2 + _xtr $i2,$s0,8-2 + _xtr $i3,$s1,8-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t4,2($i0) # Te2[s2>>8] + lwl $t5,2($i1) # Te2[s3>>8] + lwl $t6,2($i2) # Te2[s0>>8] + lwl $t7,2($i3) # Te2[s1>>8] + lwr $t4,1($i0) # Te2[s2>>8] + lwr $t5,1($i1) # Te2[s3>>8] + lwr $t6,1($i2) # Te2[s0>>8] + lwr $t7,1($i3) # Te2[s1>>8] + + _xtr $i0,$s3,0-2 + _xtr $i1,$s0,0-2 + _xtr $i2,$s1,0-2 + _xtr $i3,$s2,0-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t8,1($i0) # Te3[s3] + lwl $t9,1($i1) # Te3[s0] + lwl $t10,1($i2) # Te3[s1] + lwl $t11,1($i3) # Te3[s2] + lwr $t8,0($i0) # Te3[s3] + lwr $t9,0($i1) # Te3[s0] + lwr $t10,0($i2) # Te3[s1] + lwr $t11,0($i3) # Te3[s2] + + _xtr $i0,$s0,24-2 + _xtr $i1,$s1,24-2 + _xtr $i2,$s2,24-2 + _xtr $i3,$s3,24-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + lw $t4,0($i0) # Te0[s0>>24] + lw $t5,0($i1) # Te0[s1>>24] + lw $t6,0($i2) # Te0[s2>>24] + lw $t7,0($i3) # Te0[s3>>24] + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_enc + _xtr $i0,$s1,16-2 + + .set reorder + _xtr $i1,$s2,16-2 + _xtr $i2,$s3,16-2 + _xtr $i3,$s0,16-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t0,2($i0) # Te4[s1>>16] + lbu $t1,2($i1) # Te4[s2>>16] + lbu $t2,2($i2) # Te4[s3>>16] + lbu $t3,2($i3) # Te4[s0>>16] + + _xtr $i0,$s2,8-2 + _xtr $i1,$s3,8-2 + _xtr $i2,$s0,8-2 + _xtr $i3,$s1,8-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,2($i0) # Te4[s2>>8] + lbu $t5,2($i1) # Te4[s3>>8] + lbu $t6,2($i2) # Te4[s0>>8] + lbu $t7,2($i3) # Te4[s1>>8] + + _xtr $i0,$s0,24-2 + _xtr $i1,$s1,24-2 + _xtr $i2,$s2,24-2 + _xtr $i3,$s3,24-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,2($i0) # Te4[s0>>24] + lbu $t9,2($i1) # Te4[s1>>24] + lbu $t10,2($i2) # Te4[s2>>24] + lbu $t11,2($i3) # Te4[s3>>24] + + _xtr $i0,$s3,0-2 + _xtr $i1,$s0,0-2 + _xtr $i2,$s1,0-2 + _xtr $i3,$s2,0-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + + _ins $t4,8 + _ins $t5,8 + _ins $t6,8 + _ins $t7,8 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,2($i0) # Te4[s3] + lbu $t5,2($i1) # Te4[s0] + lbu $t6,2($i2) # Te4[s1] + lbu $t7,2($i3) # Te4[s2] + + _ins $t8,24 + _ins $t9,24 + _ins $t10,24 + _ins $t11,24 + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + _ins $t4,0 + _ins $t5,0 + _ins $t6,0 + _ins $t7,0 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + jr $ra +.end _mips_AES_encrypt + +.align 5 +.globl AES_encrypt +.ent AES_encrypt +AES_encrypt: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_encrypt +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Te # PIC-ified 'load address' + + lwl $s0,0+$MSB($inp) + lwl $s1,4+$MSB($inp) + lwl $s2,8+$MSB($inp) + lwl $s3,12+$MSB($inp) + lwr $s0,0+$LSB($inp) + lwr $s1,4+$LSB($inp) + lwr $s2,8+$LSB($inp) + lwr $s3,12+$LSB($inp) + + bal _mips_AES_encrypt + + swr $s0,0+$LSB($out) + swr $s1,4+$LSB($out) + swr $s2,8+$LSB($out) + swr $s3,12+$LSB($out) + swl $s0,0+$MSB($out) + swl $s1,4+$MSB($out) + swl $s2,8+$MSB($out) + swl $s3,12+$MSB($out) + + .set noreorder + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_encrypt +___ + +$code.=<<___; +.align 5 +.ent _mips_AES_decrypt +_mips_AES_decrypt: + .frame $sp,0,$ra + .set reorder + lw $t0,0($key) + lw $t1,4($key) + lw $t2,8($key) + lw $t3,12($key) + lw $cnt,240($key) + $PTR_ADD $key0,$key,16 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + sub $cnt,1 + _xtr $i0,$s3,16-2 +.Loop_dec: + _xtr $i1,$s0,16-2 + _xtr $i2,$s1,16-2 + _xtr $i3,$s2,16-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t0,3($i0) # Td1[s3>>16] + lwl $t1,3($i1) # Td1[s0>>16] + lwl $t2,3($i2) # Td1[s1>>16] + lwl $t3,3($i3) # Td1[s2>>16] + lwr $t0,2($i0) # Td1[s3>>16] + lwr $t1,2($i1) # Td1[s0>>16] + lwr $t2,2($i2) # Td1[s1>>16] + lwr $t3,2($i3) # Td1[s2>>16] + + _xtr $i0,$s2,8-2 + _xtr $i1,$s3,8-2 + _xtr $i2,$s0,8-2 + _xtr $i3,$s1,8-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t4,2($i0) # Td2[s2>>8] + lwl $t5,2($i1) # Td2[s3>>8] + lwl $t6,2($i2) # Td2[s0>>8] + lwl $t7,2($i3) # Td2[s1>>8] + lwr $t4,1($i0) # Td2[s2>>8] + lwr $t5,1($i1) # Td2[s3>>8] + lwr $t6,1($i2) # Td2[s0>>8] + lwr $t7,1($i3) # Td2[s1>>8] + + _xtr $i0,$s1,0-2 + _xtr $i1,$s2,0-2 + _xtr $i2,$s3,0-2 + _xtr $i3,$s0,0-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t8,1($i0) # Td3[s1] + lwl $t9,1($i1) # Td3[s2] + lwl $t10,1($i2) # Td3[s3] + lwl $t11,1($i3) # Td3[s0] + lwr $t8,0($i0) # Td3[s1] + lwr $t9,0($i1) # Td3[s2] + lwr $t10,0($i2) # Td3[s3] + lwr $t11,0($i3) # Td3[s0] + + _xtr $i0,$s0,24-2 + _xtr $i1,$s1,24-2 + _xtr $i2,$s2,24-2 + _xtr $i3,$s3,24-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + + lw $t4,0($i0) # Td0[s0>>24] + lw $t5,0($i1) # Td0[s1>>24] + lw $t6,0($i2) # Td0[s2>>24] + lw $t7,0($i3) # Td0[s3>>24] + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_dec + _xtr $i0,$s3,16-2 + + .set reorder + lw $t4,1024($Tbl) # prefetch Td4 + lw $t5,1024+32($Tbl) + lw $t6,1024+64($Tbl) + lw $t7,1024+96($Tbl) + lw $t8,1024+128($Tbl) + lw $t9,1024+160($Tbl) + lw $t10,1024+192($Tbl) + lw $t11,1024+224($Tbl) + + _xtr $i0,$s3,16 + _xtr $i1,$s0,16 + _xtr $i2,$s1,16 + _xtr $i3,$s2,16 + and $i0,0xff + and $i1,0xff + and $i2,0xff + and $i3,0xff + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t0,1024($i0) # Td4[s3>>16] + lbu $t1,1024($i1) # Td4[s0>>16] + lbu $t2,1024($i2) # Td4[s1>>16] + lbu $t3,1024($i3) # Td4[s2>>16] + + _xtr $i0,$s2,8 + _xtr $i1,$s3,8 + _xtr $i2,$s0,8 + _xtr $i3,$s1,8 + and $i0,0xff + and $i1,0xff + and $i2,0xff + and $i3,0xff + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,1024($i0) # Td4[s2>>8] + lbu $t5,1024($i1) # Td4[s3>>8] + lbu $t6,1024($i2) # Td4[s0>>8] + lbu $t7,1024($i3) # Td4[s1>>8] + + _xtr $i0,$s0,24 + _xtr $i1,$s1,24 + _xtr $i2,$s2,24 + _xtr $i3,$s3,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,1024($i0) # Td4[s0>>24] + lbu $t9,1024($i1) # Td4[s1>>24] + lbu $t10,1024($i2) # Td4[s2>>24] + lbu $t11,1024($i3) # Td4[s3>>24] + + _xtr $i0,$s1,0 + _xtr $i1,$s2,0 + _xtr $i2,$s3,0 + _xtr $i3,$s0,0 + + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + + _ins $t4,8 + _ins $t5,8 + _ins $t6,8 + _ins $t7,8 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,1024($i0) # Td4[s1] + lbu $t5,1024($i1) # Td4[s2] + lbu $t6,1024($i2) # Td4[s3] + lbu $t7,1024($i3) # Td4[s0] + + _ins $t8,24 + _ins $t9,24 + _ins $t10,24 + _ins $t11,24 + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + _ins $t4,0 + _ins $t5,0 + _ins $t6,0 + _ins $t7,0 + + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + jr $ra +.end _mips_AES_decrypt + +.align 5 +.globl AES_decrypt +.ent AES_decrypt +AES_decrypt: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_decrypt +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Td # PIC-ified 'load address' + + lwl $s0,0+$MSB($inp) + lwl $s1,4+$MSB($inp) + lwl $s2,8+$MSB($inp) + lwl $s3,12+$MSB($inp) + lwr $s0,0+$LSB($inp) + lwr $s1,4+$LSB($inp) + lwr $s2,8+$LSB($inp) + lwr $s3,12+$LSB($inp) + + bal _mips_AES_decrypt + + swr $s0,0+$LSB($out) + swr $s1,4+$LSB($out) + swr $s2,8+$LSB($out) + swr $s3,12+$LSB($out) + swl $s0,0+$MSB($out) + swl $s1,4+$MSB($out) + swl $s2,8+$MSB($out) + swl $s3,12+$MSB($out) + + .set noreorder + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_decrypt +___ +}}} + +{{{ +my $FRAMESIZE=8*$SZREG; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000; + +my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3); +my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); +my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); +my ($rcon,$cnt)=($gp,$fp); + +$code.=<<___; +.align 5 +.ent _mips_AES_set_encrypt_key +_mips_AES_set_encrypt_key: + .frame $sp,0,$ra + .set noreorder + beqz $inp,.Lekey_done + li $t0,-1 + beqz $key,.Lekey_done + $PTR_ADD $rcon,$Tbl,1024+256 + + .set reorder + lwl $rk0,0+$MSB($inp) # load 128 bits + lwl $rk1,4+$MSB($inp) + lwl $rk2,8+$MSB($inp) + lwl $rk3,12+$MSB($inp) + li $at,128 + lwr $rk0,0+$LSB($inp) + lwr $rk1,4+$LSB($inp) + lwr $rk2,8+$LSB($inp) + lwr $rk3,12+$LSB($inp) + .set noreorder + beq $bits,$at,.L128bits + li $cnt,10 + + .set reorder + lwl $rk4,16+$MSB($inp) # load 192 bits + lwl $rk5,20+$MSB($inp) + li $at,192 + lwr $rk4,16+$LSB($inp) + lwr $rk5,20+$LSB($inp) + .set noreorder + beq $bits,$at,.L192bits + li $cnt,8 + + .set reorder + lwl $rk6,24+$MSB($inp) # load 256 bits + lwl $rk7,28+$MSB($inp) + li $at,256 + lwr $rk6,24+$LSB($inp) + lwr $rk7,28+$LSB($inp) + .set noreorder + beq $bits,$at,.L256bits + li $cnt,7 + + b .Lekey_done + li $t0,-2 + +.align 4 +.L128bits: + .set reorder + srl $i0,$rk3,16 + srl $i1,$rk3,8 + and $i0,0xff + and $i1,0xff + and $i2,$rk3,0xff + srl $i3,$rk3,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + sw $rk3,12($key) + sub $cnt,1 + $PTR_ADD $key,16 + + _bias $i0,24 + _bias $i1,16 + _bias $i2,8 + _bias $i3,0 + + xor $rk0,$i0 + lw $i0,0($rcon) + xor $rk0,$i1 + xor $rk0,$i2 + xor $rk0,$i3 + xor $rk0,$i0 + + xor $rk1,$rk0 + xor $rk2,$rk1 + xor $rk3,$rk2 + + .set noreorder + bnez $cnt,.L128bits + $PTR_ADD $rcon,4 + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + li $cnt,10 + sw $rk3,12($key) + li $t0,0 + sw $cnt,80($key) + b .Lekey_done + $PTR_SUB $key,10*16 + +.align 4 +.L192bits: + .set reorder + srl $i0,$rk5,16 + srl $i1,$rk5,8 + and $i0,0xff + and $i1,0xff + and $i2,$rk5,0xff + srl $i3,$rk5,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + sw $rk3,12($key) + sw $rk4,16($key) + sw $rk5,20($key) + sub $cnt,1 + $PTR_ADD $key,24 + + _bias $i0,24 + _bias $i1,16 + _bias $i2,8 + _bias $i3,0 + + xor $rk0,$i0 + lw $i0,0($rcon) + xor $rk0,$i1 + xor $rk0,$i2 + xor $rk0,$i3 + xor $rk0,$i0 + + xor $rk1,$rk0 + xor $rk2,$rk1 + xor $rk3,$rk2 + xor $rk4,$rk3 + xor $rk5,$rk4 + + .set noreorder + bnez $cnt,.L192bits + $PTR_ADD $rcon,4 + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + li $cnt,12 + sw $rk3,12($key) + li $t0,0 + sw $cnt,48($key) + b .Lekey_done + $PTR_SUB $key,12*16 + +.align 4 +.L256bits: + .set reorder + srl $i0,$rk7,16 + srl $i1,$rk7,8 + and $i0,0xff + and $i1,0xff + and $i2,$rk7,0xff + srl $i3,$rk7,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + sw $rk3,12($key) + sw $rk4,16($key) + sw $rk5,20($key) + sw $rk6,24($key) + sw $rk7,28($key) + sub $cnt,1 + + _bias $i0,24 + _bias $i1,16 + _bias $i2,8 + _bias $i3,0 + + xor $rk0,$i0 + lw $i0,0($rcon) + xor $rk0,$i1 + xor $rk0,$i2 + xor $rk0,$i3 + xor $rk0,$i0 + + xor $rk1,$rk0 + xor $rk2,$rk1 + xor $rk3,$rk2 + beqz $cnt,.L256bits_done + + srl $i0,$rk3,24 + srl $i1,$rk3,16 + srl $i2,$rk3,8 + and $i3,$rk3,0xff + and $i1,0xff + and $i2,0xff + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + sll $i0,24 + sll $i1,16 + sll $i2,8 + + xor $rk4,$i0 + xor $rk4,$i1 + xor $rk4,$i2 + xor $rk4,$i3 + + xor $rk5,$rk4 + xor $rk6,$rk5 + xor $rk7,$rk6 + + $PTR_ADD $key,32 + .set noreorder + b .L256bits + $PTR_ADD $rcon,4 + +.L256bits_done: + sw $rk0,32($key) + sw $rk1,36($key) + sw $rk2,40($key) + li $cnt,14 + sw $rk3,44($key) + li $t0,0 + sw $cnt,48($key) + $PTR_SUB $key,12*16 + +.Lekey_done: + jr $ra + nop +.end _mips_AES_set_encrypt_key + +.globl AES_set_encrypt_key +.ent AES_set_encrypt_key +AES_set_encrypt_key: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_set_encrypt_key +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Te # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + .set noreorder + move $a0,$t0 + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_set_encrypt_key +___ + +my ($head,$tail)=($inp,$bits); +my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); +my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2); +$code.=<<___; +.align 5 +.globl AES_set_decrypt_key +.ent AES_set_decrypt_key +AES_set_decrypt_key: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_set_decrypt_key +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Te # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + bltz $t0,.Ldkey_done + + sll $at,$cnt,4 + $PTR_ADD $head,$key,0 + $PTR_ADD $tail,$key,$at +.align 4 +.Lswap: + lw $rk0,0($head) + lw $rk1,4($head) + lw $rk2,8($head) + lw $rk3,12($head) + lw $rk4,0($tail) + lw $rk5,4($tail) + lw $rk6,8($tail) + lw $rk7,12($tail) + sw $rk0,0($tail) + sw $rk1,4($tail) + sw $rk2,8($tail) + sw $rk3,12($tail) + $PTR_ADD $head,16 + $PTR_SUB $tail,16 + sw $rk4,-16($head) + sw $rk5,-12($head) + sw $rk6,-8($head) + sw $rk7,-4($head) + bne $head,$tail,.Lswap + + lw $tp1,16($key) # modulo-scheduled + lui $x80808080,0x8080 + sub $cnt,1 + or $x80808080,0x8080 + sll $cnt,2 + $PTR_ADD $key,16 + lui $x1b1b1b1b,0x1b1b + nor $x7f7f7f7f,$zero,$x80808080 + or $x1b1b1b1b,0x1b1b +.align 4 +.Lmix: + and $m,$tp1,$x80808080 + and $tp2,$tp1,$x7f7f7f7f + srl $tp4,$m,7 + addu $tp2,$tp2 # tp2<<1 + subu $m,$tp4 + and $m,$x1b1b1b1b + xor $tp2,$m + + and $m,$tp2,$x80808080 + and $tp4,$tp2,$x7f7f7f7f + srl $tp8,$m,7 + addu $tp4,$tp4 # tp4<<1 + subu $m,$tp8 + and $m,$x1b1b1b1b + xor $tp4,$m + + and $m,$tp4,$x80808080 + and $tp8,$tp4,$x7f7f7f7f + srl $tp9,$m,7 + addu $tp8,$tp8 # tp8<<1 + subu $m,$tp9 + and $m,$x1b1b1b1b + xor $tp8,$m + + xor $tp9,$tp8,$tp1 + xor $tpe,$tp8,$tp4 + xor $tpb,$tp9,$tp2 + xor $tpd,$tp9,$tp4 + + _ror $tp1,$tpd,16 + xor $tpe,$tp2 + _ror $tp2,$tpd,-16 + xor $tpe,$tp1 + _ror $tp1,$tp9,8 + xor $tpe,$tp2 + _ror $tp2,$tp9,-24 + xor $tpe,$tp1 + _ror $tp1,$tpb,24 + xor $tpe,$tp2 + _ror $tp2,$tpb,-8 + xor $tpe,$tp1 + lw $tp1,4($key) # modulo-scheduled + xor $tpe,$tp2 + sub $cnt,1 + sw $tpe,0($key) + $PTR_ADD $key,4 + bnez $cnt,.Lmix + + li $t0,0 +.Ldkey_done: + .set noreorder + move $a0,$t0 + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_set_decrypt_key +___ +}}} + +###################################################################### +# Tables are kept in endian-neutral manner +$code.=<<___; +.rdata +.align 6 +AES_Te: +.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 +.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d +.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd +.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 +.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 +.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d +.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 +.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a +.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d +.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 +.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb +.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b +.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 +.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea +.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 +.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b +.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c +.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a +.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 +.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f +.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 +.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 +.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 +.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f +.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 +.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e +.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 +.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 +.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 +.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d +.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 +.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f +.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e +.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e +.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 +.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb +.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d +.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce +.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e +.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 +.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 +.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c +.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f +.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed +.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 +.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b +.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 +.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a +.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a +.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 +.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 +.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 +.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 +.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 +.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 +.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 +.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe +.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a +.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc +.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 +.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 +.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 +.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a +.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d +.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 +.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f +.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 +.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 +.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 +.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 +.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 +.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 +.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 +.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f +.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e +.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 +.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 +.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c +.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 +.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 +.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 +.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e +.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a +.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 +.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e +.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 +.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 +.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b +.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 +.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 +.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 +.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 +.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa +.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 +.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e +.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 +.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 +.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 +.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 +.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 +.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c +.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 +.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc +.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 +.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 +.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa +.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 +.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 +.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f +.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 +.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 +.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 +.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 +.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 +.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 +.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 +.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 +.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 +.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff +.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a +.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 +.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 +.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 +.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 +.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 +.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 +.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc +.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a + +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + +.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon +.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 +.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 +.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 +.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 + +.align 6 +AES_Td: +.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 +.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 +.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 +.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 +.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 +.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 +.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 +.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f +.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 +.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 +.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 +.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 +.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 +.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda +.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 +.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 +.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 +.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd +.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 +.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 +.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 +.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 +.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 +.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 +.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 +.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 +.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 +.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a +.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 +.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 +.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 +.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c +.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 +.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 +.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 +.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a +.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 +.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 +.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa +.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 +.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d +.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 +.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 +.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff +.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 +.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 +.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 +.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb +.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 +.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 +.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 +.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e +.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 +.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 +.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 +.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a +.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f +.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e +.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 +.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 +.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 +.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d +.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad +.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 +.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c +.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd +.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc +.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 +.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc +.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 +.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 +.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 +.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 +.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d +.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 +.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 +.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 +.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 +.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a +.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef +.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 +.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 +.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 +.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 +.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d +.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 +.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 +.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 +.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c +.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 +.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 +.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b +.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 +.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 +.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e +.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 +.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce +.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 +.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 +.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 +.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 +.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 +.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 +.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f +.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d +.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf +.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b +.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f +.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d +.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e +.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 +.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 +.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a +.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 +.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 +.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c +.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f +.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf +.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b +.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 +.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e +.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f +.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c +.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 +.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde +.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 +.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 +.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 + +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + # made-up _instructions, _xtr, _ins, _ror and _bias, cope + # with byte order dependencies... + if (/^\s+_/) { + s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/; + + s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/ + sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) + : eval("24-$3"))/e or + s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ + sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) + : eval("24-$3"))/e or + s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/ + sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) + : eval("$3*-1"))/e or + s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ + sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) + : eval("($3-16)&31"))/e; + + s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/ + sprintf("sll\t$1,$2,$3")/e or + s/srl\s+(\$[0-9]+),(\$[0-9]+),0/ + sprintf("and\t$1,$2,0xff")/e or + s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/; + } + + # convert lwl/lwr and swr/swl to little-endian order + if (!$big_endian && /^\s+[sl]w[lr]\s+/) { + s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/ + sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or + s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/ + sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e; + } + + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/aes/asm/aes-mips.s b/crypto/aes/asm/aes-mips.s new file mode 100644 index 0000000..0c1d85a --- /dev/null +++ b/crypto/aes/asm/aes-mips.s @@ -0,0 +1,1341 @@ +.text +#ifdef OPENSSL_FIPSCANISTER +# include <openssl/fipssyms.h> +#endif + +#if !defined(__vxworks) || defined(__pic__) +.option pic2 +#endif +.set noat +.align 5 +.ent _mips_AES_encrypt +_mips_AES_encrypt: + .frame $29,0,$31 + .set reorder + lw $12,0($6) + lw $13,4($6) + lw $14,8($6) + lw $15,12($6) + lw $30,240($6) + add $3,$6,16 + + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + sub $30,1 + srl $1,$9,6 +.Loop_enc: + srl $2,$10,6 + srl $24,$11,6 + srl $25,$8,6 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $12,2($1) # Te1[s1>>16] + lwl $13,2($2) # Te1[s2>>16] + lwl $14,2($24) # Te1[s3>>16] + lwl $15,2($25) # Te1[s0>>16] + lwr $12,3($1) # Te1[s1>>16] + lwr $13,3($2) # Te1[s2>>16] + lwr $14,3($24) # Te1[s3>>16] + lwr $15,3($25) # Te1[s0>>16] + + srl $1,$10,14 + srl $2,$11,14 + srl $24,$8,14 + srl $25,$9,14 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $16,1($1) # Te2[s2>>8] + lwl $17,1($2) # Te2[s3>>8] + lwl $18,1($24) # Te2[s0>>8] + lwl $19,1($25) # Te2[s1>>8] + lwr $16,2($1) # Te2[s2>>8] + lwr $17,2($2) # Te2[s3>>8] + lwr $18,2($24) # Te2[s0>>8] + lwr $19,2($25) # Te2[s1>>8] + + srl $1,$11,22 + srl $2,$8,22 + srl $24,$9,22 + srl $25,$10,22 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $20,0($1) # Te3[s3] + lwl $21,0($2) # Te3[s0] + lwl $22,0($24) # Te3[s1] + lwl $23,0($25) # Te3[s2] + lwr $20,1($1) # Te3[s3] + lwr $21,1($2) # Te3[s0] + lwr $22,1($24) # Te3[s1] + lwr $23,1($25) # Te3[s2] + + sll $1,$8,2 + sll $2,$9,2 + sll $24,$10,2 + sll $25,$11,2 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + lw $16,0($1) # Te0[s0>>24] + lw $17,0($2) # Te0[s1>>24] + lw $18,0($24) # Te0[s2>>24] + lw $19,0($25) # Te0[s3>>24] + + lw $8,0($3) + lw $9,4($3) + lw $10,8($3) + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_enc + srl $1,$9,6 + + .set reorder + srl $2,$10,6 + srl $24,$11,6 + srl $25,$8,6 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $12,2($1) # Te4[s1>>16] + lbu $13,2($2) # Te4[s2>>16] + lbu $14,2($24) # Te4[s3>>16] + lbu $15,2($25) # Te4[s0>>16] + + srl $1,$10,14 + srl $2,$11,14 + srl $24,$8,14 + srl $25,$9,14 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $16,2($1) # Te4[s2>>8] + lbu $17,2($2) # Te4[s3>>8] + lbu $18,2($24) # Te4[s0>>8] + lbu $19,2($25) # Te4[s1>>8] + + sll $1,$8,2 + sll $2,$9,2 + sll $24,$10,2 + sll $25,$11,2 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,2($1) # Te4[s0>>24] + lbu $21,2($2) # Te4[s1>>24] + lbu $22,2($24) # Te4[s2>>24] + lbu $23,2($25) # Te4[s3>>24] + + srl $1,$11,22 + srl $2,$8,22 + srl $24,$9,22 + srl $25,$10,22 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + + sll $12,$12,8 + sll $13,$13,8 + sll $14,$14,8 + sll $15,$15,8 + + sll $16,$16,16 + sll $17,$17,16 + sll $18,$18,16 + sll $19,$19,16 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $16,2($1) # Te4[s3] + lbu $17,2($2) # Te4[s0] + lbu $18,2($24) # Te4[s1] + lbu $19,2($25) # Te4[s2] + + #sll $20,$20,0 + #sll $21,$21,0 + #sll $22,$22,0 + #sll $23,$23,0 + + lw $8,0($3) + lw $9,4($3) + lw $10,8($3) + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + sll $16,$16,24 + sll $17,$17,24 + sll $18,$18,24 + sll $19,$19,24 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + jr $31 +.end _mips_AES_encrypt + +.align 5 +.globl AES_encrypt +.ent AES_encrypt +AES_encrypt: + .frame $29,64,$31 + .mask 3237937152,-4 + .set noreorder + sub $29,64 + sw $31,64-1*4($29) + sw $30,64-2*4($29) + sw $23,64-3*4($29) + sw $22,64-4*4($29) + sw $21,64-5*4($29) + sw $20,64-6*4($29) + sw $19,64-7*4($29) + sw $18,64-8*4($29) + sw $17,64-9*4($29) + sw $16,64-10*4($29) + .cplocal $7 + .cpsetup $25,$0,AES_encrypt + .set reorder + la $7,AES_Te # PIC-ified 'load address' + + lwl $8,0+3($4) + lwl $9,4+3($4) + lwl $10,8+3($4) + lwl $11,12+3($4) + lwr $8,0+0($4) + lwr $9,4+0($4) + lwr $10,8+0($4) + lwr $11,12+0($4) + + bal _mips_AES_encrypt + + swr $8,0+0($5) + swr $9,4+0($5) + swr $10,8+0($5) + swr $11,12+0($5) + swl $8,0+3($5) + swl $9,4+3($5) + swl $10,8+3($5) + swl $11,12+3($5) + + .set noreorder + lw $31,64-1*4($29) + lw $30,64-2*4($29) + lw $23,64-3*4($29) + lw $22,64-4*4($29) + lw $21,64-5*4($29) + lw $20,64-6*4($29) + lw $19,64-7*4($29) + lw $18,64-8*4($29) + lw $17,64-9*4($29) + lw $16,64-10*4($29) + jr $31 + add $29,64 +.end AES_encrypt +.align 5 +.ent _mips_AES_decrypt +_mips_AES_decrypt: + .frame $29,0,$31 + .set reorder + lw $12,0($6) + lw $13,4($6) + lw $14,8($6) + lw $15,12($6) + lw $30,240($6) + add $3,$6,16 + + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + sub $30,1 + srl $1,$11,6 +.Loop_dec: + srl $2,$8,6 + srl $24,$9,6 + srl $25,$10,6 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $12,2($1) # Td1[s3>>16] + lwl $13,2($2) # Td1[s0>>16] + lwl $14,2($24) # Td1[s1>>16] + lwl $15,2($25) # Td1[s2>>16] + lwr $12,3($1) # Td1[s3>>16] + lwr $13,3($2) # Td1[s0>>16] + lwr $14,3($24) # Td1[s1>>16] + lwr $15,3($25) # Td1[s2>>16] + + srl $1,$10,14 + srl $2,$11,14 + srl $24,$8,14 + srl $25,$9,14 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $16,1($1) # Td2[s2>>8] + lwl $17,1($2) # Td2[s3>>8] + lwl $18,1($24) # Td2[s0>>8] + lwl $19,1($25) # Td2[s1>>8] + lwr $16,2($1) # Td2[s2>>8] + lwr $17,2($2) # Td2[s3>>8] + lwr $18,2($24) # Td2[s0>>8] + lwr $19,2($25) # Td2[s1>>8] + + srl $1,$9,22 + srl $2,$10,22 + srl $24,$11,22 + srl $25,$8,22 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lwl $20,0($1) # Td3[s1] + lwl $21,0($2) # Td3[s2] + lwl $22,0($24) # Td3[s3] + lwl $23,0($25) # Td3[s0] + lwr $20,1($1) # Td3[s1] + lwr $21,1($2) # Td3[s2] + lwr $22,1($24) # Td3[s3] + lwr $23,1($25) # Td3[s0] + + sll $1,$8,2 + sll $2,$9,2 + sll $24,$10,2 + sll $25,$11,2 + and $1,0x3fc + and $2,0x3fc + and $24,0x3fc + and $25,0x3fc + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + + lw $16,0($1) # Td0[s0>>24] + lw $17,0($2) # Td0[s1>>24] + lw $18,0($24) # Td0[s2>>24] + lw $19,0($25) # Td0[s3>>24] + + lw $8,0($3) + lw $9,4($3) + lw $10,8($3) + lw $11,12($3) + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + sub $30,1 + add $3,16 + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + .set noreorder + bnez $30,.Loop_dec + srl $1,$11,6 + + .set reorder + lw $16,1024($7) # prefetch Td4 + lw $17,1024+32($7) + lw $18,1024+64($7) + lw $19,1024+96($7) + lw $20,1024+128($7) + lw $21,1024+160($7) + lw $22,1024+192($7) + lw $23,1024+224($7) + + srl $1,$11,8 + srl $2,$8,8 + srl $24,$9,8 + srl $25,$10,8 + and $1,0xff + and $2,0xff + and $24,0xff + and $25,0xff + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $12,1024($1) # Td4[s3>>16] + lbu $13,1024($2) # Td4[s0>>16] + lbu $14,1024($24) # Td4[s1>>16] + lbu $15,1024($25) # Td4[s2>>16] + + srl $1,$10,16 + srl $2,$11,16 + srl $24,$8,16 + srl $25,$9,16 + and $1,0xff + and $2,0xff + and $24,0xff + and $25,0xff + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $16,1024($1) # Td4[s2>>8] + lbu $17,1024($2) # Td4[s3>>8] + lbu $18,1024($24) # Td4[s0>>8] + lbu $19,1024($25) # Td4[s1>>8] + + and $1,$8,0xff + and $2,$9,0xff + and $24,$10,0xff + and $25,$11,0xff + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $20,1024($1) # Td4[s0>>24] + lbu $21,1024($2) # Td4[s1>>24] + lbu $22,1024($24) # Td4[s2>>24] + lbu $23,1024($25) # Td4[s3>>24] + + srl $1,$9,24 + srl $2,$10,24 + srl $24,$11,24 + srl $25,$8,24 + + sll $12,$12,8 + sll $13,$13,8 + sll $14,$14,8 + sll $15,$15,8 + + sll $16,$16,16 + sll $17,$17,16 + sll $18,$18,16 + sll $19,$19,16 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $16,1024($1) # Td4[s1] + lbu $17,1024($2) # Td4[s2] + lbu $18,1024($24) # Td4[s3] + lbu $19,1024($25) # Td4[s0] + + #sll $20,$20,0 + #sll $21,$21,0 + #sll $22,$22,0 + #sll $23,$23,0 + + lw $8,0($3) + lw $9,4($3) + lw $10,8($3) + lw $11,12($3) + + sll $16,$16,24 + sll $17,$17,24 + sll $18,$18,24 + sll $19,$19,24 + + + xor $12,$20 + xor $13,$21 + xor $14,$22 + xor $15,$23 + + xor $12,$16 + xor $13,$17 + xor $14,$18 + xor $15,$19 + + xor $8,$12 + xor $9,$13 + xor $10,$14 + xor $11,$15 + + jr $31 +.end _mips_AES_decrypt + +.align 5 +.globl AES_decrypt +.ent AES_decrypt +AES_decrypt: + .frame $29,64,$31 + .mask 3237937152,-4 + .set noreorder + sub $29,64 + sw $31,64-1*4($29) + sw $30,64-2*4($29) + sw $23,64-3*4($29) + sw $22,64-4*4($29) + sw $21,64-5*4($29) + sw $20,64-6*4($29) + sw $19,64-7*4($29) + sw $18,64-8*4($29) + sw $17,64-9*4($29) + sw $16,64-10*4($29) + .cplocal $7 + .cpsetup $25,$0,AES_decrypt + .set reorder + la $7,AES_Td # PIC-ified 'load address' + + lwl $8,0+3($4) + lwl $9,4+3($4) + lwl $10,8+3($4) + lwl $11,12+3($4) + lwr $8,0+0($4) + lwr $9,4+0($4) + lwr $10,8+0($4) + lwr $11,12+0($4) + + bal _mips_AES_decrypt + + swr $8,0+0($5) + swr $9,4+0($5) + swr $10,8+0($5) + swr $11,12+0($5) + swl $8,0+3($5) + swl $9,4+3($5) + swl $10,8+3($5) + swl $11,12+3($5) + + .set noreorder + lw $31,64-1*4($29) + lw $30,64-2*4($29) + lw $23,64-3*4($29) + lw $22,64-4*4($29) + lw $21,64-5*4($29) + lw $20,64-6*4($29) + lw $19,64-7*4($29) + lw $18,64-8*4($29) + lw $17,64-9*4($29) + lw $16,64-10*4($29) + jr $31 + add $29,64 +.end AES_decrypt +.align 5 +.ent _mips_AES_set_encrypt_key +_mips_AES_set_encrypt_key: + .frame $29,0,$31 + .set noreorder + beqz $4,.Lekey_done + li $2,-1 + beqz $6,.Lekey_done + add $3,$7,1024+256 + + .set reorder + lwl $8,0+3($4) # load 128 bits + lwl $9,4+3($4) + lwl $10,8+3($4) + lwl $11,12+3($4) + li $1,128 + lwr $8,0+0($4) + lwr $9,4+0($4) + lwr $10,8+0($4) + lwr $11,12+0($4) + .set noreorder + beq $5,$1,.L128bits + li $30,10 + + .set reorder + lwl $12,16+3($4) # load 192 bits + lwl $13,20+3($4) + li $1,192 + lwr $12,16+0($4) + lwr $13,20+0($4) + .set noreorder + beq $5,$1,.L192bits + li $30,8 + + .set reorder + lwl $14,24+3($4) # load 256 bits + lwl $15,28+3($4) + li $1,256 + lwr $14,24+0($4) + lwr $15,28+0($4) + .set noreorder + beq $5,$1,.L256bits + li $30,7 + + b .Lekey_done + li $2,-2 + +.align 4 +.L128bits: + .set reorder + srl $1,$11,16 + srl $2,$11,8 + and $1,0xff + and $2,0xff + and $24,$11,0xff + srl $25,$11,24 + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,1024($1) + lbu $2,1024($2) + lbu $24,1024($24) + lbu $25,1024($25) + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + sw $11,12($6) + sub $30,1 + add $6,16 + + sll $1,$1,8 + #sll $2,$2,0 + sll $24,$24,24 + sll $25,$25,16 + + xor $8,$1 + lw $1,0($3) + xor $8,$2 + xor $8,$24 + xor $8,$25 + xor $8,$1 + + xor $9,$8 + xor $10,$9 + xor $11,$10 + + .set noreorder + bnez $30,.L128bits + add $3,4 + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + li $30,10 + sw $11,12($6) + li $2,0 + sw $30,80($6) + b .Lekey_done + sub $6,10*16 + +.align 4 +.L192bits: + .set reorder + srl $1,$13,16 + srl $2,$13,8 + and $1,0xff + and $2,0xff + and $24,$13,0xff + srl $25,$13,24 + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,1024($1) + lbu $2,1024($2) + lbu $24,1024($24) + lbu $25,1024($25) + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + sw $11,12($6) + sw $12,16($6) + sw $13,20($6) + sub $30,1 + add $6,24 + + sll $1,$1,8 + #sll $2,$2,0 + sll $24,$24,24 + sll $25,$25,16 + + xor $8,$1 + lw $1,0($3) + xor $8,$2 + xor $8,$24 + xor $8,$25 + xor $8,$1 + + xor $9,$8 + xor $10,$9 + xor $11,$10 + xor $12,$11 + xor $13,$12 + + .set noreorder + bnez $30,.L192bits + add $3,4 + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + li $30,12 + sw $11,12($6) + li $2,0 + sw $30,48($6) + b .Lekey_done + sub $6,12*16 + +.align 4 +.L256bits: + .set reorder + srl $1,$15,16 + srl $2,$15,8 + and $1,0xff + and $2,0xff + and $24,$15,0xff + srl $25,$15,24 + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,1024($1) + lbu $2,1024($2) + lbu $24,1024($24) + lbu $25,1024($25) + + sw $8,0($6) + sw $9,4($6) + sw $10,8($6) + sw $11,12($6) + sw $12,16($6) + sw $13,20($6) + sw $14,24($6) + sw $15,28($6) + sub $30,1 + + sll $1,$1,8 + #sll $2,$2,0 + sll $24,$24,24 + sll $25,$25,16 + + xor $8,$1 + lw $1,0($3) + xor $8,$2 + xor $8,$24 + xor $8,$25 + xor $8,$1 + + xor $9,$8 + xor $10,$9 + xor $11,$10 + beqz $30,.L256bits_done + + srl $1,$11,24 + srl $2,$11,16 + srl $24,$11,8 + and $25,$11,0xff + and $2,0xff + and $24,0xff + add $1,$7 + add $2,$7 + add $24,$7 + add $25,$7 + lbu $1,1024($1) + lbu $2,1024($2) + lbu $24,1024($24) + lbu $25,1024($25) + sll $1,24 + sll $2,16 + sll $24,8 + + xor $12,$1 + xor $12,$2 + xor $12,$24 + xor $12,$25 + + xor $13,$12 + xor $14,$13 + xor $15,$14 + + add $6,32 + .set noreorder + b .L256bits + add $3,4 + +.L256bits_done: + sw $8,32($6) + sw $9,36($6) + sw $10,40($6) + li $30,14 + sw $11,44($6) + li $2,0 + sw $30,48($6) + sub $6,12*16 + +.Lekey_done: + jr $31 + nop +.end _mips_AES_set_encrypt_key + +.globl AES_set_encrypt_key +.ent AES_set_encrypt_key +AES_set_encrypt_key: + .frame $29,32,$31 + .mask 3221225472,-4 + .set noreorder + sub $29,32 + sw $31,32-1*4($29) + sw $30,32-2*4($29) + .cplocal $7 + .cpsetup $25,$0,AES_set_encrypt_key + .set reorder + la $7,AES_Te # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + .set noreorder + move $4,$2 + lw $31,32-1*4($29) + lw $30,32-2*4($29) + jr $31 + add $29,32 +.end AES_set_encrypt_key +.align 5 +.globl AES_set_decrypt_key +.ent AES_set_decrypt_key +AES_set_decrypt_key: + .frame $29,32,$31 + .mask 3221225472,-4 + .set noreorder + sub $29,32 + sw $31,32-1*4($29) + sw $30,32-2*4($29) + .cplocal $7 + .cpsetup $25,$0,AES_set_decrypt_key + .set reorder + la $7,AES_Te # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + bltz $2,.Ldkey_done + + sll $1,$30,4 + add $4,$6,0 + add $5,$6,$1 +.align 4 +.Lswap: + lw $8,0($4) + lw $9,4($4) + lw $10,8($4) + lw $11,12($4) + lw $12,0($5) + lw $13,4($5) + lw $14,8($5) + lw $15,12($5) + sw $8,0($5) + sw $9,4($5) + sw $10,8($5) + sw $11,12($5) + add $4,16 + sub $5,16 + sw $12,-16($4) + sw $13,-12($4) + sw $14,-8($4) + sw $15,-4($4) + bne $4,$5,.Lswap + + lw $8,16($6) # modulo-scheduled + lui $2,0x8080 + sub $30,1 + or $2,0x8080 + sll $30,2 + add $6,16 + lui $25,0x1b1b + nor $24,$0,$2 + or $25,0x1b1b +.align 4 +.Lmix: + and $1,$8,$2 + and $9,$8,$24 + srl $10,$1,7 + addu $9,$9 # tp2<<1 + subu $1,$10 + and $1,$25 + xor $9,$1 + + and $1,$9,$2 + and $10,$9,$24 + srl $11,$1,7 + addu $10,$10 # tp4<<1 + subu $1,$11 + and $1,$25 + xor $10,$1 + + and $1,$10,$2 + and $11,$10,$24 + srl $12,$1,7 + addu $11,$11 # tp8<<1 + subu $1,$12 + and $1,$25 + xor $11,$1 + + xor $12,$11,$8 + xor $15,$11,$10 + xor $13,$12,$9 + xor $14,$12,$10 + + sll $8,$14,16 + xor $15,$9 + srl $9,$14,16 + xor $15,$8 + sll $8,$12,8 + xor $15,$9 + srl $9,$12,24 + xor $15,$8 + sll $8,$13,24 + xor $15,$9 + srl $9,$13,8 + xor $15,$8 + lw $8,4($6) # modulo-scheduled + xor $15,$9 + sub $30,1 + sw $15,0($6) + add $6,4 + bnez $30,.Lmix + + li $2,0 +.Ldkey_done: + .set noreorder + move $4,$2 + lw $31,32-1*4($29) + lw $30,32-2*4($29) + jr $31 + add $29,32 +.end AES_set_decrypt_key +.rdata +.align 6 +AES_Te: +.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 +.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d +.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd +.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 +.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 +.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d +.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 +.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a +.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d +.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 +.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb +.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b +.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 +.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea +.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 +.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b +.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c +.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a +.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 +.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f +.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 +.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 +.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 +.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f +.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 +.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e +.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 +.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 +.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 +.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d +.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 +.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f +.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e +.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e +.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 +.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb +.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d +.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce +.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e +.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 +.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 +.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c +.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f +.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed +.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 +.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b +.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 +.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a +.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a +.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 +.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 +.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 +.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 +.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 +.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 +.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 +.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe +.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a +.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc +.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 +.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 +.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 +.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a +.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d +.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 +.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f +.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 +.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 +.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 +.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 +.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 +.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 +.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 +.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f +.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e +.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 +.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 +.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c +.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 +.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 +.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 +.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e +.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a +.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 +.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e +.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 +.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 +.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b +.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 +.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 +.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 +.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 +.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa +.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 +.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e +.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 +.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 +.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 +.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 +.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 +.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c +.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 +.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc +.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 +.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 +.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa +.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 +.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 +.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f +.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 +.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 +.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 +.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 +.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 +.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 +.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 +.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 +.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 +.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff +.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a +.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 +.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 +.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 +.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 +.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 +.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 +.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc +.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a + +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + +.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon +.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 +.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 +.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 +.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 + +.align 6 +AES_Td: +.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 +.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 +.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 +.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 +.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 +.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 +.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 +.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f +.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 +.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 +.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 +.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 +.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 +.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda +.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 +.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 +.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 +.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd +.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 +.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 +.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 +.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 +.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 +.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 +.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 +.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 +.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 +.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a +.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 +.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 +.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 +.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c +.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 +.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 +.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 +.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a +.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 +.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 +.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa +.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 +.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d +.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 +.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 +.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff +.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 +.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 +.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 +.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb +.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 +.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 +.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 +.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e +.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 +.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 +.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 +.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a +.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f +.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e +.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 +.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 +.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 +.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d +.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad +.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 +.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c +.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd +.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc +.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 +.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc +.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 +.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 +.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 +.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 +.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d +.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 +.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 +.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 +.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 +.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a +.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef +.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 +.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 +.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 +.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 +.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d +.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 +.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 +.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 +.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c +.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 +.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 +.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b +.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 +.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 +.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e +.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 +.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce +.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 +.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 +.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 +.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 +.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 +.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 +.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f +.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d +.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf +.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b +.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f +.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d +.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e +.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 +.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 +.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a +.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 +.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 +.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c +.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f +.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf +.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b +.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 +.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e +.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f +.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c +.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 +.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde +.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 +.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 +.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 + +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d diff --git a/crypto/bn/asm/bn-mips.s b/crypto/bn/asm/bn-mips.s new file mode 100644 index 0000000..d1535b1 --- /dev/null +++ b/crypto/bn/asm/bn-mips.s @@ -0,0 +1,2177 @@ +.set mips2 +.rdata +.asciiz "mips3.s, Version 1.2" +.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" + +.text +.set noat + +.align 5 +.globl bn_mul_add_words +.ent bn_mul_add_words +bn_mul_add_words: + .set noreorder + bgtz $6,bn_mul_add_words_internal + move $2,$0 + jr $31 + move $4,$2 +.end bn_mul_add_words + +.align 5 +.ent bn_mul_add_words_internal +bn_mul_add_words_internal: + .set reorder + li $3,-4 + and $8,$6,$3 + lw $12,0($5) + beqz $8,.L_bn_mul_add_words_tail + +.L_bn_mul_add_words_loop: + multu $12,$7 + lw $13,0($4) + lw $14,4($5) + lw $15,4($4) + lw $8,2*4($5) + lw $9,2*4($4) + addu $13,$2 + sltu $2,$13,$2 # All manuals say it "compares 32-bit + # values", but it seems to work fine + # even on 64-bit registers. + mflo $1 + mfhi $12 + addu $13,$1 + addu $2,$12 + multu $14,$7 + sltu $1,$13,$1 + sw $13,0($4) + addu $2,$1 + + lw $10,3*4($5) + lw $11,3*4($4) + addu $15,$2 + sltu $2,$15,$2 + mflo $1 + mfhi $14 + addu $15,$1 + addu $2,$14 + multu $8,$7 + sltu $1,$15,$1 + sw $15,4($4) + addu $2,$1 + + subu $6,4 + addu $4,4*4 + addu $5,4*4 + addu $9,$2 + sltu $2,$9,$2 + mflo $1 + mfhi $8 + addu $9,$1 + addu $2,$8 + multu $10,$7 + sltu $1,$9,$1 + sw $9,-2*4($4) + addu $2,$1 + + + and $8,$6,$3 + addu $11,$2 + sltu $2,$11,$2 + mflo $1 + mfhi $10 + addu $11,$1 + addu $2,$10 + sltu $1,$11,$1 + sw $11,-4($4) + addu $2,$1 + .set noreorder + bgtzl $8,.L_bn_mul_add_words_loop + lw $12,0($5) + + beqz $6,.L_bn_mul_add_words_return + nop + +.L_bn_mul_add_words_tail: + .set reorder + lw $12,0($5) + multu $12,$7 + lw $13,0($4) + subu $6,1 + addu $13,$2 + sltu $2,$13,$2 + mflo $1 + mfhi $12 + addu $13,$1 + addu $2,$12 + sltu $1,$13,$1 + sw $13,0($4) + addu $2,$1 + beqz $6,.L_bn_mul_add_words_return + + lw $12,4($5) + multu $12,$7 + lw $13,4($4) + subu $6,1 + addu $13,$2 + sltu $2,$13,$2 + mflo $1 + mfhi $12 + addu $13,$1 + addu $2,$12 + sltu $1,$13,$1 + sw $13,4($4) + addu $2,$1 + beqz $6,.L_bn_mul_add_words_return + + lw $12,2*4($5) + multu $12,$7 + lw $13,2*4($4) + addu $13,$2 + sltu $2,$13,$2 + mflo $1 + mfhi $12 + addu $13,$1 + addu $2,$12 + sltu $1,$13,$1 + sw $13,2*4($4) + addu $2,$1 + +.L_bn_mul_add_words_return: + .set noreorder + jr $31 + move $4,$2 +.end bn_mul_add_words_internal + +.align 5 +.globl bn_mul_words +.ent bn_mul_words +bn_mul_words: + .set noreorder + bgtz $6,bn_mul_words_internal + move $2,$0 + jr $31 + move $4,$2 +.end bn_mul_words + +.align 5 +.ent bn_mul_words_internal +bn_mul_words_internal: + .set reorder + li $3,-4 + and $8,$6,$3 + lw $12,0($5) + beqz $8,.L_bn_mul_words_tail + +.L_bn_mul_words_loop: + multu $12,$7 + lw $14,4($5) + lw $8,2*4($5) + lw $10,3*4($5) + mflo $1 + mfhi $12 + addu $2,$1 + sltu $13,$2,$1 + multu $14,$7 + sw $2,0($4) + addu $2,$13,$12 + + subu $6,4 + addu $4,4*4 + addu $5,4*4 + mflo $1 + mfhi $14 + addu $2,$1 + sltu $15,$2,$1 + multu $8,$7 + sw $2,-3*4($4) + addu $2,$15,$14 + + mflo $1 + mfhi $8 + addu $2,$1 + sltu $9,$2,$1 + multu $10,$7 + sw $2,-2*4($4) + addu $2,$9,$8 + + and $8,$6,$3 + mflo $1 + mfhi $10 + addu $2,$1 + sltu $11,$2,$1 + sw $2,-4($4) + addu $2,$11,$10 + .set noreorder + bgtzl $8,.L_bn_mul_words_loop + lw $12,0($5) + + beqz $6,.L_bn_mul_words_return + nop + +.L_bn_mul_words_tail: + .set reorder + lw $12,0($5) + multu $12,$7 + subu $6,1 + mflo $1 + mfhi $12 + addu $2,$1 + sltu $13,$2,$1 + sw $2,0($4) + addu $2,$13,$12 + beqz $6,.L_bn_mul_words_return + + lw $12,4($5) + multu $12,$7 + subu $6,1 + mflo $1 + mfhi $12 + addu $2,$1 + sltu $13,$2,$1 + sw $2,4($4) + addu $2,$13,$12 + beqz $6,.L_bn_mul_words_return + + lw $12,2*4($5) + multu $12,$7 + mflo $1 + mfhi $12 + addu $2,$1 + sltu $13,$2,$1 + sw $2,2*4($4) + addu $2,$13,$12 + +.L_bn_mul_words_return: + .set noreorder + jr $31 + move $4,$2 +.end bn_mul_words_internal + +.align 5 +.globl bn_sqr_words +.ent bn_sqr_words +bn_sqr_words: + .set noreorder + bgtz $6,bn_sqr_words_internal + move $2,$0 + jr $31 + move $4,$2 +.end bn_sqr_words + +.align 5 +.ent bn_sqr_words_internal +bn_sqr_words_internal: + .set reorder + li $3,-4 + and $8,$6,$3 + lw $12,0($5) + beqz $8,.L_bn_sqr_words_tail + +.L_bn_sqr_words_loop: + multu $12,$12 + lw $14,4($5) + lw $8,2*4($5) + lw $10,3*4($5) + mflo $13 + mfhi $12 + sw $13,0($4) + sw $12,4($4) + + multu $14,$14 + subu $6,4 + addu $4,8*4 + addu $5,4*4 + mflo $15 + mfhi $14 + sw $15,-6*4($4) + sw $14,-5*4($4) + + multu $8,$8 + mflo $9 + mfhi $8 + sw $9,-4*4($4) + sw $8,-3*4($4) + + + multu $10,$10 + and $8,$6,$3 + mflo $11 + mfhi $10 + sw $11,-2*4($4) + sw $10,-4($4) + + .set noreorder + bgtzl $8,.L_bn_sqr_words_loop + lw $12,0($5) + + beqz $6,.L_bn_sqr_words_return + nop + +.L_bn_sqr_words_tail: + .set reorder + lw $12,0($5) + multu $12,$12 + subu $6,1 + mflo $13 + mfhi $12 + sw $13,0($4) + sw $12,4($4) + beqz $6,.L_bn_sqr_words_return + + lw $12,4($5) + multu $12,$12 + subu $6,1 + mflo $13 + mfhi $12 + sw $13,2*4($4) + sw $12,3*4($4) + beqz $6,.L_bn_sqr_words_return + + lw $12,2*4($5) + multu $12,$12 + mflo $13 + mfhi $12 + sw $13,4*4($4) + sw $12,5*4($4) + +.L_bn_sqr_words_return: + .set noreorder + jr $31 + move $4,$2 + +.end bn_sqr_words_internal + +.align 5 +.globl bn_add_words +.ent bn_add_words +bn_add_words: + .set noreorder + bgtz $7,bn_add_words_internal + move $2,$0 + jr $31 + move $4,$2 +.end bn_add_words + +.align 5 +.ent bn_add_words_internal +bn_add_words_internal: + .set reorder + li $3,-4 + and $1,$7,$3 + lw $12,0($5) + beqz $1,.L_bn_add_words_tail + +.L_bn_add_words_loop: + lw $8,0($6) + subu $7,4 + lw $13,4($5) + and $1,$7,$3 + lw $14,2*4($5) + addu $6,4*4 + lw $15,3*4($5) + addu $4,4*4 + lw $9,-3*4($6) + addu $5,4*4 + lw $10,-2*4($6) + lw $11,-4($6) + addu $8,$12 + sltu $24,$8,$12 + addu $12,$8,$2 + sltu $2,$12,$8 + sw $12,-4*4($4) + addu $2,$24 + + addu $9,$13 + sltu $25,$9,$13 + addu $13,$9,$2 + sltu $2,$13,$9 + sw $13,-3*4($4) + addu $2,$25 + + addu $10,$14 + sltu $24,$10,$14 + addu $14,$10,$2 + sltu $2,$14,$10 + sw $14,-2*4($4) + addu $2,$24 + + addu $11,$15 + sltu $25,$11,$15 + addu $15,$11,$2 + sltu $2,$15,$11 + sw $15,-4($4) + addu $2,$25 + + .set noreorder + bgtzl $1,.L_bn_add_words_loop + lw $12,0($5) + + beqz $7,.L_bn_add_words_return + nop + +.L_bn_add_words_tail: + .set reorder + lw $12,0($5) + lw $8,0($6) + addu $8,$12 + subu $7,1 + sltu $24,$8,$12 + addu $12,$8,$2 + sltu $2,$12,$8 + sw $12,0($4) + addu $2,$24 + beqz $7,.L_bn_add_words_return + + lw $13,4($5) + lw $9,4($6) + addu $9,$13 + subu $7,1 + sltu $25,$9,$13 + addu $13,$9,$2 + sltu $2,$13,$9 + sw $13,4($4) + addu $2,$25 + beqz $7,.L_bn_add_words_return + + lw $14,2*4($5) + lw $10,2*4($6) + addu $10,$14 + sltu $24,$10,$14 + addu $14,$10,$2 + sltu $2,$14,$10 + sw $14,2*4($4) + addu $2,$24 + +.L_bn_add_words_return: + .set noreorder + jr $31 + move $4,$2 + +.end bn_add_words_internal + +.align 5 +.globl bn_sub_words +.ent bn_sub_words +bn_sub_words: + .set noreorder + bgtz $7,bn_sub_words_internal + move $2,$0 + jr $31 + move $4,$0 +.end bn_sub_words + +.align 5 +.ent bn_sub_words_internal +bn_sub_words_internal: + .set reorder + li $3,-4 + and $1,$7,$3 + lw $12,0($5) + beqz $1,.L_bn_sub_words_tail + +.L_bn_sub_words_loop: + lw $8,0($6) + subu $7,4 + lw $13,4($5) + and $1,$7,$3 + lw $14,2*4($5) + addu $6,4*4 + lw $15,3*4($5) + addu $4,4*4 + lw $9,-3*4($6) + addu $5,4*4 + lw $10,-2*4($6) + lw $11,-4($6) + sltu $24,$12,$8 + subu $8,$12,$8 + subu $12,$8,$2 + sgtu $2,$12,$8 + sw $12,-4*4($4) + addu $2,$24 + + sltu $25,$13,$9 + subu $9,$13,$9 + subu $13,$9,$2 + sgtu $2,$13,$9 + sw $13,-3*4($4) + addu $2,$25 + + + sltu $24,$14,$10 + subu $10,$14,$10 + subu $14,$10,$2 + sgtu $2,$14,$10 + sw $14,-2*4($4) + addu $2,$24 + + sltu $25,$15,$11 + subu $11,$15,$11 + subu $15,$11,$2 + sgtu $2,$15,$11 + sw $15,-4($4) + addu $2,$25 + + .set noreorder + bgtzl $1,.L_bn_sub_words_loop + lw $12,0($5) + + beqz $7,.L_bn_sub_words_return + nop + +.L_bn_sub_words_tail: + .set reorder + lw $12,0($5) + lw $8,0($6) + subu $7,1 + sltu $24,$12,$8 + subu $8,$12,$8 + subu $12,$8,$2 + sgtu $2,$12,$8 + sw $12,0($4) + addu $2,$24 + beqz $7,.L_bn_sub_words_return + + lw $13,4($5) + subu $7,1 + lw $9,4($6) + sltu $25,$13,$9 + subu $9,$13,$9 + subu $13,$9,$2 + sgtu $2,$13,$9 + sw $13,4($4) + addu $2,$25 + beqz $7,.L_bn_sub_words_return + + lw $14,2*4($5) + lw $10,2*4($6) + sltu $24,$14,$10 + subu $10,$14,$10 + subu $14,$10,$2 + sgtu $2,$14,$10 + sw $14,2*4($4) + addu $2,$24 + +.L_bn_sub_words_return: + .set noreorder + jr $31 + move $4,$2 +.end bn_sub_words_internal + +.align 5 +.globl bn_div_3_words +.ent bn_div_3_words +bn_div_3_words: + .set noreorder + move $7,$4 # we know that bn_div_words does not + # touch $7, $10, $11 and preserves $6 + # so that we can save two arguments + # and return address in registers + # instead of stack:-) + + lw $4,($7) + move $10,$5 + bne $4,$6,bn_div_3_words_internal + lw $5,-4($7) + li $2,-1 + jr $31 + move $4,$2 +.end bn_div_3_words + +.align 5 +.ent bn_div_3_words_internal +bn_div_3_words_internal: + .set reorder + move $11,$31 + bal bn_div_words + move $31,$11 + multu $10,$2 + lw $14,-2*4($7) + move $8,$0 + mfhi $13 + mflo $12 + sltu $24,$13,$5 +.L_bn_div_3_words_inner_loop: + bnez $24,.L_bn_div_3_words_inner_loop_done + sgeu $1,$14,$12 + seq $25,$13,$5 + and $1,$25 + sltu $15,$12,$10 + addu $5,$6 + subu $13,$15 + subu $12,$10 + sltu $24,$13,$5 + sltu $8,$5,$6 + or $24,$8 + .set noreorder + beqzl $1,.L_bn_div_3_words_inner_loop + subu $2,1 + .set reorder +.L_bn_div_3_words_inner_loop_done: + .set noreorder + jr $31 + move $4,$2 +.end bn_div_3_words_internal + +.align 5 +.globl bn_div_words +.ent bn_div_words +bn_div_words: + .set noreorder + bnez $6,bn_div_words_internal + li $2,-1 # I would rather signal div-by-zero + # which can be done with 'break 7' + jr $31 + move $4,$2 +.end bn_div_words + +.align 5 +.ent bn_div_words_internal +bn_div_words_internal: + move $3,$0 + bltz $6,.L_bn_div_words_body + move $25,$3 + sll $6,1 + bgtz $6,.-4 + addu $25,1 + + .set reorder + negu $13,$25 + li $14,-1 + sll $14,$13 + and $14,$4 + srl $1,$5,$13 + .set noreorder + bnezl $14,.+8 + break 6 # signal overflow + .set reorder + sll $4,$25 + sll $5,$25 + or $4,$1 +.L_bn_div_words_body: + srl $3,$6,4*4 # bits + sgeu $1,$4,$6 + .set noreorder + bnezl $1,.+8 + subu $4,$6 + .set reorder + + li $8,-1 + srl $9,$4,4*4 # bits + srl $8,4*4 # q=0xffffffff + beq $3,$9,.L_bn_div_words_skip_div1 + divu $0,$4,$3 + mflo $8 +.L_bn_div_words_skip_div1: + multu $6,$8 + sll $15,$4,4*4 # bits + srl $1,$5,4*4 # bits + or $15,$1 + mflo $12 + mfhi $13 +.L_bn_div_words_inner_loop1: + sltu $14,$15,$12 + seq $24,$9,$13 + sltu $1,$9,$13 + and $14,$24 + sltu $2,$12,$6 + or $1,$14 + .set noreorder + beqz $1,.L_bn_div_words_inner_loop1_done + subu $13,$2 + subu $12,$6 + b .L_bn_div_words_inner_loop1 + subu $8,1 + .set reorder +.L_bn_div_words_inner_loop1_done: + + sll $5,4*4 # bits + subu $4,$15,$12 + sll $2,$8,4*4 # bits + + li $8,-1 + srl $9,$4,4*4 # bits + srl $8,4*4 # q=0xffffffff + beq $3,$9,.L_bn_div_words_skip_div2 + divu $0,$4,$3 + mflo $8 +.L_bn_div_words_skip_div2: + multu $6,$8 + sll $15,$4,4*4 # bits + srl $1,$5,4*4 # bits + or $15,$1 + mflo $12 + mfhi $13 +.L_bn_div_words_inner_loop2: + sltu $14,$15,$12 + seq $24,$9,$13 + sltu $1,$9,$13 + and $14,$24 + sltu $3,$12,$6 + or $1,$14 + .set noreorder + beqz $1,.L_bn_div_words_inner_loop2_done + subu $13,$3 + subu $12,$6 + b .L_bn_div_words_inner_loop2 + subu $8,1 + .set reorder +.L_bn_div_words_inner_loop2_done: + + subu $4,$15,$12 + or $2,$8 + srl $3,$4,$25 # $3 contains remainder if anybody wants it + srl $6,$25 # restore $6 + + .set noreorder + move $5,$3 + jr $31 + move $4,$2 +.end bn_div_words_internal + +.align 5 +.globl bn_mul_comba8 +.ent bn_mul_comba8 +bn_mul_comba8: + .set noreorder + .frame $29,6*4,$31 + .mask 0x003f0000,-4 + subu $29,6*4 + sw $21,5*4($29) + sw $20,4*4($29) + sw $19,3*4($29) + sw $18,2*4($29) + sw $17,1*4($29) + sw $16,0*4($29) + + .set reorder + lw $12,0($5) # If compiled with -mips3 option on + # R5000 box assembler barks on this + # 1ine with "should not have mult/div + # as last instruction in bb (R10K + # bug)" warning. If anybody out there + # has a clue about how to circumvent + # this do send me a note. + # <appro@fy.chalmers.se> + + lw $8,0($6) + lw $13,4($5) + lw $14,2*4($5) + multu $12,$8 # mul_add_c(a[0],b[0],c1,c2,c3); + lw $15,3*4($5) + lw $9,4($6) + lw $10,2*4($6) + lw $11,3*4($6) + mflo $2 + mfhi $3 + + lw $16,4*4($5) + lw $18,5*4($5) + multu $12,$9 # mul_add_c(a[0],b[1],c2,c3,c1); + lw $20,6*4($5) + lw $5,7*4($5) + lw $17,4*4($6) + lw $19,5*4($6) + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $13,$8 # mul_add_c(a[1],b[0],c2,c3,c1); + addu $7,$25,$1 + lw $21,6*4($6) + lw $6,7*4($6) + sw $2,0($4) # r[0]=c1; + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $14,$8 # mul_add_c(a[2],b[0],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $2,$7,$25 + sw $3,4($4) # r[1]=c2; + + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $13,$9 # mul_add_c(a[1],b[1],c3,c1,c2); + addu $25,$1 + addu $2,$25 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $12,$10 # mul_add_c(a[0],b[2],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $3,$2,$25 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $12,$11 # mul_add_c(a[0],b[3],c1,c2,c3); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,2*4($4) # r[2]=c3; + + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $13,$10 # mul_add_c(a[1],b[2],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $7,$3,$25 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $14,$9 # mul_add_c(a[2],b[1],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $15,$8 # mul_add_c(a[3],b[0],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $16,$8 # mul_add_c(a[4],b[0],c2,c3,c1); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,3*4($4) # r[3]=c1; + + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $15,$9 # mul_add_c(a[3],b[1],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $2,$7,$25 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $14,$10 # mul_add_c(a[2],b[2],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $13,$11 # mul_add_c(a[1],b[3],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $12,$17 # mul_add_c(a[0],b[4],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $12,$19 # mul_add_c(a[0],b[5],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,4*4($4) # r[4]=c2; + + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $13,$17 # mul_add_c(a[1],b[4],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $3,$2,$25 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $14,$11 # mul_add_c(a[2],b[3],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $15,$10 # mul_add_c(a[3],b[2],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $16,$9 # mul_add_c(a[4],b[1],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $18,$8 # mul_add_c(a[5],b[0],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $20,$8 # mul_add_c(a[6],b[0],c1,c2,c3); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,5*4($4) # r[5]=c3; + + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $18,$9 # mul_add_c(a[5],b[1],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $7,$3,$25 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $16,$10 # mul_add_c(a[4],b[2],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $15,$11 # mul_add_c(a[3],b[3],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $14,$17 # mul_add_c(a[2],b[4],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $13,$19 # mul_add_c(a[1],b[5],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $12,$21 # mul_add_c(a[0],b[6],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $12,$6 # mul_add_c(a[0],b[7],c2,c3,c1); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,6*4($4) # r[6]=c1; + + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $13,$21 # mul_add_c(a[1],b[6],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $2,$7,$25 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $14,$19 # mul_add_c(a[2],b[5],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $15,$17 # mul_add_c(a[3],b[4],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $16,$11 # mul_add_c(a[4],b[3],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $18,$10 # mul_add_c(a[5],b[2],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $20,$9 # mul_add_c(a[6],b[1],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $5,$8 # mul_add_c(a[7],b[0],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $5,$9 # mul_add_c(a[7],b[1],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,7*4($4) # r[7]=c2; + + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $20,$10 # mul_add_c(a[6],b[2],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $3,$2,$25 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $18,$11 # mul_add_c(a[5],b[3],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $16,$17 # mul_add_c(a[4],b[4],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $15,$19 # mul_add_c(a[3],b[5],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $14,$21 # mul_add_c(a[2],b[6],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $13,$6 # mul_add_c(a[1],b[7],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $14,$6 # mul_add_c(a[2],b[7],c1,c2,c3); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,8*4($4) # r[8]=c3; + + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $15,$21 # mul_add_c(a[3],b[6],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $7,$3,$25 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $16,$19 # mul_add_c(a[4],b[5],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $18,$17 # mul_add_c(a[5],b[4],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $20,$11 # mul_add_c(a[6],b[3],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $5,$10 # mul_add_c(a[7],b[2],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $5,$11 # mul_add_c(a[7],b[3],c2,c3,c1); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,9*4($4) # r[9]=c1; + + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $20,$17 # mul_add_c(a[6],b[4],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $2,$7,$25 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $18,$19 # mul_add_c(a[5],b[5],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $16,$21 # mul_add_c(a[4],b[6],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $15,$6 # mul_add_c(a[3],b[7],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $16,$6 # mul_add_c(a[4],b[7],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,10*4($4) # r[10]=c2; + + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $18,$21 # mul_add_c(a[5],b[6],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $3,$2,$25 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $20,$19 # mul_add_c(a[6],b[5],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $5,$17 # mul_add_c(a[7],b[4],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $5,$19 # mul_add_c(a[7],b[5],c1,c2,c3); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,11*4($4) # r[11]=c3; + + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $20,$21 # mul_add_c(a[6],b[6],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $7,$3,$25 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $18,$6 # mul_add_c(a[5],b[7],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $20,$6 # mul_add_c(a[6],b[7],c2,c3,c1); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,12*4($4) # r[12]=c1; + + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $5,$21 # mul_add_c(a[7],b[6],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $2,$7,$25 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $5,$6 # mul_add_c(a[7],b[7],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,13*4($4) # r[13]=c2; + + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sw $7,14*4($4) # r[14]=c3; + sw $2,15*4($4) # r[15]=c1; + + .set noreorder + lw $21,5*4($29) + lw $20,4*4($29) + lw $19,3*4($29) + lw $18,2*4($29) + lw $17,1*4($29) + lw $16,0*4($29) + jr $31 + addu $29,6*4 +.end bn_mul_comba8 + +.align 5 +.globl bn_mul_comba4 +.ent bn_mul_comba4 +bn_mul_comba4: + .set reorder + lw $12,0($5) + lw $8,0($6) + lw $13,4($5) + lw $14,2*4($5) + multu $12,$8 # mul_add_c(a[0],b[0],c1,c2,c3); + lw $15,3*4($5) + lw $9,4($6) + lw $10,2*4($6) + lw $11,3*4($6) + mflo $2 + mfhi $3 + sw $2,0($4) + + multu $12,$9 # mul_add_c(a[0],b[1],c2,c3,c1); + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $13,$8 # mul_add_c(a[1],b[0],c2,c3,c1); + addu $7,$25,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $14,$8 # mul_add_c(a[2],b[0],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $2,$7,$25 + sw $3,4($4) + + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $13,$9 # mul_add_c(a[1],b[1],c3,c1,c2); + addu $25,$1 + addu $2,$25 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $12,$10 # mul_add_c(a[0],b[2],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $3,$2,$25 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $12,$11 # mul_add_c(a[0],b[3],c1,c2,c3); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,2*4($4) + + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $13,$10 # mul_add_c(a[1],b[2],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $7,$3,$25 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $14,$9 # mul_add_c(a[2],b[1],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $15,$8 # mul_add_c(a[3],b[0],c1,c2,c3); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $15,$9 # mul_add_c(a[3],b[1],c2,c3,c1); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,3*4($4) + + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $14,$10 # mul_add_c(a[2],b[2],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $2,$7,$25 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $13,$11 # mul_add_c(a[1],b[3],c2,c3,c1); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $14,$11 # mul_add_c(a[2],b[3],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,4*4($4) + + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $15,$10 # mul_add_c(a[3],b[2],c3,c1,c2); + addu $25,$1 + addu $2,$25 + sltu $3,$2,$25 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $15,$11 # mul_add_c(a[3],b[3],c1,c2,c3); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,5*4($4) + + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sw $2,6*4($4) + sw $3,7*4($4) + + .set noreorder + jr $31 + nop +.end bn_mul_comba4 + +.align 5 +.globl bn_sqr_comba8 +.ent bn_sqr_comba8 +bn_sqr_comba8: + .set reorder + lw $12,0($5) + lw $13,4($5) + lw $14,2*4($5) + lw $15,3*4($5) + + multu $12,$12 # mul_add_c(a[0],b[0],c1,c2,c3); + lw $8,4*4($5) + lw $9,5*4($5) + lw $10,6*4($5) + lw $11,7*4($5) + mflo $2 + mfhi $3 + sw $2,0($4) + + multu $12,$13 # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo $24 + mfhi $25 + slt $2,$25,$0 + sll $25,1 + multu $14,$12 # mul_add_c2(a[2],b[0],c3,c1,c2); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $7,$25,$1 + sw $3,4($4) + + mflo $24 + mfhi $25 + slt $3,$25,$0 + sll $25,1 + multu $13,$13 # mul_add_c(a[1],b[1],c3,c1,c2); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $12,$15 # mul_add_c2(a[0],b[3],c1,c2,c3); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,2*4($4) + + mflo $24 + mfhi $25 + slt $7,$25,$0 + sll $25,1 + multu $13,$14 # mul_add_c2(a[1],b[2],c1,c2,c3); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $7,$1 + multu $8,$12 # mul_add_c2(a[4],b[0],c2,c3,c1); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,3*4($4) + + mflo $24 + mfhi $25 + slt $2,$25,$0 + sll $25,1 + multu $15,$13 # mul_add_c2(a[3],b[1],c2,c3,c1); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $2,$1 + multu $14,$14 # mul_add_c(a[2],b[2],c2,c3,c1); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $12,$9 # mul_add_c2(a[0],b[5],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,4*4($4) + + mflo $24 + mfhi $25 + slt $3,$25,$0 + sll $25,1 + multu $13,$8 # mul_add_c2(a[1],b[4],c3,c1,c2); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $3,$1 + multu $14,$15 # mul_add_c2(a[2],b[3],c3,c1,c2); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + multu $10,$12 # mul_add_c2(a[6],b[0],c1,c2,c3); + addu $3,$1 + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,5*4($4) + + mflo $24 + mfhi $25 + slt $7,$25,$0 + sll $25,1 + multu $9,$13 # mul_add_c2(a[5],b[1],c1,c2,c3); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $7,$1 + multu $8,$14 # mul_add_c2(a[4],b[2],c1,c2,c3); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $7,$1 + multu $15,$15 # mul_add_c(a[3],b[3],c1,c2,c3); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $12,$11 # mul_add_c2(a[0],b[7],c2,c3,c1); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,6*4($4) + + mflo $24 + mfhi $25 + slt $2,$25,$0 + sll $25,1 + multu $13,$10 # mul_add_c2(a[1],b[6],c2,c3,c1); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $2,$1 + multu $14,$9 # mul_add_c2(a[2],b[5],c2,c3,c1); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $2,$1 + multu $15,$8 # mul_add_c2(a[3],b[4],c2,c3,c1); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $2,$1 + multu $11,$13 # mul_add_c2(a[7],b[1],c3,c1,c2); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,7*4($4) + + mflo $24 + mfhi $25 + slt $3,$25,$0 + sll $25,1 + multu $10,$14 # mul_add_c2(a[6],b[2],c3,c1,c2); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $3,$1 + multu $9,$15 # mul_add_c2(a[5],b[3],c3,c1,c2); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $3,$1 + multu $8,$8 # mul_add_c(a[4],b[4],c3,c1,c2); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $14,$11 # mul_add_c2(a[2],b[7],c1,c2,c3); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,8*4($4) + + mflo $24 + mfhi $25 + slt $7,$25,$0 + sll $25,1 + multu $15,$10 # mul_add_c2(a[3],b[6],c1,c2,c3); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $7,$1 + multu $8,$9 # mul_add_c2(a[4],b[5],c1,c2,c3); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $7,$1 + multu $11,$15 # mul_add_c2(a[7],b[3],c2,c3,c1); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,9*4($4) + + mflo $24 + mfhi $25 + slt $2,$25,$0 + sll $25,1 + multu $10,$8 # mul_add_c2(a[6],b[4],c2,c3,c1); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $2,$1 + multu $9,$9 # mul_add_c(a[5],b[5],c2,c3,c1); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $8,$11 # mul_add_c2(a[4],b[7],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,10*4($4) + + mflo $24 + mfhi $25 + slt $3,$25,$0 + sll $25,1 + multu $9,$10 # mul_add_c2(a[5],b[6],c3,c1,c2); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $3,$1 + multu $11,$9 # mul_add_c2(a[7],b[5],c1,c2,c3); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,11*4($4) + + mflo $24 + mfhi $25 + slt $7,$25,$0 + sll $25,1 + multu $10,$10 # mul_add_c(a[6],b[6],c1,c2,c3); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + multu $10,$11 # mul_add_c2(a[6],b[7],c2,c3,c1); + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,12*4($4) + + mflo $24 + mfhi $25 + slt $2,$25,$0 + sll $25,1 + multu $11,$11 # mul_add_c(a[7],b[7],c3,c1,c2); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,13*4($4) + + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sw $7,14*4($4) + sw $2,15*4($4) + + .set noreorder + jr $31 + nop +.end bn_sqr_comba8 + +.align 5 +.globl bn_sqr_comba4 +.ent bn_sqr_comba4 +bn_sqr_comba4: + .set reorder + lw $12,0($5) + lw $13,4($5) + multu $12,$12 # mul_add_c(a[0],b[0],c1,c2,c3); + lw $14,2*4($5) + lw $15,3*4($5) + mflo $2 + mfhi $3 + sw $2,0($4) + + multu $12,$13 # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo $24 + mfhi $25 + slt $2,$25,$0 + sll $25,1 + multu $14,$12 # mul_add_c2(a[2],b[0],c3,c1,c2); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $7,$25,$1 + sw $3,4($4) + + mflo $24 + mfhi $25 + slt $3,$25,$0 + sll $25,1 + multu $13,$13 # mul_add_c(a[1],b[1],c3,c1,c2); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + mflo $24 + mfhi $25 + addu $7,$24 + sltu $1,$7,$24 + multu $12,$15 # mul_add_c2(a[0],b[3],c1,c2,c3); + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,2*4($4) + + mflo $24 + mfhi $25 + slt $7,$25,$0 + sll $25,1 + multu $13,$14 # mul_add_c(a2[1],b[2],c1,c2,c3); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + mflo $24 + mfhi $25 + slt $1,$25,$0 + addu $7,$1 + multu $15,$13 # mul_add_c2(a[3],b[1],c2,c3,c1); + sll $25,1 + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sltu $1,$3,$25 + addu $7,$1 + sw $2,3*4($4) + + mflo $24 + mfhi $25 + slt $2,$25,$0 + sll $25,1 + multu $14,$14 # mul_add_c(a[2],b[2],c2,c3,c1); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $3,$24 + sltu $1,$3,$24 + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + mflo $24 + mfhi $25 + addu $3,$24 + sltu $1,$3,$24 + multu $14,$15 # mul_add_c2(a[2],b[3],c3,c1,c2); + addu $25,$1 + addu $7,$25 + sltu $1,$7,$25 + addu $2,$1 + sw $3,4*4($4) + + mflo $24 + mfhi $25 + slt $3,$25,$0 + sll $25,1 + multu $15,$15 # mul_add_c(a[3],b[3],c1,c2,c3); + slt $6,$24,$0 + addu $25,$6 + sll $24,1 + addu $7,$24 + sltu $1,$7,$24 + addu $25,$1 + addu $2,$25 + sltu $1,$2,$25 + addu $3,$1 + sw $7,5*4($4) + + mflo $24 + mfhi $25 + addu $2,$24 + sltu $1,$2,$24 + addu $25,$1 + addu $3,$25 + sw $2,6*4($4) + sw $3,7*4($4) + + .set noreorder + jr $31 + nop +.end bn_sqr_comba4 diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl new file mode 100644 index 0000000..b944a12 --- /dev/null +++ b/crypto/bn/asm/mips-mont.pl @@ -0,0 +1,426 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# This module doesn't present direct interest for OpenSSL, because it +# doesn't provide better performance for longer keys, at least not on +# in-order-execution cores. While 512-bit RSA sign operations can be +# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and +# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from +# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA +# verify:-( All comparisons are against bn_mul_mont-free assembler. +# The module might be of interest to embedded system developers, as +# the code is smaller than 1KB, yet offers >3x improvement on MIPS64 +# and 75-30% [less for longer keys] on MIPS32 over compiler-generated +# code. + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $SZREG=4; +} +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; +# +# <appro@openssl.org> +# +###################################################################### + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +if ($flavour =~ /64|n32/i) { + $LD="ld"; + $ST="sd"; + $MULTU="dmultu"; + $ADDU="daddu"; + $SUBU="dsubu"; + $BNSZ=8; +} else { + $LD="lw"; + $ST="sw"; + $MULTU="multu"; + $ADDU="addu"; + $SUBU="subu"; + $BNSZ=4; +} + +# int bn_mul_mont( +$rp=$a0; # BN_ULONG *rp, +$ap=$a1; # const BN_ULONG *ap, +$bp=$a2; # const BN_ULONG *bp, +$np=$a3; # const BN_ULONG *np, +$n0=$a4; # const BN_ULONG *n0, +$num=$a5; # int num); + +$lo0=$a6; +$hi0=$a7; +$lo1=$t1; +$hi1=$t2; +$aj=$s0; +$bi=$s1; +$nj=$s2; +$tp=$s3; +$alo=$s4; +$ahi=$s5; +$nlo=$s6; +$nhi=$s7; +$tj=$s8; +$i=$s9; +$j=$s10; +$m1=$s11; + +$FRAMESIZE=14; + +$code=<<___; +.text + +.set noat +.set noreorder + +.align 5 +.globl bn_mul_mont +.ent bn_mul_mont +bn_mul_mont: +___ +$code.=<<___ if ($flavour =~ /o32/i); + lw $n0,16($sp) + lw $num,20($sp) +___ +$code.=<<___; + slt $at,$num,4 + bnez $at,1f + li $t0,0 + slt $at,$num,17 # on in-order CPU + bnezl $at,bn_mul_mont_internal + nop +1: jr $ra + li $a0,0 +.end bn_mul_mont + +.align 5 +.ent bn_mul_mont_internal +bn_mul_mont_internal: + .frame $fp,$FRAMESIZE*$SZREG,$ra + .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG + $PTR_SUB $sp,$FRAMESIZE*$SZREG + $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) + $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) + $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp) + $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp) + $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp) + $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp) + $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp) + $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp) + $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp) + $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp) + $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp) + $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp) +___ +$code.=<<___; + move $fp,$sp + + .set reorder + $LD $n0,0($n0) + $LD $bi,0($bp) # bp[0] + $LD $aj,0($ap) # ap[0] + $LD $nj,0($np) # np[0] + + $PTR_SUB $sp,2*$BNSZ # place for two extra words + sll $num,`log($BNSZ)/log(2)` + li $at,-4096 + $PTR_SUB $sp,$num + and $sp,$at + + $MULTU $aj,$bi + $LD $alo,$BNSZ($ap) + $LD $nlo,$BNSZ($np) + mflo $lo0 + mfhi $hi0 + $MULTU $lo0,$n0 + mflo $m1 + + $MULTU $alo,$bi + mflo $alo + mfhi $ahi + + $MULTU $nj,$m1 + mflo $lo1 + mfhi $hi1 + $MULTU $nlo,$m1 + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $ADDU $hi1,$at + mflo $nlo + mfhi $nhi + + move $tp,$sp + li $j,2*$BNSZ +.align 4 +.L1st: + .set noreorder + $PTR_ADD $aj,$ap,$j + $PTR_ADD $nj,$np,$j + $LD $aj,($aj) + $LD $nj,($nj) + + $MULTU $aj,$bi + $ADDU $lo0,$alo,$hi0 + $ADDU $lo1,$nlo,$hi1 + sltu $at,$lo0,$hi0 + sltu $t0,$lo1,$hi1 + $ADDU $hi0,$ahi,$at + $ADDU $hi1,$nhi,$t0 + mflo $alo + mfhi $ahi + + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $MULTU $nj,$m1 + $ADDU $hi1,$at + addu $j,$BNSZ + $ST $lo1,($tp) + sltu $t0,$j,$num + mflo $nlo + mfhi $nhi + + bnez $t0,.L1st + $PTR_ADD $tp,$BNSZ + .set reorder + + $ADDU $lo0,$alo,$hi0 + sltu $at,$lo0,$hi0 + $ADDU $hi0,$ahi,$at + + $ADDU $lo1,$nlo,$hi1 + sltu $t0,$lo1,$hi1 + $ADDU $hi1,$nhi,$t0 + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $ADDU $hi1,$at + + $ST $lo1,($tp) + + $ADDU $hi1,$hi0 + sltu $at,$hi1,$hi0 + $ST $hi1,$BNSZ($tp) + $ST $at,2*$BNSZ($tp) + + li $i,$BNSZ +.align 4 +.Louter: + $PTR_ADD $bi,$bp,$i + $LD $bi,($bi) + $LD $aj,($ap) + $LD $alo,$BNSZ($ap) + $LD $tj,($sp) + + $MULTU $aj,$bi + $LD $nj,($np) + $LD $nlo,$BNSZ($np) + mflo $lo0 + mfhi $hi0 + $ADDU $lo0,$tj + $MULTU $lo0,$n0 + sltu $at,$lo0,$tj + $ADDU $hi0,$at + mflo $m1 + + $MULTU $alo,$bi + mflo $alo + mfhi $ahi + + $MULTU $nj,$m1 + mflo $lo1 + mfhi $hi1 + + $MULTU $nlo,$m1 + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $ADDU $hi1,$at + mflo $nlo + mfhi $nhi + + move $tp,$sp + li $j,2*$BNSZ + $LD $tj,$BNSZ($tp) +.align 4 +.Linner: + .set noreorder + $PTR_ADD $aj,$ap,$j + $PTR_ADD $nj,$np,$j + $LD $aj,($aj) + $LD $nj,($nj) + + $MULTU $aj,$bi + $ADDU $lo0,$alo,$hi0 + $ADDU $lo1,$nlo,$hi1 + sltu $at,$lo0,$hi0 + sltu $t0,$lo1,$hi1 + $ADDU $hi0,$ahi,$at + $ADDU $hi1,$nhi,$t0 + mflo $alo + mfhi $ahi + + $ADDU $lo0,$tj + addu $j,$BNSZ + $MULTU $nj,$m1 + sltu $at,$lo0,$tj + $ADDU $lo1,$lo0 + $ADDU $hi0,$at + sltu $t0,$lo1,$lo0 + $LD $tj,2*$BNSZ($tp) + $ADDU $hi1,$t0 + sltu $at,$j,$num + mflo $nlo + mfhi $nhi + $ST $lo1,($tp) + bnez $at,.Linner + $PTR_ADD $tp,$BNSZ + .set reorder + + $ADDU $lo0,$alo,$hi0 + sltu $at,$lo0,$hi0 + $ADDU $hi0,$ahi,$at + $ADDU $lo0,$tj + sltu $t0,$lo0,$tj + $ADDU $hi0,$t0 + + $LD $tj,2*$BNSZ($tp) + $ADDU $lo1,$nlo,$hi1 + sltu $at,$lo1,$hi1 + $ADDU $hi1,$nhi,$at + $ADDU $lo1,$lo0 + sltu $t0,$lo1,$lo0 + $ADDU $hi1,$t0 + $ST $lo1,($tp) + + $ADDU $lo1,$hi1,$hi0 + sltu $hi1,$lo1,$hi0 + $ADDU $lo1,$tj + sltu $at,$lo1,$tj + $ADDU $hi1,$at + $ST $lo1,$BNSZ($tp) + $ST $hi1,2*$BNSZ($tp) + + addu $i,$BNSZ + sltu $t0,$i,$num + bnez $t0,.Louter + + .set noreorder + $PTR_ADD $tj,$sp,$num # &tp[num] + move $tp,$sp + move $ap,$sp + li $hi0,0 # clear borrow bit + +.align 4 +.Lsub: $LD $lo0,($tp) + $LD $lo1,($np) + $PTR_ADD $tp,$BNSZ + $PTR_ADD $np,$BNSZ + $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i] + sgtu $at,$lo1,$lo0 + $SUBU $lo0,$lo1,$hi0 + sgtu $hi0,$lo0,$lo1 + $ST $lo0,($rp) + or $hi0,$at + sltu $at,$tp,$tj + bnez $at,.Lsub + $PTR_ADD $rp,$BNSZ + + $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit + move $tp,$sp + $PTR_SUB $rp,$num # restore rp + not $hi1,$hi0 + + and $ap,$hi0,$sp + and $bp,$hi1,$rp + or $ap,$ap,$bp # ap=borrow?tp:rp + +.align 4 +.Lcopy: $LD $aj,($ap) + $PTR_ADD $ap,$BNSZ + $ST $zero,($tp) + $PTR_ADD $tp,$BNSZ + sltu $at,$tp,$tj + $ST $aj,($rp) + bnez $at,.Lcopy + $PTR_ADD $rp,$BNSZ + + li $a0,1 + li $t0,1 + + .set noreorder + move $sp,$fp + $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) + $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) + $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp) + $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp) + $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp) + $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp) + $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp) + $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp) + $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp) + $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp) + $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp) + $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE*$SZREG +.end bn_mul_mont_internal +.rdata +.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; +close STDOUT; diff --git a/crypto/bn/asm/mips-mont.s b/crypto/bn/asm/mips-mont.s new file mode 100644 index 0000000..867de6f --- /dev/null +++ b/crypto/bn/asm/mips-mont.s @@ -0,0 +1,282 @@ +.text + +.set noat +.set noreorder + +.align 5 +.globl bn_mul_mont +.ent bn_mul_mont +bn_mul_mont: + slt $1,$9,4 + bnez $1,1f + li $2,0 + slt $1,$9,17 # on in-order CPU + bnezl $1,bn_mul_mont_internal + nop +1: jr $31 + li $4,0 +.end bn_mul_mont + +.align 5 +.ent bn_mul_mont_internal +bn_mul_mont_internal: + .frame $30,14*4,$31 + .mask 0x40000000|16711680,-4 + sub $29,14*4 + sw $30,(14-1)*4($29) + sw $23,(14-2)*4($29) + sw $22,(14-3)*4($29) + sw $21,(14-4)*4($29) + sw $20,(14-5)*4($29) + sw $19,(14-6)*4($29) + sw $18,(14-7)*4($29) + sw $17,(14-8)*4($29) + sw $16,(14-9)*4($29) + move $30,$29 + + .set reorder + lw $8,0($8) + lw $13,0($6) # bp[0] + lw $12,0($5) # ap[0] + lw $14,0($7) # np[0] + + sub $29,2*4 # place for two extra words + sll $9,2 + li $1,-4096 + sub $29,$9 + and $29,$1 + + multu $12,$13 + lw $16,4($5) + lw $18,4($7) + mflo $10 + mfhi $11 + multu $10,$8 + mflo $23 + + multu $16,$13 + mflo $16 + mfhi $17 + + multu $14,$23 + mflo $24 + mfhi $25 + multu $18,$23 + addu $24,$10 + sltu $1,$24,$10 + addu $25,$1 + mflo $18 + mfhi $19 + + move $15,$29 + li $22,2*4 +.align 4 +.L1st: + .set noreorder + add $12,$5,$22 + add $14,$7,$22 + lw $12,($12) + lw $14,($14) + + multu $12,$13 + addu $10,$16,$11 + addu $24,$18,$25 + sltu $1,$10,$11 + sltu $2,$24,$25 + addu $11,$17,$1 + addu $25,$19,$2 + mflo $16 + mfhi $17 + + addu $24,$10 + sltu $1,$24,$10 + multu $14,$23 + addu $25,$1 + addu $22,4 + sw $24,($15) + sltu $2,$22,$9 + mflo $18 + mfhi $19 + + bnez $2,.L1st + add $15,4 + .set reorder + + addu $10,$16,$11 + sltu $1,$10,$11 + addu $11,$17,$1 + + addu $24,$18,$25 + sltu $2,$24,$25 + addu $25,$19,$2 + addu $24,$10 + sltu $1,$24,$10 + addu $25,$1 + + sw $24,($15) + + addu $25,$11 + sltu $1,$25,$11 + sw $25,4($15) + sw $1,2*4($15) + + li $21,4 +.align 4 +.Louter: + add $13,$6,$21 + lw $13,($13) + lw $12,($5) + lw $16,4($5) + lw $20,($29) + + multu $12,$13 + lw $14,($7) + lw $18,4($7) + mflo $10 + mfhi $11 + addu $10,$20 + multu $10,$8 + sltu $1,$10,$20 + addu $11,$1 + mflo $23 + + multu $16,$13 + mflo $16 + mfhi $17 + + multu $14,$23 + mflo $24 + mfhi $25 + + multu $18,$23 + addu $24,$10 + sltu $1,$24,$10 + addu $25,$1 + mflo $18 + mfhi $19 + + move $15,$29 + li $22,2*4 + lw $20,4($15) +.align 4 +.Linner: + .set noreorder + add $12,$5,$22 + add $14,$7,$22 + lw $12,($12) + lw $14,($14) + + multu $12,$13 + addu $10,$16,$11 + addu $24,$18,$25 + sltu $1,$10,$11 + sltu $2,$24,$25 + addu $11,$17,$1 + addu $25,$19,$2 + mflo $16 + mfhi $17 + + addu $10,$20 + addu $22,4 + multu $14,$23 + sltu $1,$10,$20 + addu $24,$10 + addu $11,$1 + sltu $2,$24,$10 + lw $20,2*4($15) + addu $25,$2 + sltu $1,$22,$9 + mflo $18 + mfhi $19 + sw $24,($15) + bnez $1,.Linner + add $15,4 + .set reorder + + addu $10,$16,$11 + sltu $1,$10,$11 + addu $11,$17,$1 + addu $10,$20 + sltu $2,$10,$20 + addu $11,$2 + + lw $20,2*4($15) + addu $24,$18,$25 + sltu $1,$24,$25 + addu $25,$19,$1 + addu $24,$10 + sltu $2,$24,$10 + addu $25,$2 + sw $24,($15) + + addu $24,$25,$11 + sltu $25,$24,$11 + addu $24,$20 + sltu $1,$24,$20 + addu $25,$1 + sw $24,4($15) + sw $25,2*4($15) + + addu $21,4 + sltu $2,$21,$9 + bnez $2,.Louter + + .set noreorder + add $20,$29,$9 # &tp[num] + move $15,$29 + move $5,$29 + li $11,0 # clear borrow bit + +.align 4 +.Lsub: lw $10,($15) + lw $24,($7) + add $15,4 + add $7,4 + subu $24,$10,$24 # tp[i]-np[i] + sgtu $1,$24,$10 + subu $10,$24,$11 + sgtu $11,$10,$24 + sw $10,($4) + or $11,$1 + sltu $1,$15,$20 + bnez $1,.Lsub + add $4,4 + + subu $11,$25,$11 # handle upmost overflow bit + move $15,$29 + sub $4,$9 # restore rp + not $25,$11 + + and $5,$11,$29 + and $6,$25,$4 + or $5,$5,$6 # ap=borrow?tp:rp + +.align 4 +.Lcopy: lw $12,($5) + add $5,4 + sw $0,($15) + add $15,4 + sltu $1,$15,$20 + sw $12,($4) + bnez $1,.Lcopy + add $4,4 + + li $4,1 + li $2,1 + + .set noreorder + move $29,$30 + lw $30,(14-1)*4($29) + lw $23,(14-2)*4($29) + lw $22,(14-3)*4($29) + lw $21,(14-4)*4($29) + lw $20,(14-5)*4($29) + lw $19,(14-6)*4($29) + lw $18,(14-7)*4($29) + lw $17,(14-8)*4($29) + lw $16,(14-9)*4($29) + jr $31 + add $29,14*4 +.end bn_mul_mont_internal +.rdata +.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro@openssl.org>" diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl new file mode 100644 index 0000000..f04b3b9 --- /dev/null +++ b/crypto/bn/asm/mips.pl @@ -0,0 +1,2585 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. +# +# Rights for redistribution and usage in source and binary forms are +# granted according to the OpenSSL license. Warranty of any kind is +# disclaimed. +# ==================================================================== + + +# July 1999 +# +# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. +# +# The module is designed to work with either of the "new" MIPS ABI(5), +# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under +# IRIX 5.x not only because it doesn't support new ABIs but also +# because 5.x kernels put R4x00 CPU into 32-bit mode and all those +# 64-bit instructions (daddu, dmultu, etc.) found below gonna only +# cause illegal instruction exception:-( +# +# In addition the code depends on preprocessor flags set up by MIPSpro +# compiler driver (either as or cc) and therefore (probably?) can't be +# compiled by the GNU assembler. GNU C driver manages fine though... +# I mean as long as -mmips-as is specified or is the default option, +# because then it simply invokes /usr/bin/as which in turn takes +# perfect care of the preprocessor definitions. Another neat feature +# offered by the MIPSpro assembler is an optimization pass. This gave +# me the opportunity to have the code looking more regular as all those +# architecture dependent instruction rescheduling details were left to +# the assembler. Cool, huh? +# +# Performance improvement is astonishing! 'apps/openssl speed rsa dsa' +# goes way over 3 times faster! +# +# <appro@fy.chalmers.se> + +# October 2010 +# +# Adapt the module even for 32-bit ABIs and other OSes. The former was +# achieved by mechanical replacement of 64-bit arithmetic instructions +# such as dmultu, daddu, etc. with their 32-bit counterparts and +# adjusting offsets denoting multiples of BN_ULONG. Above mentioned +# >3x performance improvement naturally does not apply to 32-bit code +# [because there is no instruction 32-bit compiler can't use], one +# has to content with 40-85% improvement depending on benchmark and +# key length, more for longer keys. + +$flavour = shift; +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +if ($flavour =~ /64|n32/i) { + $LD="ld"; + $ST="sd"; + $MULTU="dmultu"; + $DIVU="ddivu"; + $ADDU="daddu"; + $SUBU="dsubu"; + $SRL="dsrl"; + $SLL="dsll"; + $BNSZ=8; + $PTR_ADD="daddu"; + $PTR_SUB="dsubu"; + $SZREG=8; + $REG_S="sd"; + $REG_L="ld"; +} else { + $LD="lw"; + $ST="sw"; + $MULTU="multu"; + $DIVU="divu"; + $ADDU="addu"; + $SUBU="subu"; + $SRL="srl"; + $SLL="sll"; + $BNSZ=4; + $PTR_ADD="addu"; + $PTR_SUB="subu"; + $SZREG=4; + $REG_S="sw"; + $REG_L="lw"; + $code=".set mips2\n"; +} + +# Below is N32/64 register layout used in the original module. +# +($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); +# +# No special adaptation is required for O32. NUBI on the other hand +# is treated by saving/restoring ($v1,$t0..$t3). + +$gp=$v1 if ($flavour =~ /nubi/i); + +$minus4=$v1; + +$code.=<<___; +.rdata +.asciiz "mips3.s, Version 1.2" +.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" + +.text +.set noat + +.align 5 +.globl bn_mul_add_words +.ent bn_mul_add_words +bn_mul_add_words: + .set noreorder + bgtz $a2,bn_mul_add_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_mul_add_words + +.align 5 +.ent bn_mul_add_words_internal +bn_mul_add_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $ta0,$a2,$minus4 + $LD $t0,0($a1) + beqz $ta0,.L_bn_mul_add_words_tail + +.L_bn_mul_add_words_loop: + $MULTU $t0,$a3 + $LD $t1,0($a0) + $LD $t2,$BNSZ($a1) + $LD $t3,$BNSZ($a0) + $LD $ta0,2*$BNSZ($a1) + $LD $ta1,2*$BNSZ($a0) + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit + # values", but it seems to work fine + # even on 64-bit registers. + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + $MULTU $t2,$a3 + sltu $at,$t1,$at + $ST $t1,0($a0) + $ADDU $v0,$at + + $LD $ta2,3*$BNSZ($a1) + $LD $ta3,3*$BNSZ($a0) + $ADDU $t3,$v0 + sltu $v0,$t3,$v0 + mflo $at + mfhi $t2 + $ADDU $t3,$at + $ADDU $v0,$t2 + $MULTU $ta0,$a3 + sltu $at,$t3,$at + $ST $t3,$BNSZ($a0) + $ADDU $v0,$at + + subu $a2,4 + $PTR_ADD $a0,4*$BNSZ + $PTR_ADD $a1,4*$BNSZ + $ADDU $ta1,$v0 + sltu $v0,$ta1,$v0 + mflo $at + mfhi $ta0 + $ADDU $ta1,$at + $ADDU $v0,$ta0 + $MULTU $ta2,$a3 + sltu $at,$ta1,$at + $ST $ta1,-2*$BNSZ($a0) + $ADDU $v0,$at + + + and $ta0,$a2,$minus4 + $ADDU $ta3,$v0 + sltu $v0,$ta3,$v0 + mflo $at + mfhi $ta2 + $ADDU $ta3,$at + $ADDU $v0,$ta2 + sltu $at,$ta3,$at + $ST $ta3,-$BNSZ($a0) + $ADDU $v0,$at + .set noreorder + bgtzl $ta0,.L_bn_mul_add_words_loop + $LD $t0,0($a1) + + beqz $a2,.L_bn_mul_add_words_return + nop + +.L_bn_mul_add_words_tail: + .set reorder + $LD $t0,0($a1) + $MULTU $t0,$a3 + $LD $t1,0($a0) + subu $a2,1 + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + sltu $at,$t1,$at + $ST $t1,0($a0) + $ADDU $v0,$at + beqz $a2,.L_bn_mul_add_words_return + + $LD $t0,$BNSZ($a1) + $MULTU $t0,$a3 + $LD $t1,$BNSZ($a0) + subu $a2,1 + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + sltu $at,$t1,$at + $ST $t1,$BNSZ($a0) + $ADDU $v0,$at + beqz $a2,.L_bn_mul_add_words_return + + $LD $t0,2*$BNSZ($a1) + $MULTU $t0,$a3 + $LD $t1,2*$BNSZ($a0) + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + sltu $at,$t1,$at + $ST $t1,2*$BNSZ($a0) + $ADDU $v0,$at + +.L_bn_mul_add_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_mul_add_words_internal + +.align 5 +.globl bn_mul_words +.ent bn_mul_words +bn_mul_words: + .set noreorder + bgtz $a2,bn_mul_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_mul_words + +.align 5 +.ent bn_mul_words_internal +bn_mul_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $ta0,$a2,$minus4 + $LD $t0,0($a1) + beqz $ta0,.L_bn_mul_words_tail + +.L_bn_mul_words_loop: + $MULTU $t0,$a3 + $LD $t2,$BNSZ($a1) + $LD $ta0,2*$BNSZ($a1) + $LD $ta2,3*$BNSZ($a1) + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $MULTU $t2,$a3 + $ST $v0,0($a0) + $ADDU $v0,$t1,$t0 + + subu $a2,4 + $PTR_ADD $a0,4*$BNSZ + $PTR_ADD $a1,4*$BNSZ + mflo $at + mfhi $t2 + $ADDU $v0,$at + sltu $t3,$v0,$at + $MULTU $ta0,$a3 + $ST $v0,-3*$BNSZ($a0) + $ADDU $v0,$t3,$t2 + + mflo $at + mfhi $ta0 + $ADDU $v0,$at + sltu $ta1,$v0,$at + $MULTU $ta2,$a3 + $ST $v0,-2*$BNSZ($a0) + $ADDU $v0,$ta1,$ta0 + + and $ta0,$a2,$minus4 + mflo $at + mfhi $ta2 + $ADDU $v0,$at + sltu $ta3,$v0,$at + $ST $v0,-$BNSZ($a0) + $ADDU $v0,$ta3,$ta2 + .set noreorder + bgtzl $ta0,.L_bn_mul_words_loop + $LD $t0,0($a1) + + beqz $a2,.L_bn_mul_words_return + nop + +.L_bn_mul_words_tail: + .set reorder + $LD $t0,0($a1) + $MULTU $t0,$a3 + subu $a2,1 + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $ST $v0,0($a0) + $ADDU $v0,$t1,$t0 + beqz $a2,.L_bn_mul_words_return + + $LD $t0,$BNSZ($a1) + $MULTU $t0,$a3 + subu $a2,1 + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $ST $v0,$BNSZ($a0) + $ADDU $v0,$t1,$t0 + beqz $a2,.L_bn_mul_words_return + + $LD $t0,2*$BNSZ($a1) + $MULTU $t0,$a3 + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $ST $v0,2*$BNSZ($a0) + $ADDU $v0,$t1,$t0 + +.L_bn_mul_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_mul_words_internal + +.align 5 +.globl bn_sqr_words +.ent bn_sqr_words +bn_sqr_words: + .set noreorder + bgtz $a2,bn_sqr_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_sqr_words + +.align 5 +.ent bn_sqr_words_internal +bn_sqr_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $ta0,$a2,$minus4 + $LD $t0,0($a1) + beqz $ta0,.L_bn_sqr_words_tail + +.L_bn_sqr_words_loop: + $MULTU $t0,$t0 + $LD $t2,$BNSZ($a1) + $LD $ta0,2*$BNSZ($a1) + $LD $ta2,3*$BNSZ($a1) + mflo $t1 + mfhi $t0 + $ST $t1,0($a0) + $ST $t0,$BNSZ($a0) + + $MULTU $t2,$t2 + subu $a2,4 + $PTR_ADD $a0,8*$BNSZ + $PTR_ADD $a1,4*$BNSZ + mflo $t3 + mfhi $t2 + $ST $t3,-6*$BNSZ($a0) + $ST $t2,-5*$BNSZ($a0) + + $MULTU $ta0,$ta0 + mflo $ta1 + mfhi $ta0 + $ST $ta1,-4*$BNSZ($a0) + $ST $ta0,-3*$BNSZ($a0) + + + $MULTU $ta2,$ta2 + and $ta0,$a2,$minus4 + mflo $ta3 + mfhi $ta2 + $ST $ta3,-2*$BNSZ($a0) + $ST $ta2,-$BNSZ($a0) + + .set noreorder + bgtzl $ta0,.L_bn_sqr_words_loop + $LD $t0,0($a1) + + beqz $a2,.L_bn_sqr_words_return + nop + +.L_bn_sqr_words_tail: + .set reorder + $LD $t0,0($a1) + $MULTU $t0,$t0 + subu $a2,1 + mflo $t1 + mfhi $t0 + $ST $t1,0($a0) + $ST $t0,$BNSZ($a0) + beqz $a2,.L_bn_sqr_words_return + + $LD $t0,$BNSZ($a1) + $MULTU $t0,$t0 + subu $a2,1 + mflo $t1 + mfhi $t0 + $ST $t1,2*$BNSZ($a0) + $ST $t0,3*$BNSZ($a0) + beqz $a2,.L_bn_sqr_words_return + + $LD $t0,2*$BNSZ($a1) + $MULTU $t0,$t0 + mflo $t1 + mfhi $t0 + $ST $t1,4*$BNSZ($a0) + $ST $t0,5*$BNSZ($a0) + +.L_bn_sqr_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 + +.end bn_sqr_words_internal + +.align 5 +.globl bn_add_words +.ent bn_add_words +bn_add_words: + .set noreorder + bgtz $a3,bn_add_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_add_words + +.align 5 +.ent bn_add_words_internal +bn_add_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $at,$a3,$minus4 + $LD $t0,0($a1) + beqz $at,.L_bn_add_words_tail + +.L_bn_add_words_loop: + $LD $ta0,0($a2) + subu $a3,4 + $LD $t1,$BNSZ($a1) + and $at,$a3,$minus4 + $LD $t2,2*$BNSZ($a1) + $PTR_ADD $a2,4*$BNSZ + $LD $t3,3*$BNSZ($a1) + $PTR_ADD $a0,4*$BNSZ + $LD $ta1,-3*$BNSZ($a2) + $PTR_ADD $a1,4*$BNSZ + $LD $ta2,-2*$BNSZ($a2) + $LD $ta3,-$BNSZ($a2) + $ADDU $ta0,$t0 + sltu $t8,$ta0,$t0 + $ADDU $t0,$ta0,$v0 + sltu $v0,$t0,$ta0 + $ST $t0,-4*$BNSZ($a0) + $ADDU $v0,$t8 + + $ADDU $ta1,$t1 + sltu $t9,$ta1,$t1 + $ADDU $t1,$ta1,$v0 + sltu $v0,$t1,$ta1 + $ST $t1,-3*$BNSZ($a0) + $ADDU $v0,$t9 + + $ADDU $ta2,$t2 + sltu $t8,$ta2,$t2 + $ADDU $t2,$ta2,$v0 + sltu $v0,$t2,$ta2 + $ST $t2,-2*$BNSZ($a0) + $ADDU $v0,$t8 + + $ADDU $ta3,$t3 + sltu $t9,$ta3,$t3 + $ADDU $t3,$ta3,$v0 + sltu $v0,$t3,$ta3 + $ST $t3,-$BNSZ($a0) + $ADDU $v0,$t9 + + .set noreorder + bgtzl $at,.L_bn_add_words_loop + $LD $t0,0($a1) + + beqz $a3,.L_bn_add_words_return + nop + +.L_bn_add_words_tail: + .set reorder + $LD $t0,0($a1) + $LD $ta0,0($a2) + $ADDU $ta0,$t0 + subu $a3,1 + sltu $t8,$ta0,$t0 + $ADDU $t0,$ta0,$v0 + sltu $v0,$t0,$ta0 + $ST $t0,0($a0) + $ADDU $v0,$t8 + beqz $a3,.L_bn_add_words_return + + $LD $t1,$BNSZ($a1) + $LD $ta1,$BNSZ($a2) + $ADDU $ta1,$t1 + subu $a3,1 + sltu $t9,$ta1,$t1 + $ADDU $t1,$ta1,$v0 + sltu $v0,$t1,$ta1 + $ST $t1,$BNSZ($a0) + $ADDU $v0,$t9 + beqz $a3,.L_bn_add_words_return + + $LD $t2,2*$BNSZ($a1) + $LD $ta2,2*$BNSZ($a2) + $ADDU $ta2,$t2 + sltu $t8,$ta2,$t2 + $ADDU $t2,$ta2,$v0 + sltu $v0,$t2,$ta2 + $ST $t2,2*$BNSZ($a0) + $ADDU $v0,$t8 + +.L_bn_add_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 + +.end bn_add_words_internal + +.align 5 +.globl bn_sub_words +.ent bn_sub_words +bn_sub_words: + .set noreorder + bgtz $a3,bn_sub_words_internal + move $v0,$zero + jr $ra + move $a0,$zero +.end bn_sub_words + +.align 5 +.ent bn_sub_words_internal +bn_sub_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $at,$a3,$minus4 + $LD $t0,0($a1) + beqz $at,.L_bn_sub_words_tail + +.L_bn_sub_words_loop: + $LD $ta0,0($a2) + subu $a3,4 + $LD $t1,$BNSZ($a1) + and $at,$a3,$minus4 + $LD $t2,2*$BNSZ($a1) + $PTR_ADD $a2,4*$BNSZ + $LD $t3,3*$BNSZ($a1) + $PTR_ADD $a0,4*$BNSZ + $LD $ta1,-3*$BNSZ($a2) + $PTR_ADD $a1,4*$BNSZ + $LD $ta2,-2*$BNSZ($a2) + $LD $ta3,-$BNSZ($a2) + sltu $t8,$t0,$ta0 + $SUBU $ta0,$t0,$ta0 + $SUBU $t0,$ta0,$v0 + sgtu $v0,$t0,$ta0 + $ST $t0,-4*$BNSZ($a0) + $ADDU $v0,$t8 + + sltu $t9,$t1,$ta1 + $SUBU $ta1,$t1,$ta1 + $SUBU $t1,$ta1,$v0 + sgtu $v0,$t1,$ta1 + $ST $t1,-3*$BNSZ($a0) + $ADDU $v0,$t9 + + + sltu $t8,$t2,$ta2 + $SUBU $ta2,$t2,$ta2 + $SUBU $t2,$ta2,$v0 + sgtu $v0,$t2,$ta2 + $ST $t2,-2*$BNSZ($a0) + $ADDU $v0,$t8 + + sltu $t9,$t3,$ta3 + $SUBU $ta3,$t3,$ta3 + $SUBU $t3,$ta3,$v0 + sgtu $v0,$t3,$ta3 + $ST $t3,-$BNSZ($a0) + $ADDU $v0,$t9 + + .set noreorder + bgtzl $at,.L_bn_sub_words_loop + $LD $t0,0($a1) + + beqz $a3,.L_bn_sub_words_return + nop + +.L_bn_sub_words_tail: + .set reorder + $LD $t0,0($a1) + $LD $ta0,0($a2) + subu $a3,1 + sltu $t8,$t0,$ta0 + $SUBU $ta0,$t0,$ta0 + $SUBU $t0,$ta0,$v0 + sgtu $v0,$t0,$ta0 + $ST $t0,0($a0) + $ADDU $v0,$t8 + beqz $a3,.L_bn_sub_words_return + + $LD $t1,$BNSZ($a1) + subu $a3,1 + $LD $ta1,$BNSZ($a2) + sltu $t9,$t1,$ta1 + $SUBU $ta1,$t1,$ta1 + $SUBU $t1,$ta1,$v0 + sgtu $v0,$t1,$ta1 + $ST $t1,$BNSZ($a0) + $ADDU $v0,$t9 + beqz $a3,.L_bn_sub_words_return + + $LD $t2,2*$BNSZ($a1) + $LD $ta2,2*$BNSZ($a2) + sltu $t8,$t2,$ta2 + $SUBU $ta2,$t2,$ta2 + $SUBU $t2,$ta2,$v0 + sgtu $v0,$t2,$ta2 + $ST $t2,2*$BNSZ($a0) + $ADDU $v0,$t8 + +.L_bn_sub_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_sub_words_internal + +.align 5 +.globl bn_div_3_words +.ent bn_div_3_words +bn_div_3_words: + .set noreorder + move $a3,$a0 # we know that bn_div_words does not + # touch $a3, $ta2, $ta3 and preserves $a2 + # so that we can save two arguments + # and return address in registers + # instead of stack:-) + + $LD $a0,($a3) + move $ta2,$a1 + bne $a0,$a2,bn_div_3_words_internal + $LD $a1,-$BNSZ($a3) + li $v0,-1 + jr $ra + move $a0,$v0 +.end bn_div_3_words + +.align 5 +.ent bn_div_3_words_internal +bn_div_3_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + move $ta3,$ra + bal bn_div_words + move $ra,$ta3 + $MULTU $ta2,$v0 + $LD $t2,-2*$BNSZ($a3) + move $ta0,$zero + mfhi $t1 + mflo $t0 + sltu $t8,$t1,$a1 +.L_bn_div_3_words_inner_loop: + bnez $t8,.L_bn_div_3_words_inner_loop_done + sgeu $at,$t2,$t0 + seq $t9,$t1,$a1 + and $at,$t9 + sltu $t3,$t0,$ta2 + $ADDU $a1,$a2 + $SUBU $t1,$t3 + $SUBU $t0,$ta2 + sltu $t8,$t1,$a1 + sltu $ta0,$a1,$a2 + or $t8,$ta0 + .set noreorder + beqzl $at,.L_bn_div_3_words_inner_loop + $SUBU $v0,1 + .set reorder +.L_bn_div_3_words_inner_loop_done: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_div_3_words_internal + +.align 5 +.globl bn_div_words +.ent bn_div_words +bn_div_words: + .set noreorder + bnez $a2,bn_div_words_internal + li $v0,-1 # I would rather signal div-by-zero + # which can be done with 'break 7' + jr $ra + move $a0,$v0 +.end bn_div_words + +.align 5 +.ent bn_div_words_internal +bn_div_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + move $v1,$zero + bltz $a2,.L_bn_div_words_body + move $t9,$v1 + $SLL $a2,1 + bgtz $a2,.-4 + addu $t9,1 + + .set reorder + negu $t1,$t9 + li $t2,-1 + $SLL $t2,$t1 + and $t2,$a0 + $SRL $at,$a1,$t1 + .set noreorder + bnezl $t2,.+8 + break 6 # signal overflow + .set reorder + $SLL $a0,$t9 + $SLL $a1,$t9 + or $a0,$at +___ +$QT=$ta0; +$HH=$ta1; +$DH=$v1; +$code.=<<___; +.L_bn_div_words_body: + $SRL $DH,$a2,4*$BNSZ # bits + sgeu $at,$a0,$a2 + .set noreorder + bnezl $at,.+8 + $SUBU $a0,$a2 + .set reorder + + li $QT,-1 + $SRL $HH,$a0,4*$BNSZ # bits + $SRL $QT,4*$BNSZ # q=0xffffffff + beq $DH,$HH,.L_bn_div_words_skip_div1 + $DIVU $zero,$a0,$DH + mflo $QT +.L_bn_div_words_skip_div1: + $MULTU $a2,$QT + $SLL $t3,$a0,4*$BNSZ # bits + $SRL $at,$a1,4*$BNSZ # bits + or $t3,$at + mflo $t0 + mfhi $t1 +.L_bn_div_words_inner_loop1: + sltu $t2,$t3,$t0 + seq $t8,$HH,$t1 + sltu $at,$HH,$t1 + and $t2,$t8 + sltu $v0,$t0,$a2 + or $at,$t2 + .set noreorder + beqz $at,.L_bn_div_words_inner_loop1_done + $SUBU $t1,$v0 + $SUBU $t0,$a2 + b .L_bn_div_words_inner_loop1 + $SUBU $QT,1 + .set reorder +.L_bn_div_words_inner_loop1_done: + + $SLL $a1,4*$BNSZ # bits + $SUBU $a0,$t3,$t0 + $SLL $v0,$QT,4*$BNSZ # bits + + li $QT,-1 + $SRL $HH,$a0,4*$BNSZ # bits + $SRL $QT,4*$BNSZ # q=0xffffffff + beq $DH,$HH,.L_bn_div_words_skip_div2 + $DIVU $zero,$a0,$DH + mflo $QT +.L_bn_div_words_skip_div2: + $MULTU $a2,$QT + $SLL $t3,$a0,4*$BNSZ # bits + $SRL $at,$a1,4*$BNSZ # bits + or $t3,$at + mflo $t0 + mfhi $t1 +.L_bn_div_words_inner_loop2: + sltu $t2,$t3,$t0 + seq $t8,$HH,$t1 + sltu $at,$HH,$t1 + and $t2,$t8 + sltu $v1,$t0,$a2 + or $at,$t2 + .set noreorder + beqz $at,.L_bn_div_words_inner_loop2_done + $SUBU $t1,$v1 + $SUBU $t0,$a2 + b .L_bn_div_words_inner_loop2 + $SUBU $QT,1 + .set reorder +.L_bn_div_words_inner_loop2_done: + + $SUBU $a0,$t3,$t0 + or $v0,$QT + $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it + $SRL $a2,$t9 # restore $a2 + + .set noreorder + move $a1,$v1 +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_div_words_internal +___ +undef $HH; undef $QT; undef $DH; + +($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); +($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); + +($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 +($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 + +($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); + +$code.=<<___; + +.align 5 +.globl bn_mul_comba8 +.ent bn_mul_comba8 +bn_mul_comba8: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,12*$SZREG,$ra + .mask 0x803ff008,-$SZREG + $PTR_SUB $sp,12*$SZREG + $REG_S $ra,11*$SZREG($sp) + $REG_S $s5,10*$SZREG($sp) + $REG_S $s4,9*$SZREG($sp) + $REG_S $s3,8*$SZREG($sp) + $REG_S $s2,7*$SZREG($sp) + $REG_S $s1,6*$SZREG($sp) + $REG_S $s0,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x003f0000,-$SZREG + $PTR_SUB $sp,6*$SZREG + $REG_S $s5,5*$SZREG($sp) + $REG_S $s4,4*$SZREG($sp) + $REG_S $s3,3*$SZREG($sp) + $REG_S $s2,2*$SZREG($sp) + $REG_S $s1,1*$SZREG($sp) + $REG_S $s0,0*$SZREG($sp) +___ +$code.=<<___; + + .set reorder + $LD $a_0,0($a1) # If compiled with -mips3 option on + # R5000 box assembler barks on this + # 1ine with "should not have mult/div + # as last instruction in bb (R10K + # bug)" warning. If anybody out there + # has a clue about how to circumvent + # this do send me a note. + # <appro\@fy.chalmers.se> + + $LD $b_0,0($a2) + $LD $a_1,$BNSZ($a1) + $LD $a_2,2*$BNSZ($a1) + $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_3,3*$BNSZ($a1) + $LD $b_1,$BNSZ($a2) + $LD $b_2,2*$BNSZ($a2) + $LD $b_3,3*$BNSZ($a2) + mflo $c_1 + mfhi $c_2 + + $LD $a_4,4*$BNSZ($a1) + $LD $a_5,5*$BNSZ($a1) + $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); + $LD $a_6,6*$BNSZ($a1) + $LD $a_7,7*$BNSZ($a1) + $LD $b_4,4*$BNSZ($a2) + $LD $b_5,5*$BNSZ($a2) + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $ADDU $c_3,$t_2,$at + $LD $b_6,6*$BNSZ($a2) + $LD $b_7,7*$BNSZ($a2) + $ST $c_1,0($a0) # r[0]=c1; + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + $ST $c_2,$BNSZ($a0) # r[1]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) # r[2]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) # r[3]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) # r[4]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) # r[5]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,6*$BNSZ($a0) # r[6]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,7*$BNSZ($a0) # r[7]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,8*$BNSZ($a0) # r[8]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,9*$BNSZ($a0) # r[9]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,10*$BNSZ($a0) # r[10]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,11*$BNSZ($a0) # r[11]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,12*$BNSZ($a0) # r[12]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,13*$BNSZ($a0) # r[13]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + $ST $c_3,14*$BNSZ($a0) # r[14]=c3; + $ST $c_1,15*$BNSZ($a0) # r[15]=c1; + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s5,10*$SZREG($sp) + $REG_L $s4,9*$SZREG($sp) + $REG_L $s3,8*$SZREG($sp) + $REG_L $s2,7*$SZREG($sp) + $REG_L $s1,6*$SZREG($sp) + $REG_L $s0,5*$SZREG($sp) + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + jr $ra + $PTR_ADD $sp,12*$SZREG +___ +$code.=<<___ if ($flavour !~ /nubi/i); + $REG_L $s5,5*$SZREG($sp) + $REG_L $s4,4*$SZREG($sp) + $REG_L $s3,3*$SZREG($sp) + $REG_L $s2,2*$SZREG($sp) + $REG_L $s1,1*$SZREG($sp) + $REG_L $s0,0*$SZREG($sp) + jr $ra + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; +.end bn_mul_comba8 + +.align 5 +.globl bn_mul_comba4 +.ent bn_mul_comba4 +bn_mul_comba4: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + $LD $a_0,0($a1) + $LD $b_0,0($a2) + $LD $a_1,$BNSZ($a1) + $LD $a_2,2*$BNSZ($a1) + $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_3,3*$BNSZ($a1) + $LD $b_1,$BNSZ($a2) + $LD $b_2,2*$BNSZ($a2) + $LD $b_3,3*$BNSZ($a2) + mflo $c_1 + mfhi $c_2 + $ST $c_1,0($a0) + + $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $ADDU $c_3,$t_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + $ST $c_2,$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + $ST $c_1,6*$BNSZ($a0) + $ST $c_2,7*$BNSZ($a0) + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + nop +.end bn_mul_comba4 +___ + +($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); + +$code.=<<___; + +.align 5 +.globl bn_sqr_comba8 +.ent bn_sqr_comba8 +bn_sqr_comba8: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + $LD $a_0,0($a1) + $LD $a_1,$BNSZ($a1) + $LD $a_2,2*$BNSZ($a1) + $LD $a_3,3*$BNSZ($a1) + + $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_4,4*$BNSZ($a1) + $LD $a_5,5*$BNSZ($a1) + $LD $a_6,6*$BNSZ($a1) + $LD $a_7,7*$BNSZ($a1) + mflo $c_1 + mfhi $c_2 + $ST $c_1,0($a0) + + $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $c_3,$t_2,$at + $ST $c_2,$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); + $ADDU $c_2,$at + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,6*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,7*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,8*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,9*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,10*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,11*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,12*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,13*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + $ST $c_3,14*$BNSZ($a0) + $ST $c_1,15*$BNSZ($a0) + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + nop +.end bn_sqr_comba8 + +.align 5 +.globl bn_sqr_comba4 +.ent bn_sqr_comba4 +bn_sqr_comba4: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + $LD $a_0,0($a1) + $LD $a_1,$BNSZ($a1) + $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_2,2*$BNSZ($a1) + $LD $a_3,3*$BNSZ($a1) + mflo $c_1 + mfhi $c_2 + $ST $c_1,0($a0) + + $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $c_3,$t_2,$at + $ST $c_2,$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + $ST $c_1,6*$BNSZ($a0) + $ST $c_2,7*$BNSZ($a0) + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + nop +.end bn_sqr_comba4 +___ +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl new file mode 100644 index 0000000..f1a702f --- /dev/null +++ b/crypto/sha/asm/sha1-mips.pl @@ -0,0 +1,354 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for MIPS. + +# Performance improvement is 30% on unaligned input. The "secret" is +# to deploy lwl/lwr pair to load unaligned input. One could have +# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- +# compatible subroutine. There is room for minor optimization on +# little-endian platforms... + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +# +# <appro@openssl.org> +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; + +for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) + { $big_endian=(unpack('L',pack('N',1))==1); } + +# offsets of the Most and Least Significant Bytes +$MSB=$big_endian?0:3; +$LSB=3&~$MSB; + +@X=map("\$$_",(8..23)); # a4-a7,s0-s11 + +$ctx=$a0; +$inp=$a1; +$num=$a2; +$A="\$1"; +$B="\$2"; +$C="\$3"; +$D="\$7"; +$E="\$24"; @V=($A,$B,$C,$D,$E); +$t0="\$25"; +$t1=$num; # $num is offloaded to stack +$t2="\$30"; # fp +$K="\$31"; # ra + +sub BODY_00_14 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if (!$big_endian); + srl $t0,@X[$i],24 # byte swap($i) + srl $t1,@X[$i],8 + andi $t2,@X[$i],0xFF00 + sll @X[$i],@X[$i],24 + andi $t1,0xFF00 + sll $t2,$t2,8 + or @X[$i],$t0 + or $t1,$t2 + or @X[$i],$t1 +___ +$code.=<<___; + lwl @X[$j],$j*4+$MSB($inp) + sll $t0,$a,5 # $i + addu $e,$K + lwr @X[$j],$j*4+$LSB($inp) + srl $t1,$a,27 + addu $e,$t0 + xor $t0,$c,$d + addu $e,$t1 + sll $t2,$b,30 + and $t0,$b + srl $b,$b,2 + xor $t0,$d + addu $e,@X[$i] + or $b,$t2 + addu $e,$t0 +___ +} + +sub BODY_15_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___ if (!$big_endian && $i==15); + srl $t0,@X[$i],24 # byte swap($i) + srl $t1,@X[$i],8 + andi $t2,@X[$i],0xFF00 + sll @X[$i],@X[$i],24 + andi $t1,0xFF00 + sll $t2,$t2,8 + or @X[$i],$t0 + or @X[$i],$t1 + or @X[$i],$t2 +___ +$code.=<<___; + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + and $t0,$b + srl $t1,@X[$j%16],31 + addu @X[$j%16],@X[$j%16] + srl $b,$b,2 + xor $t0,$d + or @X[$j%16],$t1 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + xor $t0,$b + srl $t1,@X[$j%16],31 + addu @X[$j%16],@X[$j%16] + srl $b,$b,2 + addu $e,@X[$i%16] + or @X[$j%16],$t1 + or $b,$t2 + addu $e,$t0 +___ +$code.=<<___ if ($i==79); + lw @X[0],0($ctx) + sll $t0,$a,5 # $i + addu $e,$K + lw @X[1],4($ctx) + srl $t1,$a,27 + addu $e,$t0 + lw @X[2],8($ctx) + xor $t0,$c,$d + addu $e,$t1 + lw @X[3],12($ctx) + sll $t2,$b,30 + xor $t0,$b + lw @X[4],16($ctx) + srl $b,$b,2 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + and $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + addu $e,$t0 + srl $t1,@X[$j%16],31 + xor $t0,$c,$d + addu @X[$j%16],@X[$j%16] + and $t0,$b + srl $b,$b,2 + or @X[$j%16],$t1 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +___ +} + +$FRAMESIZE=16; # large enough to accomodate NUBI saved registers +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; + +$code=<<___; +#ifdef OPENSSL_FIPSCANISTER +# include <openssl/fipssyms.h> +#endif + +.text + +.set noat +.set noreorder +.align 5 +.globl sha1_block_data_order +.ent sha1_block_data_order +sha1_block_data_order: + .frame $sp,$FRAMESIZE*$SZREG,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder + $PTR_SUB $sp,$FRAMESIZE*$SZREG + $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp) + $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp) + $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp) + $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp) + $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp) + $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp) + $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp) + $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp) + $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp) + $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp) + $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp) + $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp) + $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp) + $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp) +___ +$code.=<<___; + $PTR_SLL $num,6 + $PTR_ADD $num,$inp + $REG_S $num,0($sp) + lw $A,0($ctx) + lw $B,4($ctx) + lw $C,8($ctx) + lw $D,12($ctx) + b .Loop + lw $E,16($ctx) +.align 4 +.Loop: + .set reorder + lwl @X[0],$MSB($inp) + lui $K,0x5a82 + lwr @X[0],$LSB($inp) + ori $K,0x7999 # K_00_19 +___ +for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); } +for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0x6ed9 + ori $K,0xeba1 # K_20_39 +___ +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0x8f1b + ori $K,0xbcdc # K_40_59 +___ +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0xca62 + ori $K,0xc1d6 # K_60_79 +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + $PTR_ADD $inp,64 + $REG_L $num,0($sp) + + addu $A,$X[0] + addu $B,$X[1] + sw $A,0($ctx) + addu $C,$X[2] + addu $D,$X[3] + sw $B,4($ctx) + addu $E,$X[4] + sw $C,8($ctx) + sw $D,12($ctx) + sw $E,16($ctx) + .set noreorder + bne $inp,$num,.Loop + nop + + .set noreorder + $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp) + $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp) + $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp) + $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp) + $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp) + $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp) + $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp) + $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp) + $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp) + $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp) + $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp) + $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp) + $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp) + $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE*$SZREG +.end sha1_block_data_order +.rdata +.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>" +___ +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha1-mips.s b/crypto/sha/asm/sha1-mips.s new file mode 100644 index 0000000..865da25 --- /dev/null +++ b/crypto/sha/asm/sha1-mips.s @@ -0,0 +1,1664 @@ +#ifdef OPENSSL_FIPSCANISTER +# include <openssl/fipssyms.h> +#endif + +.text + +.set noat +.set noreorder +.align 5 +.globl sha1_block_data_order +.ent sha1_block_data_order +sha1_block_data_order: + .frame $29,16*4,$31 + .mask 3237937152,-4 + .set noreorder + sub $29,16*4 + sw $31,(16-1)*4($29) + sw $30,(16-2)*4($29) + sw $23,(16-3)*4($29) + sw $22,(16-4)*4($29) + sw $21,(16-5)*4($29) + sw $20,(16-6)*4($29) + sw $19,(16-7)*4($29) + sw $18,(16-8)*4($29) + sw $17,(16-9)*4($29) + sw $16,(16-10)*4($29) + sll $6,6 + add $6,$5 + sw $6,0($29) + lw $1,0($4) + lw $2,4($4) + lw $3,8($4) + lw $7,12($4) + b .Loop + lw $24,16($4) +.align 4 +.Loop: + .set reorder + lwl $8,3($5) + lui $31,0x5a82 + lwr $8,0($5) + ori $31,0x7999 # K_00_19 + srl $25,$8,24 # byte swap(0) + srl $6,$8,8 + andi $30,$8,0xFF00 + sll $8,$8,24 + andi $6,0xFF00 + sll $30,$30,8 + or $8,$25 + or $6,$30 + or $8,$6 + lwl $9,1*4+3($5) + sll $25,$1,5 # 0 + addu $24,$31 + lwr $9,1*4+0($5) + srl $6,$1,27 + addu $24,$25 + xor $25,$3,$7 + addu $24,$6 + sll $30,$2,30 + and $25,$2 + srl $2,$2,2 + xor $25,$7 + addu $24,$8 + or $2,$30 + addu $24,$25 + srl $25,$9,24 # byte swap(1) + srl $6,$9,8 + andi $30,$9,0xFF00 + sll $9,$9,24 + andi $6,0xFF00 + sll $30,$30,8 + or $9,$25 + or $6,$30 + or $9,$6 + lwl $10,2*4+3($5) + sll $25,$24,5 # 1 + addu $7,$31 + lwr $10,2*4+0($5) + srl $6,$24,27 + addu $7,$25 + xor $25,$2,$3 + addu $7,$6 + sll $30,$1,30 + and $25,$1 + srl $1,$1,2 + xor $25,$3 + addu $7,$9 + or $1,$30 + addu $7,$25 + srl $25,$10,24 # byte swap(2) + srl $6,$10,8 + andi $30,$10,0xFF00 + sll $10,$10,24 + andi $6,0xFF00 + sll $30,$30,8 + or $10,$25 + or $6,$30 + or $10,$6 + lwl $11,3*4+3($5) + sll $25,$7,5 # 2 + addu $3,$31 + lwr $11,3*4+0($5) + srl $6,$7,27 + addu $3,$25 + xor $25,$1,$2 + addu $3,$6 + sll $30,$24,30 + and $25,$24 + srl $24,$24,2 + xor $25,$2 + addu $3,$10 + or $24,$30 + addu $3,$25 + srl $25,$11,24 # byte swap(3) + srl $6,$11,8 + andi $30,$11,0xFF00 + sll $11,$11,24 + andi $6,0xFF00 + sll $30,$30,8 + or $11,$25 + or $6,$30 + or $11,$6 + lwl $12,4*4+3($5) + sll $25,$3,5 # 3 + addu $2,$31 + lwr $12,4*4+0($5) + srl $6,$3,27 + addu $2,$25 + xor $25,$24,$1 + addu $2,$6 + sll $30,$7,30 + and $25,$7 + srl $7,$7,2 + xor $25,$1 + addu $2,$11 + or $7,$30 + addu $2,$25 + srl $25,$12,24 # byte swap(4) + srl $6,$12,8 + andi $30,$12,0xFF00 + sll $12,$12,24 + andi $6,0xFF00 + sll $30,$30,8 + or $12,$25 + or $6,$30 + or $12,$6 + lwl $13,5*4+3($5) + sll $25,$2,5 # 4 + addu $1,$31 + lwr $13,5*4+0($5) + srl $6,$2,27 + addu $1,$25 + xor $25,$7,$24 + addu $1,$6 + sll $30,$3,30 + and $25,$3 + srl $3,$3,2 + xor $25,$24 + addu $1,$12 + or $3,$30 + addu $1,$25 + srl $25,$13,24 # byte swap(5) + srl $6,$13,8 + andi $30,$13,0xFF00 + sll $13,$13,24 + andi $6,0xFF00 + sll $30,$30,8 + or $13,$25 + or $6,$30 + or $13,$6 + lwl $14,6*4+3($5) + sll $25,$1,5 # 5 + addu $24,$31 + lwr $14,6*4+0($5) + srl $6,$1,27 + addu $24,$25 + xor $25,$3,$7 + addu $24,$6 + sll $30,$2,30 + and $25,$2 + srl $2,$2,2 + xor $25,$7 + addu $24,$13 + or $2,$30 + addu $24,$25 + srl $25,$14,24 # byte swap(6) + srl $6,$14,8 + andi $30,$14,0xFF00 + sll $14,$14,24 + andi $6,0xFF00 + sll $30,$30,8 + or $14,$25 + or $6,$30 + or $14,$6 + lwl $15,7*4+3($5) + sll $25,$24,5 # 6 + addu $7,$31 + lwr $15,7*4+0($5) + srl $6,$24,27 + addu $7,$25 + xor $25,$2,$3 + addu $7,$6 + sll $30,$1,30 + and $25,$1 + srl $1,$1,2 + xor $25,$3 + addu $7,$14 + or $1,$30 + addu $7,$25 + srl $25,$15,24 # byte swap(7) + srl $6,$15,8 + andi $30,$15,0xFF00 + sll $15,$15,24 + andi $6,0xFF00 + sll $30,$30,8 + or $15,$25 + or $6,$30 + or $15,$6 + lwl $16,8*4+3($5) + sll $25,$7,5 # 7 + addu $3,$31 + lwr $16,8*4+0($5) + srl $6,$7,27 + addu $3,$25 + xor $25,$1,$2 + addu $3,$6 + sll $30,$24,30 + and $25,$24 + srl $24,$24,2 + xor $25,$2 + addu $3,$15 + or $24,$30 + addu $3,$25 + srl $25,$16,24 # byte swap(8) + srl $6,$16,8 + andi $30,$16,0xFF00 + sll $16,$16,24 + andi $6,0xFF00 + sll $30,$30,8 + or $16,$25 + or $6,$30 + or $16,$6 + lwl $17,9*4+3($5) + sll $25,$3,5 # 8 + addu $2,$31 + lwr $17,9*4+0($5) + srl $6,$3,27 + addu $2,$25 + xor $25,$24,$1 + addu $2,$6 + sll $30,$7,30 + and $25,$7 + srl $7,$7,2 + xor $25,$1 + addu $2,$16 + or $7,$30 + addu $2,$25 + srl $25,$17,24 # byte swap(9) + srl $6,$17,8 + andi $30,$17,0xFF00 + sll $17,$17,24 + andi $6,0xFF00 + sll $30,$30,8 + or $17,$25 + or $6,$30 + or $17,$6 + lwl $18,10*4+3($5) + sll $25,$2,5 # 9 + addu $1,$31 + lwr $18,10*4+0($5) + srl $6,$2,27 + addu $1,$25 + xor $25,$7,$24 + addu $1,$6 + sll $30,$3,30 + and $25,$3 + srl $3,$3,2 + xor $25,$24 + addu $1,$17 + or $3,$30 + addu $1,$25 + srl $25,$18,24 # byte swap(10) + srl $6,$18,8 + andi $30,$18,0xFF00 + sll $18,$18,24 + andi $6,0xFF00 + sll $30,$30,8 + or $18,$25 + or $6,$30 + or $18,$6 + lwl $19,11*4+3($5) + sll $25,$1,5 # 10 + addu $24,$31 + lwr $19,11*4+0($5) + srl $6,$1,27 + addu $24,$25 + xor $25,$3,$7 + addu $24,$6 + sll $30,$2,30 + and $25,$2 + srl $2,$2,2 + xor $25,$7 + addu $24,$18 + or $2,$30 + addu $24,$25 + srl $25,$19,24 # byte swap(11) + srl $6,$19,8 + andi $30,$19,0xFF00 + sll $19,$19,24 + andi $6,0xFF00 + sll $30,$30,8 + or $19,$25 + or $6,$30 + or $19,$6 + lwl $20,12*4+3($5) + sll $25,$24,5 # 11 + addu $7,$31 + lwr $20,12*4+0($5) + srl $6,$24,27 + addu $7,$25 + xor $25,$2,$3 + addu $7,$6 + sll $30,$1,30 + and $25,$1 + srl $1,$1,2 + xor $25,$3 + addu $7,$19 + or $1,$30 + addu $7,$25 + srl $25,$20,24 # byte swap(12) + srl $6,$20,8 + andi $30,$20,0xFF00 + sll $20,$20,24 + andi $6,0xFF00 + sll $30,$30,8 + or $20,$25 + or $6,$30 + or $20,$6 + lwl $21,13*4+3($5) + sll $25,$7,5 # 12 + addu $3,$31 + lwr $21,13*4+0($5) + srl $6,$7,27 + addu $3,$25 + xor $25,$1,$2 + addu $3,$6 + sll $30,$24,30 + and $25,$24 + srl $24,$24,2 + xor $25,$2 + addu $3,$20 + or $24,$30 + addu $3,$25 + srl $25,$21,24 # byte swap(13) + srl $6,$21,8 + andi $30,$21,0xFF00 + sll $21,$21,24 + andi $6,0xFF00 + sll $30,$30,8 + or $21,$25 + or $6,$30 + or $21,$6 + lwl $22,14*4+3($5) + sll $25,$3,5 # 13 + addu $2,$31 + lwr $22,14*4+0($5) + srl $6,$3,27 + addu $2,$25 + xor $25,$24,$1 + addu $2,$6 + sll $30,$7,30 + and $25,$7 + srl $7,$7,2 + xor $25,$1 + addu $2,$21 + or $7,$30 + addu $2,$25 + srl $25,$22,24 # byte swap(14) + srl $6,$22,8 + andi $30,$22,0xFF00 + sll $22,$22,24 + andi $6,0xFF00 + sll $30,$30,8 + or $22,$25 + or $6,$30 + or $22,$6 + lwl $23,15*4+3($5) + sll $25,$2,5 # 14 + addu $1,$31 + lwr $23,15*4+0($5) + srl $6,$2,27 + addu $1,$25 + xor $25,$7,$24 + addu $1,$6 + sll $30,$3,30 + and $25,$3 + srl $3,$3,2 + xor $25,$24 + addu $1,$22 + or $3,$30 + addu $1,$25 + srl $25,$23,24 # byte swap(15) + srl $6,$23,8 + andi $30,$23,0xFF00 + sll $23,$23,24 + andi $6,0xFF00 + sll $30,$30,8 + or $23,$25 + or $23,$6 + or $23,$30 + xor $8,$10 + sll $25,$1,5 # 15 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $8,$16 + xor $25,$3,$7 + addu $24,$6 + xor $8,$21 + sll $30,$2,30 + and $25,$2 + srl $6,$8,31 + addu $8,$8 + srl $2,$2,2 + xor $25,$7 + or $8,$6 + addu $24,$23 + or $2,$30 + addu $24,$25 + xor $9,$11 + sll $25,$24,5 # 16 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $9,$17 + xor $25,$2,$3 + addu $7,$6 + xor $9,$22 + sll $30,$1,30 + and $25,$1 + srl $6,$9,31 + addu $9,$9 + srl $1,$1,2 + xor $25,$3 + or $9,$6 + addu $7,$8 + or $1,$30 + addu $7,$25 + xor $10,$12 + sll $25,$7,5 # 17 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $10,$18 + xor $25,$1,$2 + addu $3,$6 + xor $10,$23 + sll $30,$24,30 + and $25,$24 + srl $6,$10,31 + addu $10,$10 + srl $24,$24,2 + xor $25,$2 + or $10,$6 + addu $3,$9 + or $24,$30 + addu $3,$25 + xor $11,$13 + sll $25,$3,5 # 18 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $11,$19 + xor $25,$24,$1 + addu $2,$6 + xor $11,$8 + sll $30,$7,30 + and $25,$7 + srl $6,$11,31 + addu $11,$11 + srl $7,$7,2 + xor $25,$1 + or $11,$6 + addu $2,$10 + or $7,$30 + addu $2,$25 + xor $12,$14 + sll $25,$2,5 # 19 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $12,$20 + xor $25,$7,$24 + addu $1,$6 + xor $12,$9 + sll $30,$3,30 + and $25,$3 + srl $6,$12,31 + addu $12,$12 + srl $3,$3,2 + xor $25,$24 + or $12,$6 + addu $1,$11 + or $3,$30 + addu $1,$25 + lui $31,0x6ed9 + ori $31,0xeba1 # K_20_39 + xor $13,$15 + sll $25,$1,5 # 20 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $13,$21 + xor $25,$3,$7 + addu $24,$6 + xor $13,$10 + sll $30,$2,30 + xor $25,$2 + srl $6,$13,31 + addu $13,$13 + srl $2,$2,2 + addu $24,$12 + or $13,$6 + or $2,$30 + addu $24,$25 + xor $14,$16 + sll $25,$24,5 # 21 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $14,$22 + xor $25,$2,$3 + addu $7,$6 + xor $14,$11 + sll $30,$1,30 + xor $25,$1 + srl $6,$14,31 + addu $14,$14 + srl $1,$1,2 + addu $7,$13 + or $14,$6 + or $1,$30 + addu $7,$25 + xor $15,$17 + sll $25,$7,5 # 22 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $15,$23 + xor $25,$1,$2 + addu $3,$6 + xor $15,$12 + sll $30,$24,30 + xor $25,$24 + srl $6,$15,31 + addu $15,$15 + srl $24,$24,2 + addu $3,$14 + or $15,$6 + or $24,$30 + addu $3,$25 + xor $16,$18 + sll $25,$3,5 # 23 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $16,$8 + xor $25,$24,$1 + addu $2,$6 + xor $16,$13 + sll $30,$7,30 + xor $25,$7 + srl $6,$16,31 + addu $16,$16 + srl $7,$7,2 + addu $2,$15 + or $16,$6 + or $7,$30 + addu $2,$25 + xor $17,$19 + sll $25,$2,5 # 24 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $17,$9 + xor $25,$7,$24 + addu $1,$6 + xor $17,$14 + sll $30,$3,30 + xor $25,$3 + srl $6,$17,31 + addu $17,$17 + srl $3,$3,2 + addu $1,$16 + or $17,$6 + or $3,$30 + addu $1,$25 + xor $18,$20 + sll $25,$1,5 # 25 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $18,$10 + xor $25,$3,$7 + addu $24,$6 + xor $18,$15 + sll $30,$2,30 + xor $25,$2 + srl $6,$18,31 + addu $18,$18 + srl $2,$2,2 + addu $24,$17 + or $18,$6 + or $2,$30 + addu $24,$25 + xor $19,$21 + sll $25,$24,5 # 26 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $19,$11 + xor $25,$2,$3 + addu $7,$6 + xor $19,$16 + sll $30,$1,30 + xor $25,$1 + srl $6,$19,31 + addu $19,$19 + srl $1,$1,2 + addu $7,$18 + or $19,$6 + or $1,$30 + addu $7,$25 + xor $20,$22 + sll $25,$7,5 # 27 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $20,$12 + xor $25,$1,$2 + addu $3,$6 + xor $20,$17 + sll $30,$24,30 + xor $25,$24 + srl $6,$20,31 + addu $20,$20 + srl $24,$24,2 + addu $3,$19 + or $20,$6 + or $24,$30 + addu $3,$25 + xor $21,$23 + sll $25,$3,5 # 28 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $21,$13 + xor $25,$24,$1 + addu $2,$6 + xor $21,$18 + sll $30,$7,30 + xor $25,$7 + srl $6,$21,31 + addu $21,$21 + srl $7,$7,2 + addu $2,$20 + or $21,$6 + or $7,$30 + addu $2,$25 + xor $22,$8 + sll $25,$2,5 # 29 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $22,$14 + xor $25,$7,$24 + addu $1,$6 + xor $22,$19 + sll $30,$3,30 + xor $25,$3 + srl $6,$22,31 + addu $22,$22 + srl $3,$3,2 + addu $1,$21 + or $22,$6 + or $3,$30 + addu $1,$25 + xor $23,$9 + sll $25,$1,5 # 30 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $23,$15 + xor $25,$3,$7 + addu $24,$6 + xor $23,$20 + sll $30,$2,30 + xor $25,$2 + srl $6,$23,31 + addu $23,$23 + srl $2,$2,2 + addu $24,$22 + or $23,$6 + or $2,$30 + addu $24,$25 + xor $8,$10 + sll $25,$24,5 # 31 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $8,$16 + xor $25,$2,$3 + addu $7,$6 + xor $8,$21 + sll $30,$1,30 + xor $25,$1 + srl $6,$8,31 + addu $8,$8 + srl $1,$1,2 + addu $7,$23 + or $8,$6 + or $1,$30 + addu $7,$25 + xor $9,$11 + sll $25,$7,5 # 32 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $9,$17 + xor $25,$1,$2 + addu $3,$6 + xor $9,$22 + sll $30,$24,30 + xor $25,$24 + srl $6,$9,31 + addu $9,$9 + srl $24,$24,2 + addu $3,$8 + or $9,$6 + or $24,$30 + addu $3,$25 + xor $10,$12 + sll $25,$3,5 # 33 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $10,$18 + xor $25,$24,$1 + addu $2,$6 + xor $10,$23 + sll $30,$7,30 + xor $25,$7 + srl $6,$10,31 + addu $10,$10 + srl $7,$7,2 + addu $2,$9 + or $10,$6 + or $7,$30 + addu $2,$25 + xor $11,$13 + sll $25,$2,5 # 34 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $11,$19 + xor $25,$7,$24 + addu $1,$6 + xor $11,$8 + sll $30,$3,30 + xor $25,$3 + srl $6,$11,31 + addu $11,$11 + srl $3,$3,2 + addu $1,$10 + or $11,$6 + or $3,$30 + addu $1,$25 + xor $12,$14 + sll $25,$1,5 # 35 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $12,$20 + xor $25,$3,$7 + addu $24,$6 + xor $12,$9 + sll $30,$2,30 + xor $25,$2 + srl $6,$12,31 + addu $12,$12 + srl $2,$2,2 + addu $24,$11 + or $12,$6 + or $2,$30 + addu $24,$25 + xor $13,$15 + sll $25,$24,5 # 36 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $13,$21 + xor $25,$2,$3 + addu $7,$6 + xor $13,$10 + sll $30,$1,30 + xor $25,$1 + srl $6,$13,31 + addu $13,$13 + srl $1,$1,2 + addu $7,$12 + or $13,$6 + or $1,$30 + addu $7,$25 + xor $14,$16 + sll $25,$7,5 # 37 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $14,$22 + xor $25,$1,$2 + addu $3,$6 + xor $14,$11 + sll $30,$24,30 + xor $25,$24 + srl $6,$14,31 + addu $14,$14 + srl $24,$24,2 + addu $3,$13 + or $14,$6 + or $24,$30 + addu $3,$25 + xor $15,$17 + sll $25,$3,5 # 38 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $15,$23 + xor $25,$24,$1 + addu $2,$6 + xor $15,$12 + sll $30,$7,30 + xor $25,$7 + srl $6,$15,31 + addu $15,$15 + srl $7,$7,2 + addu $2,$14 + or $15,$6 + or $7,$30 + addu $2,$25 + xor $16,$18 + sll $25,$2,5 # 39 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $16,$8 + xor $25,$7,$24 + addu $1,$6 + xor $16,$13 + sll $30,$3,30 + xor $25,$3 + srl $6,$16,31 + addu $16,$16 + srl $3,$3,2 + addu $1,$15 + or $16,$6 + or $3,$30 + addu $1,$25 + lui $31,0x8f1b + ori $31,0xbcdc # K_40_59 + xor $17,$19 + sll $25,$1,5 # 40 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $17,$9 + and $25,$3,$7 + addu $24,$6 + xor $17,$14 + sll $30,$2,30 + addu $24,$25 + srl $6,$17,31 + xor $25,$3,$7 + addu $17,$17 + and $25,$2 + srl $2,$2,2 + or $17,$6 + addu $24,$16 + or $2,$30 + addu $24,$25 + xor $18,$20 + sll $25,$24,5 # 41 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $18,$10 + and $25,$2,$3 + addu $7,$6 + xor $18,$15 + sll $30,$1,30 + addu $7,$25 + srl $6,$18,31 + xor $25,$2,$3 + addu $18,$18 + and $25,$1 + srl $1,$1,2 + or $18,$6 + addu $7,$17 + or $1,$30 + addu $7,$25 + xor $19,$21 + sll $25,$7,5 # 42 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $19,$11 + and $25,$1,$2 + addu $3,$6 + xor $19,$16 + sll $30,$24,30 + addu $3,$25 + srl $6,$19,31 + xor $25,$1,$2 + addu $19,$19 + and $25,$24 + srl $24,$24,2 + or $19,$6 + addu $3,$18 + or $24,$30 + addu $3,$25 + xor $20,$22 + sll $25,$3,5 # 43 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $20,$12 + and $25,$24,$1 + addu $2,$6 + xor $20,$17 + sll $30,$7,30 + addu $2,$25 + srl $6,$20,31 + xor $25,$24,$1 + addu $20,$20 + and $25,$7 + srl $7,$7,2 + or $20,$6 + addu $2,$19 + or $7,$30 + addu $2,$25 + xor $21,$23 + sll $25,$2,5 # 44 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $21,$13 + and $25,$7,$24 + addu $1,$6 + xor $21,$18 + sll $30,$3,30 + addu $1,$25 + srl $6,$21,31 + xor $25,$7,$24 + addu $21,$21 + and $25,$3 + srl $3,$3,2 + or $21,$6 + addu $1,$20 + or $3,$30 + addu $1,$25 + xor $22,$8 + sll $25,$1,5 # 45 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $22,$14 + and $25,$3,$7 + addu $24,$6 + xor $22,$19 + sll $30,$2,30 + addu $24,$25 + srl $6,$22,31 + xor $25,$3,$7 + addu $22,$22 + and $25,$2 + srl $2,$2,2 + or $22,$6 + addu $24,$21 + or $2,$30 + addu $24,$25 + xor $23,$9 + sll $25,$24,5 # 46 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $23,$15 + and $25,$2,$3 + addu $7,$6 + xor $23,$20 + sll $30,$1,30 + addu $7,$25 + srl $6,$23,31 + xor $25,$2,$3 + addu $23,$23 + and $25,$1 + srl $1,$1,2 + or $23,$6 + addu $7,$22 + or $1,$30 + addu $7,$25 + xor $8,$10 + sll $25,$7,5 # 47 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $8,$16 + and $25,$1,$2 + addu $3,$6 + xor $8,$21 + sll $30,$24,30 + addu $3,$25 + srl $6,$8,31 + xor $25,$1,$2 + addu $8,$8 + and $25,$24 + srl $24,$24,2 + or $8,$6 + addu $3,$23 + or $24,$30 + addu $3,$25 + xor $9,$11 + sll $25,$3,5 # 48 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $9,$17 + and $25,$24,$1 + addu $2,$6 + xor $9,$22 + sll $30,$7,30 + addu $2,$25 + srl $6,$9,31 + xor $25,$24,$1 + addu $9,$9 + and $25,$7 + srl $7,$7,2 + or $9,$6 + addu $2,$8 + or $7,$30 + addu $2,$25 + xor $10,$12 + sll $25,$2,5 # 49 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $10,$18 + and $25,$7,$24 + addu $1,$6 + xor $10,$23 + sll $30,$3,30 + addu $1,$25 + srl $6,$10,31 + xor $25,$7,$24 + addu $10,$10 + and $25,$3 + srl $3,$3,2 + or $10,$6 + addu $1,$9 + or $3,$30 + addu $1,$25 + xor $11,$13 + sll $25,$1,5 # 50 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $11,$19 + and $25,$3,$7 + addu $24,$6 + xor $11,$8 + sll $30,$2,30 + addu $24,$25 + srl $6,$11,31 + xor $25,$3,$7 + addu $11,$11 + and $25,$2 + srl $2,$2,2 + or $11,$6 + addu $24,$10 + or $2,$30 + addu $24,$25 + xor $12,$14 + sll $25,$24,5 # 51 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $12,$20 + and $25,$2,$3 + addu $7,$6 + xor $12,$9 + sll $30,$1,30 + addu $7,$25 + srl $6,$12,31 + xor $25,$2,$3 + addu $12,$12 + and $25,$1 + srl $1,$1,2 + or $12,$6 + addu $7,$11 + or $1,$30 + addu $7,$25 + xor $13,$15 + sll $25,$7,5 # 52 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $13,$21 + and $25,$1,$2 + addu $3,$6 + xor $13,$10 + sll $30,$24,30 + addu $3,$25 + srl $6,$13,31 + xor $25,$1,$2 + addu $13,$13 + and $25,$24 + srl $24,$24,2 + or $13,$6 + addu $3,$12 + or $24,$30 + addu $3,$25 + xor $14,$16 + sll $25,$3,5 # 53 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $14,$22 + and $25,$24,$1 + addu $2,$6 + xor $14,$11 + sll $30,$7,30 + addu $2,$25 + srl $6,$14,31 + xor $25,$24,$1 + addu $14,$14 + and $25,$7 + srl $7,$7,2 + or $14,$6 + addu $2,$13 + or $7,$30 + addu $2,$25 + xor $15,$17 + sll $25,$2,5 # 54 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $15,$23 + and $25,$7,$24 + addu $1,$6 + xor $15,$12 + sll $30,$3,30 + addu $1,$25 + srl $6,$15,31 + xor $25,$7,$24 + addu $15,$15 + and $25,$3 + srl $3,$3,2 + or $15,$6 + addu $1,$14 + or $3,$30 + addu $1,$25 + xor $16,$18 + sll $25,$1,5 # 55 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $16,$8 + and $25,$3,$7 + addu $24,$6 + xor $16,$13 + sll $30,$2,30 + addu $24,$25 + srl $6,$16,31 + xor $25,$3,$7 + addu $16,$16 + and $25,$2 + srl $2,$2,2 + or $16,$6 + addu $24,$15 + or $2,$30 + addu $24,$25 + xor $17,$19 + sll $25,$24,5 # 56 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $17,$9 + and $25,$2,$3 + addu $7,$6 + xor $17,$14 + sll $30,$1,30 + addu $7,$25 + srl $6,$17,31 + xor $25,$2,$3 + addu $17,$17 + and $25,$1 + srl $1,$1,2 + or $17,$6 + addu $7,$16 + or $1,$30 + addu $7,$25 + xor $18,$20 + sll $25,$7,5 # 57 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $18,$10 + and $25,$1,$2 + addu $3,$6 + xor $18,$15 + sll $30,$24,30 + addu $3,$25 + srl $6,$18,31 + xor $25,$1,$2 + addu $18,$18 + and $25,$24 + srl $24,$24,2 + or $18,$6 + addu $3,$17 + or $24,$30 + addu $3,$25 + xor $19,$21 + sll $25,$3,5 # 58 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $19,$11 + and $25,$24,$1 + addu $2,$6 + xor $19,$16 + sll $30,$7,30 + addu $2,$25 + srl $6,$19,31 + xor $25,$24,$1 + addu $19,$19 + and $25,$7 + srl $7,$7,2 + or $19,$6 + addu $2,$18 + or $7,$30 + addu $2,$25 + xor $20,$22 + sll $25,$2,5 # 59 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $20,$12 + and $25,$7,$24 + addu $1,$6 + xor $20,$17 + sll $30,$3,30 + addu $1,$25 + srl $6,$20,31 + xor $25,$7,$24 + addu $20,$20 + and $25,$3 + srl $3,$3,2 + or $20,$6 + addu $1,$19 + or $3,$30 + addu $1,$25 + lui $31,0xca62 + ori $31,0xc1d6 # K_60_79 + xor $21,$23 + sll $25,$1,5 # 60 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $21,$13 + xor $25,$3,$7 + addu $24,$6 + xor $21,$18 + sll $30,$2,30 + xor $25,$2 + srl $6,$21,31 + addu $21,$21 + srl $2,$2,2 + addu $24,$20 + or $21,$6 + or $2,$30 + addu $24,$25 + xor $22,$8 + sll $25,$24,5 # 61 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $22,$14 + xor $25,$2,$3 + addu $7,$6 + xor $22,$19 + sll $30,$1,30 + xor $25,$1 + srl $6,$22,31 + addu $22,$22 + srl $1,$1,2 + addu $7,$21 + or $22,$6 + or $1,$30 + addu $7,$25 + xor $23,$9 + sll $25,$7,5 # 62 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $23,$15 + xor $25,$1,$2 + addu $3,$6 + xor $23,$20 + sll $30,$24,30 + xor $25,$24 + srl $6,$23,31 + addu $23,$23 + srl $24,$24,2 + addu $3,$22 + or $23,$6 + or $24,$30 + addu $3,$25 + xor $8,$10 + sll $25,$3,5 # 63 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $8,$16 + xor $25,$24,$1 + addu $2,$6 + xor $8,$21 + sll $30,$7,30 + xor $25,$7 + srl $6,$8,31 + addu $8,$8 + srl $7,$7,2 + addu $2,$23 + or $8,$6 + or $7,$30 + addu $2,$25 + xor $9,$11 + sll $25,$2,5 # 64 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $9,$17 + xor $25,$7,$24 + addu $1,$6 + xor $9,$22 + sll $30,$3,30 + xor $25,$3 + srl $6,$9,31 + addu $9,$9 + srl $3,$3,2 + addu $1,$8 + or $9,$6 + or $3,$30 + addu $1,$25 + xor $10,$12 + sll $25,$1,5 # 65 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $10,$18 + xor $25,$3,$7 + addu $24,$6 + xor $10,$23 + sll $30,$2,30 + xor $25,$2 + srl $6,$10,31 + addu $10,$10 + srl $2,$2,2 + addu $24,$9 + or $10,$6 + or $2,$30 + addu $24,$25 + xor $11,$13 + sll $25,$24,5 # 66 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $11,$19 + xor $25,$2,$3 + addu $7,$6 + xor $11,$8 + sll $30,$1,30 + xor $25,$1 + srl $6,$11,31 + addu $11,$11 + srl $1,$1,2 + addu $7,$10 + or $11,$6 + or $1,$30 + addu $7,$25 + xor $12,$14 + sll $25,$7,5 # 67 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $12,$20 + xor $25,$1,$2 + addu $3,$6 + xor $12,$9 + sll $30,$24,30 + xor $25,$24 + srl $6,$12,31 + addu $12,$12 + srl $24,$24,2 + addu $3,$11 + or $12,$6 + or $24,$30 + addu $3,$25 + xor $13,$15 + sll $25,$3,5 # 68 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $13,$21 + xor $25,$24,$1 + addu $2,$6 + xor $13,$10 + sll $30,$7,30 + xor $25,$7 + srl $6,$13,31 + addu $13,$13 + srl $7,$7,2 + addu $2,$12 + or $13,$6 + or $7,$30 + addu $2,$25 + xor $14,$16 + sll $25,$2,5 # 69 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $14,$22 + xor $25,$7,$24 + addu $1,$6 + xor $14,$11 + sll $30,$3,30 + xor $25,$3 + srl $6,$14,31 + addu $14,$14 + srl $3,$3,2 + addu $1,$13 + or $14,$6 + or $3,$30 + addu $1,$25 + xor $15,$17 + sll $25,$1,5 # 70 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $15,$23 + xor $25,$3,$7 + addu $24,$6 + xor $15,$12 + sll $30,$2,30 + xor $25,$2 + srl $6,$15,31 + addu $15,$15 + srl $2,$2,2 + addu $24,$14 + or $15,$6 + or $2,$30 + addu $24,$25 + xor $16,$18 + sll $25,$24,5 # 71 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $16,$8 + xor $25,$2,$3 + addu $7,$6 + xor $16,$13 + sll $30,$1,30 + xor $25,$1 + srl $6,$16,31 + addu $16,$16 + srl $1,$1,2 + addu $7,$15 + or $16,$6 + or $1,$30 + addu $7,$25 + xor $17,$19 + sll $25,$7,5 # 72 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $17,$9 + xor $25,$1,$2 + addu $3,$6 + xor $17,$14 + sll $30,$24,30 + xor $25,$24 + srl $6,$17,31 + addu $17,$17 + srl $24,$24,2 + addu $3,$16 + or $17,$6 + or $24,$30 + addu $3,$25 + xor $18,$20 + sll $25,$3,5 # 73 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $18,$10 + xor $25,$24,$1 + addu $2,$6 + xor $18,$15 + sll $30,$7,30 + xor $25,$7 + srl $6,$18,31 + addu $18,$18 + srl $7,$7,2 + addu $2,$17 + or $18,$6 + or $7,$30 + addu $2,$25 + xor $19,$21 + sll $25,$2,5 # 74 + addu $1,$31 + srl $6,$2,27 + addu $1,$25 + xor $19,$11 + xor $25,$7,$24 + addu $1,$6 + xor $19,$16 + sll $30,$3,30 + xor $25,$3 + srl $6,$19,31 + addu $19,$19 + srl $3,$3,2 + addu $1,$18 + or $19,$6 + or $3,$30 + addu $1,$25 + xor $20,$22 + sll $25,$1,5 # 75 + addu $24,$31 + srl $6,$1,27 + addu $24,$25 + xor $20,$12 + xor $25,$3,$7 + addu $24,$6 + xor $20,$17 + sll $30,$2,30 + xor $25,$2 + srl $6,$20,31 + addu $20,$20 + srl $2,$2,2 + addu $24,$19 + or $20,$6 + or $2,$30 + addu $24,$25 + xor $21,$23 + sll $25,$24,5 # 76 + addu $7,$31 + srl $6,$24,27 + addu $7,$25 + xor $21,$13 + xor $25,$2,$3 + addu $7,$6 + xor $21,$18 + sll $30,$1,30 + xor $25,$1 + srl $6,$21,31 + addu $21,$21 + srl $1,$1,2 + addu $7,$20 + or $21,$6 + or $1,$30 + addu $7,$25 + xor $22,$8 + sll $25,$7,5 # 77 + addu $3,$31 + srl $6,$7,27 + addu $3,$25 + xor $22,$14 + xor $25,$1,$2 + addu $3,$6 + xor $22,$19 + sll $30,$24,30 + xor $25,$24 + srl $6,$22,31 + addu $22,$22 + srl $24,$24,2 + addu $3,$21 + or $22,$6 + or $24,$30 + addu $3,$25 + xor $23,$9 + sll $25,$3,5 # 78 + addu $2,$31 + srl $6,$3,27 + addu $2,$25 + xor $23,$15 + xor $25,$24,$1 + addu $2,$6 + xor $23,$20 + sll $30,$7,30 + xor $25,$7 + srl $6,$23,31 + addu $23,$23 + srl $7,$7,2 + addu $2,$22 + or $23,$6 + or $7,$30 + addu $2,$25 + lw $8,0($4) + sll $25,$2,5 # 79 + addu $1,$31 + lw $9,4($4) + srl $6,$2,27 + addu $1,$25 + lw $10,8($4) + xor $25,$7,$24 + addu $1,$6 + lw $11,12($4) + sll $30,$3,30 + xor $25,$3 + lw $12,16($4) + srl $3,$3,2 + addu $1,$23 + or $3,$30 + addu $1,$25 + add $5,64 + lw $6,0($29) + + addu $1,$8 + addu $2,$9 + sw $1,0($4) + addu $3,$10 + addu $7,$11 + sw $2,4($4) + addu $24,$12 + sw $3,8($4) + sw $7,12($4) + sw $24,16($4) + .set noreorder + bne $5,$6,.Loop + nop + + .set noreorder + lw $31,(16-1)*4($29) + lw $30,(16-2)*4($29) + lw $23,(16-3)*4($29) + lw $22,(16-4)*4($29) + lw $21,(16-5)*4($29) + lw $20,(16-6)*4($29) + lw $19,(16-7)*4($29) + lw $18,(16-8)*4($29) + lw $17,(16-9)*4($29) + lw $16,(16-10)*4($29) + jr $31 + add $29,16*4 +.end sha1_block_data_order +.rdata +.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro@openssl.org>" diff --git a/crypto/sha/asm/sha256-mips.s b/crypto/sha/asm/sha256-mips.s new file mode 100644 index 0000000..83876d9 --- /dev/null +++ b/crypto/sha/asm/sha256-mips.s @@ -0,0 +1,1999 @@ +#ifdef OPENSSL_FIPSCANISTER +# include <openssl/fipssyms.h> +#endif + +.text +.set noat +#if !defined(__vxworks) || defined(__pic__) +.option pic2 +#endif + +.align 5 +.globl sha256_block_data_order +.ent sha256_block_data_order +sha256_block_data_order: + .frame $29,128,$31 + .mask 3237937152,-4 + .set noreorder + sub $29,128 + sw $31,128-1*4($29) + sw $30,128-2*4($29) + sw $23,128-3*4($29) + sw $22,128-4*4($29) + sw $21,128-5*4($29) + sw $20,128-6*4($29) + sw $19,128-7*4($29) + sw $18,128-8*4($29) + sw $17,128-9*4($29) + sw $16,128-10*4($29) + sll $23,$6,6 + .cplocal $6 + .cpsetup $25,$0,sha256_block_data_order + .set reorder + la $6,K256 # PIC-ified 'load address' + + lw $1,0*4($4) # load context + lw $2,1*4($4) + lw $3,2*4($4) + lw $7,3*4($4) + lw $24,4*4($4) + lw $25,5*4($4) + lw $30,6*4($4) + lw $31,7*4($4) + + add $23,$5 # pointer to the end of input + sw $23,16*4($29) + b .Loop + +.align 5 +.Loop: + lwl $8,3($5) + lwr $8,0($5) + lwl $9,7($5) + lwr $9,4($5) + srl $13,$8,24 # byte swap(0) + srl $14,$8,8 + andi $15,$8,0xFF00 + sll $8,$8,24 + andi $14,0xFF00 + sll $15,$15,8 + or $8,$13 + or $14,$15 + or $8,$14 + addu $12,$8,$31 # 0 + srl $31,$24,6 + xor $15,$25,$30 + sll $14,$24,7 + and $15,$24 + srl $13,$24,11 + xor $31,$14 + sll $14,$24,21 + xor $31,$13 + srl $13,$24,25 + xor $31,$14 + sll $14,$24,26 + xor $31,$13 + xor $15,$30 # Ch(e,f,g) + xor $13,$14,$31 # Sigma1(e) + + srl $31,$1,2 + addu $12,$15 + lw $15,0($6) # K[0] + sll $14,$1,10 + addu $12,$13 + srl $13,$1,13 + xor $31,$14 + sll $14,$1,19 + xor $31,$13 + srl $13,$1,22 + xor $31,$14 + sll $14,$1,30 + xor $31,$13 + sw $8,0($29) # offload to ring buffer + xor $31,$14 # Sigma0(a) + + or $13,$1,$2 + and $14,$1,$2 + and $13,$3 + or $14,$13 # Maj(a,b,c) + addu $12,$15 # +=K[0] + addu $31,$14 + + addu $7,$12 + addu $31,$12 + lwl $10,11($5) + lwr $10,8($5) + srl $14,$9,24 # byte swap(1) + srl $15,$9,8 + andi $16,$9,0xFF00 + sll $9,$9,24 + andi $15,0xFF00 + sll $16,$16,8 + or $9,$14 + or $15,$16 + or $9,$15 + addu $13,$9,$30 # 1 + srl $30,$7,6 + xor $16,$24,$25 + sll $15,$7,7 + and $16,$7 + srl $14,$7,11 + xor $30,$15 + sll $15,$7,21 + xor $30,$14 + srl $14,$7,25 + xor $30,$15 + sll $15,$7,26 + xor $30,$14 + xor $16,$25 # Ch(e,f,g) + xor $14,$15,$30 # Sigma1(e) + + srl $30,$31,2 + addu $13,$16 + lw $16,4($6) # K[1] + sll $15,$31,10 + addu $13,$14 + srl $14,$31,13 + xor $30,$15 + sll $15,$31,19 + xor $30,$14 + srl $14,$31,22 + xor $30,$15 + sll $15,$31,30 + xor $30,$14 + sw $9,4($29) # offload to ring buffer + xor $30,$15 # Sigma0(a) + + or $14,$31,$1 + and $15,$31,$1 + and $14,$2 + or $15,$14 # Maj(a,b,c) + addu $13,$16 # +=K[1] + addu $30,$15 + + addu $3,$13 + addu $30,$13 + lwl $11,15($5) + lwr $11,12($5) + srl $15,$10,24 # byte swap(2) + srl $16,$10,8 + andi $17,$10,0xFF00 + sll $10,$10,24 + andi $16,0xFF00 + sll $17,$17,8 + or $10,$15 + or $16,$17 + or $10,$16 + addu $14,$10,$25 # 2 + srl $25,$3,6 + xor $17,$7,$24 + sll $16,$3,7 + and $17,$3 + srl $15,$3,11 + xor $25,$16 + sll $16,$3,21 + xor $25,$15 + srl $15,$3,25 + xor $25,$16 + sll $16,$3,26 + xor $25,$15 + xor $17,$24 # Ch(e,f,g) + xor $15,$16,$25 # Sigma1(e) + + srl $25,$30,2 + addu $14,$17 + lw $17,8($6) # K[2] + sll $16,$30,10 + addu $14,$15 + srl $15,$30,13 + xor $25,$16 + sll $16,$30,19 + xor $25,$15 + srl $15,$30,22 + xor $25,$16 + sll $16,$30,30 + xor $25,$15 + sw $10,8($29) # offload to ring buffer + xor $25,$16 # Sigma0(a) + + or $15,$30,$31 + and $16,$30,$31 + and $15,$1 + or $16,$15 # Maj(a,b,c) + addu $14,$17 # +=K[2] + addu $25,$16 + + addu $2,$14 + addu $25,$14 + lwl $12,19($5) + lwr $12,16($5) + srl $16,$11,24 # byte swap(3) + srl $17,$11,8 + andi $18,$11,0xFF00 + sll $11,$11,24 + andi $17,0xFF00 + sll $18,$18,8 + or $11,$16 + or $17,$18 + or $11,$17 + addu $15,$11,$24 # 3 + srl $24,$2,6 + xor $18,$3,$7 + sll $17,$2,7 + and $18,$2 + srl $16,$2,11 + xor $24,$17 + sll $17,$2,21 + xor $24,$16 + srl $16,$2,25 + xor $24,$17 + sll $17,$2,26 + xor $24,$16 + xor $18,$7 # Ch(e,f,g) + xor $16,$17,$24 # Sigma1(e) + + srl $24,$25,2 + addu $15,$18 + lw $18,12($6) # K[3] + sll $17,$25,10 + addu $15,$16 + srl $16,$25,13 + xor $24,$17 + sll $17,$25,19 + xor $24,$16 + srl $16,$25,22 + xor $24,$17 + sll $17,$25,30 + xor $24,$16 + sw $11,12($29) # offload to ring buffer + xor $24,$17 # Sigma0(a) + + or $16,$25,$30 + and $17,$25,$30 + and $16,$31 + or $17,$16 # Maj(a,b,c) + addu $15,$18 # +=K[3] + addu $24,$17 + + addu $1,$15 + addu $24,$15 + lwl $13,23($5) + lwr $13,20($5) + srl $17,$12,24 # byte swap(4) + srl $18,$12,8 + andi $19,$12,0xFF00 + sll $12,$12,24 + andi $18,0xFF00 + sll $19,$19,8 + or $12,$17 + or $18,$19 + or $12,$18 + addu $16,$12,$7 # 4 + srl $7,$1,6 + xor $19,$2,$3 + sll $18,$1,7 + and $19,$1 + srl $17,$1,11 + xor $7,$18 + sll $18,$1,21 + xor $7,$17 + srl $17,$1,25 + xor $7,$18 + sll $18,$1,26 + xor $7,$17 + xor $19,$3 # Ch(e,f,g) + xor $17,$18,$7 # Sigma1(e) + + srl $7,$24,2 + addu $16,$19 + lw $19,16($6) # K[4] + sll $18,$24,10 + addu $16,$17 + srl $17,$24,13 + xor $7,$18 + sll $18,$24,19 + xor $7,$17 + srl $17,$24,22 + xor $7,$18 + sll $18,$24,30 + xor $7,$17 + sw $12,16($29) # offload to ring buffer + xor $7,$18 # Sigma0(a) + + or $17,$24,$25 + and $18,$24,$25 + and $17,$30 + or $18,$17 # Maj(a,b,c) + addu $16,$19 # +=K[4] + addu $7,$18 + + addu $31,$16 + addu $7,$16 + lwl $14,27($5) + lwr $14,24($5) + srl $18,$13,24 # byte swap(5) + srl $19,$13,8 + andi $20,$13,0xFF00 + sll $13,$13,24 + andi $19,0xFF00 + sll $20,$20,8 + or $13,$18 + or $19,$20 + or $13,$19 + addu $17,$13,$3 # 5 + srl $3,$31,6 + xor $20,$1,$2 + sll $19,$31,7 + and $20,$31 + srl $18,$31,11 + xor $3,$19 + sll $19,$31,21 + xor $3,$18 + srl $18,$31,25 + xor $3,$19 + sll $19,$31,26 + xor $3,$18 + xor $20,$2 # Ch(e,f,g) + xor $18,$19,$3 # Sigma1(e) + + srl $3,$7,2 + addu $17,$20 + lw $20,20($6) # K[5] + sll $19,$7,10 + addu $17,$18 + srl $18,$7,13 + xor $3,$19 + sll $19,$7,19 + xor $3,$18 + srl $18,$7,22 + xor $3,$19 + sll $19,$7,30 + xor $3,$18 + sw $13,20($29) # offload to ring buffer + xor $3,$19 # Sigma0(a) + + or $18,$7,$24 + and $19,$7,$24 + and $18,$25 + or $19,$18 # Maj(a,b,c) + addu $17,$20 # +=K[5] + addu $3,$19 + + addu $30,$17 + addu $3,$17 + lwl $15,31($5) + lwr $15,28($5) + srl $19,$14,24 # byte swap(6) + srl $20,$14,8 + andi $21,$14,0xFF00 + sll $14,$14,24 + andi $20,0xFF00 + sll $21,$21,8 + or $14,$19 + or $20,$21 + or $14,$20 + addu $18,$14,$2 # 6 + srl $2,$30,6 + xor $21,$31,$1 + sll $20,$30,7 + and $21,$30 + srl $19,$30,11 + xor $2,$20 + sll $20,$30,21 + xor $2,$19 + srl $19,$30,25 + xor $2,$20 + sll $20,$30,26 + xor $2,$19 + xor $21,$1 # Ch(e,f,g) + xor $19,$20,$2 # Sigma1(e) + + srl $2,$3,2 + addu $18,$21 + lw $21,24($6) # K[6] + sll $20,$3,10 + addu $18,$19 + srl $19,$3,13 + xor $2,$20 + sll $20,$3,19 + xor $2,$19 + srl $19,$3,22 + xor $2,$20 + sll $20,$3,30 + xor $2,$19 + sw $14,24($29) # offload to ring buffer + xor $2,$20 # Sigma0(a) + + or $19,$3,$7 + and $20,$3,$7 + and $19,$24 + or $20,$19 # Maj(a,b,c) + addu $18,$21 # +=K[6] + addu $2,$20 + + addu $25,$18 + addu $2,$18 + lwl $16,35($5) + lwr $16,32($5) + srl $20,$15,24 # byte swap(7) + srl $21,$15,8 + andi $22,$15,0xFF00 + sll $15,$15,24 + andi $21,0xFF00 + sll $22,$22,8 + or $15,$20 + or $21,$22 + or $15,$21 + addu $19,$15,$1 # 7 + srl $1,$25,6 + xor $22,$30,$31 + sll $21,$25,7 + and $22,$25 + srl $20,$25,11 + xor $1,$21 + sll $21,$25,21 + xor $1,$20 + srl $20,$25,25 + xor $1,$21 + sll $21,$25,26 + xor $1,$20 + xor $22,$31 # Ch(e,f,g) + xor $20,$21,$1 # Sigma1(e) + + srl $1,$2,2 + addu $19,$22 + lw $22,28($6) # K[7] + sll $21,$2,10 + addu $19,$20 + srl $20,$2,13 + xor $1,$21 + sll $21,$2,19 + xor $1,$20 + srl $20,$2,22 + xor $1,$21 + sll $21,$2,30 + xor $1,$20 + sw $15,28($29) # offload to ring buffer + xor $1,$21 # Sigma0(a) + + or $20,$2,$3 + and $21,$2,$3 + and $20,$7 + or $21,$20 # Maj(a,b,c) + addu $19,$22 # +=K[7] + addu $1,$21 + + addu $24,$19 + addu $1,$19 + lwl $17,39($5) + lwr $17,36($5) + srl $21,$16,24 # byte swap(8) + srl $22,$16,8 + andi $23,$16,0xFF00 + sll $16,$16,24 + andi $22,0xFF00 + sll $23,$23,8 + or $16,$21 + or $22,$23 + or $16,$22 + addu $20,$16,$31 # 8 + srl $31,$24,6 + xor $23,$25,$30 + sll $22,$24,7 + and $23,$24 + srl $21,$24,11 + xor $31,$22 + sll $22,$24,21 + xor $31,$21 + srl $21,$24,25 + xor $31,$22 + sll $22,$24,26 + xor $31,$21 + xor $23,$30 # Ch(e,f,g) + xor $21,$22,$31 # Sigma1(e) + + srl $31,$1,2 + addu $20,$23 + lw $23,32($6) # K[8] + sll $22,$1,10 + addu $20,$21 + srl $21,$1,13 + xor $31,$22 + sll $22,$1,19 + xor $31,$21 + srl $21,$1,22 + xor $31,$22 + sll $22,$1,30 + xor $31,$21 + sw $16,32($29) # offload to ring buffer + xor $31,$22 # Sigma0(a) + + or $21,$1,$2 + and $22,$1,$2 + and $21,$3 + or $22,$21 # Maj(a,b,c) + addu $20,$23 # +=K[8] + addu $31,$22 + + addu $7,$20 + addu $31,$20 + lwl $18,43($5) + lwr $18,40($5) + srl $22,$17,24 # byte swap(9) + srl $23,$17,8 + andi $8,$17,0xFF00 + sll $17,$17,24 + andi $23,0xFF00 + sll $8,$8,8 + or $17,$22 + or $23,$8 + or $17,$23 + addu $21,$17,$30 # 9 + srl $30,$7,6 + xor $8,$24,$25 + sll $23,$7,7 + and $8,$7 + srl $22,$7,11 + xor $30,$23 + sll $23,$7,21 + xor $30,$22 + srl $22,$7,25 + xor $30,$23 + sll $23,$7,26 + xor $30,$22 + xor $8,$25 # Ch(e,f,g) + xor $22,$23,$30 # Sigma1(e) + + srl $30,$31,2 + addu $21,$8 + lw $8,36($6) # K[9] + sll $23,$31,10 + addu $21,$22 + srl $22,$31,13 + xor $30,$23 + sll $23,$31,19 + xor $30,$22 + srl $22,$31,22 + xor $30,$23 + sll $23,$31,30 + xor $30,$22 + sw $17,36($29) # offload to ring buffer + xor $30,$23 # Sigma0(a) + + or $22,$31,$1 + and $23,$31,$1 + and $22,$2 + or $23,$22 # Maj(a,b,c) + addu $21,$8 # +=K[9] + addu $30,$23 + + addu $3,$21 + addu $30,$21 + lwl $19,47($5) + lwr $19,44($5) + srl $23,$18,24 # byte swap(10) + srl $8,$18,8 + andi $9,$18,0xFF00 + sll $18,$18,24 + andi $8,0xFF00 + sll $9,$9,8 + or $18,$23 + or $8,$9 + or $18,$8 + addu $22,$18,$25 # 10 + srl $25,$3,6 + xor $9,$7,$24 + sll $8,$3,7 + and $9,$3 + srl $23,$3,11 + xor $25,$8 + sll $8,$3,21 + xor $25,$23 + srl $23,$3,25 + xor $25,$8 + sll $8,$3,26 + xor $25,$23 + xor $9,$24 # Ch(e,f,g) + xor $23,$8,$25 # Sigma1(e) + + srl $25,$30,2 + addu $22,$9 + lw $9,40($6) # K[10] + sll $8,$30,10 + addu $22,$23 + srl $23,$30,13 + xor $25,$8 + sll $8,$30,19 + xor $25,$23 + srl $23,$30,22 + xor $25,$8 + sll $8,$30,30 + xor $25,$23 + sw $18,40($29) # offload to ring buffer + xor $25,$8 # Sigma0(a) + + or $23,$30,$31 + and $8,$30,$31 + and $23,$1 + or $8,$23 # Maj(a,b,c) + addu $22,$9 # +=K[10] + addu $25,$8 + + addu $2,$22 + addu $25,$22 + lwl $20,51($5) + lwr $20,48($5) + srl $8,$19,24 # byte swap(11) + srl $9,$19,8 + andi $10,$19,0xFF00 + sll $19,$19,24 + andi $9,0xFF00 + sll $10,$10,8 + or $19,$8 + or $9,$10 + or $19,$9 + addu $23,$19,$24 # 11 + srl $24,$2,6 + xor $10,$3,$7 + sll $9,$2,7 + and $10,$2 + srl $8,$2,11 + xor $24,$9 + sll $9,$2,21 + xor $24,$8 + srl $8,$2,25 + xor $24,$9 + sll $9,$2,26 + xor $24,$8 + xor $10,$7 # Ch(e,f,g) + xor $8,$9,$24 # Sigma1(e) + + srl $24,$25,2 + addu $23,$10 + lw $10,44($6) # K[11] + sll $9,$25,10 + addu $23,$8 + srl $8,$25,13 + xor $24,$9 + sll $9,$25,19 + xor $24,$8 + srl $8,$25,22 + xor $24,$9 + sll $9,$25,30 + xor $24,$8 + sw $19,44($29) # offload to ring buffer + xor $24,$9 # Sigma0(a) + + or $8,$25,$30 + and $9,$25,$30 + and $8,$31 + or $9,$8 # Maj(a,b,c) + addu $23,$10 # +=K[11] + addu $24,$9 + + addu $1,$23 + addu $24,$23 + lwl $21,55($5) + lwr $21,52($5) + srl $9,$20,24 # byte swap(12) + srl $10,$20,8 + andi $11,$20,0xFF00 + sll $20,$20,24 + andi $10,0xFF00 + sll $11,$11,8 + or $20,$9 + or $10,$11 + or $20,$10 + addu $8,$20,$7 # 12 + srl $7,$1,6 + xor $11,$2,$3 + sll $10,$1,7 + and $11,$1 + srl $9,$1,11 + xor $7,$10 + sll $10,$1,21 + xor $7,$9 + srl $9,$1,25 + xor $7,$10 + sll $10,$1,26 + xor $7,$9 + xor $11,$3 # Ch(e,f,g) + xor $9,$10,$7 # Sigma1(e) + + srl $7,$24,2 + addu $8,$11 + lw $11,48($6) # K[12] + sll $10,$24,10 + addu $8,$9 + srl $9,$24,13 + xor $7,$10 + sll $10,$24,19 + xor $7,$9 + srl $9,$24,22 + xor $7,$10 + sll $10,$24,30 + xor $7,$9 + sw $20,48($29) # offload to ring buffer + xor $7,$10 # Sigma0(a) + + or $9,$24,$25 + and $10,$24,$25 + and $9,$30 + or $10,$9 # Maj(a,b,c) + addu $8,$11 # +=K[12] + addu $7,$10 + + addu $31,$8 + addu $7,$8 + lwl $22,59($5) + lwr $22,56($5) + srl $10,$21,24 # byte swap(13) + srl $11,$21,8 + andi $12,$21,0xFF00 + sll $21,$21,24 + andi $11,0xFF00 + sll $12,$12,8 + or $21,$10 + or $11,$12 + or $21,$11 + addu $9,$21,$3 # 13 + srl $3,$31,6 + xor $12,$1,$2 + sll $11,$31,7 + and $12,$31 + srl $10,$31,11 + xor $3,$11 + sll $11,$31,21 + xor $3,$10 + srl $10,$31,25 + xor $3,$11 + sll $11,$31,26 + xor $3,$10 + xor $12,$2 # Ch(e,f,g) + xor $10,$11,$3 # Sigma1(e) + + srl $3,$7,2 + addu $9,$12 + lw $12,52($6) # K[13] + sll $11,$7,10 + addu $9,$10 + srl $10,$7,13 + xor $3,$11 + sll $11,$7,19 + xor $3,$10 + srl $10,$7,22 + xor $3,$11 + sll $11,$7,30 + xor $3,$10 + sw $21,52($29) # offload to ring buffer + xor $3,$11 # Sigma0(a) + + or $10,$7,$24 + and $11,$7,$24 + and $10,$25 + or $11,$10 # Maj(a,b,c) + addu $9,$12 # +=K[13] + addu $3,$11 + + addu $30,$9 + addu $3,$9 + lw $8,0($29) # prefetch from ring buffer + lwl $23,63($5) + lwr $23,60($5) + srl $11,$22,24 # byte swap(14) + srl $12,$22,8 + andi $13,$22,0xFF00 + sll $22,$22,24 + andi $12,0xFF00 + sll $13,$13,8 + or $22,$11 + or $12,$13 + or $22,$12 + addu $10,$22,$2 # 14 + srl $2,$30,6 + xor $13,$31,$1 + sll $12,$30,7 + and $13,$30 + srl $11,$30,11 + xor $2,$12 + sll $12,$30,21 + xor $2,$11 + srl $11,$30,25 + xor $2,$12 + sll $12,$30,26 + xor $2,$11 + xor $13,$1 # Ch(e,f,g) + xor $11,$12,$2 # Sigma1(e) + + srl $2,$3,2 + addu $10,$13 + lw $13,56($6) # K[14] + sll $12,$3,10 + addu $10,$11 + srl $11,$3,13 + xor $2,$12 + sll $12,$3,19 + xor $2,$11 + srl $11,$3,22 + xor $2,$12 + sll $12,$3,30 + xor $2,$11 + sw $22,56($29) # offload to ring buffer + xor $2,$12 # Sigma0(a) + + or $11,$3,$7 + and $12,$3,$7 + and $11,$24 + or $12,$11 # Maj(a,b,c) + addu $10,$13 # +=K[14] + addu $2,$12 + + addu $25,$10 + addu $2,$10 + lw $9,4($29) # prefetch from ring buffer + srl $12,$23,24 # byte swap(15) + srl $13,$23,8 + andi $14,$23,0xFF00 + sll $23,$23,24 + andi $13,0xFF00 + sll $14,$14,8 + or $23,$12 + or $13,$14 + or $23,$13 + addu $11,$23,$1 # 15 + srl $1,$25,6 + xor $14,$30,$31 + sll $13,$25,7 + and $14,$25 + srl $12,$25,11 + xor $1,$13 + sll $13,$25,21 + xor $1,$12 + srl $12,$25,25 + xor $1,$13 + sll $13,$25,26 + xor $1,$12 + xor $14,$31 # Ch(e,f,g) + xor $12,$13,$1 # Sigma1(e) + + srl $1,$2,2 + addu $11,$14 + lw $14,60($6) # K[15] + sll $13,$2,10 + addu $11,$12 + srl $12,$2,13 + xor $1,$13 + sll $13,$2,19 + xor $1,$12 + srl $12,$2,22 + xor $1,$13 + sll $13,$2,30 + xor $1,$12 + sw $23,60($29) # offload to ring buffer + xor $1,$13 # Sigma0(a) + + or $12,$2,$3 + and $13,$2,$3 + and $12,$7 + or $13,$12 # Maj(a,b,c) + addu $11,$14 # +=K[15] + addu $1,$13 + + addu $24,$11 + addu $1,$11 + lw $10,8($29) # prefetch from ring buffer + b .L16_xx +.align 4 +.L16_xx: + srl $14,$9,3 # Xupdate(16) + addu $8,$17 # +=X[i+9] + sll $13,$9,14 + srl $12,$9,7 + xor $14,$13 + sll $13,11 + xor $14,$12 + srl $12,$9,18 + xor $14,$13 + + srl $15,$22,10 + xor $14,$12 # sigma0(X[i+1]) + sll $13,$22,13 + addu $8,$14 + srl $12,$22,17 + xor $15,$13 + sll $13,2 + xor $15,$12 + srl $12,$22,19 + xor $15,$13 + + xor $15,$12 # sigma1(X[i+14]) + addu $8,$15 + addu $12,$8,$31 # 16 + srl $31,$24,6 + xor $15,$25,$30 + sll $14,$24,7 + and $15,$24 + srl $13,$24,11 + xor $31,$14 + sll $14,$24,21 + xor $31,$13 + srl $13,$24,25 + xor $31,$14 + sll $14,$24,26 + xor $31,$13 + xor $15,$30 # Ch(e,f,g) + xor $13,$14,$31 # Sigma1(e) + + srl $31,$1,2 + addu $12,$15 + lw $15,64($6) # K[16] + sll $14,$1,10 + addu $12,$13 + srl $13,$1,13 + xor $31,$14 + sll $14,$1,19 + xor $31,$13 + srl $13,$1,22 + xor $31,$14 + sll $14,$1,30 + xor $31,$13 + sw $8,0($29) # offload to ring buffer + xor $31,$14 # Sigma0(a) + + or $13,$1,$2 + and $14,$1,$2 + and $13,$3 + or $14,$13 # Maj(a,b,c) + addu $12,$15 # +=K[16] + addu $31,$14 + + addu $7,$12 + addu $31,$12 + lw $11,12($29) # prefetch from ring buffer + srl $15,$10,3 # Xupdate(17) + addu $9,$18 # +=X[i+9] + sll $14,$10,14 + srl $13,$10,7 + xor $15,$14 + sll $14,11 + xor $15,$13 + srl $13,$10,18 + xor $15,$14 + + srl $16,$23,10 + xor $15,$13 # sigma0(X[i+1]) + sll $14,$23,13 + addu $9,$15 + srl $13,$23,17 + xor $16,$14 + sll $14,2 + xor $16,$13 + srl $13,$23,19 + xor $16,$14 + + xor $16,$13 # sigma1(X[i+14]) + addu $9,$16 + addu $13,$9,$30 # 17 + srl $30,$7,6 + xor $16,$24,$25 + sll $15,$7,7 + and $16,$7 + srl $14,$7,11 + xor $30,$15 + sll $15,$7,21 + xor $30,$14 + srl $14,$7,25 + xor $30,$15 + sll $15,$7,26 + xor $30,$14 + xor $16,$25 # Ch(e,f,g) + xor $14,$15,$30 # Sigma1(e) + + srl $30,$31,2 + addu $13,$16 + lw $16,68($6) # K[17] + sll $15,$31,10 + addu $13,$14 + srl $14,$31,13 + xor $30,$15 + sll $15,$31,19 + xor $30,$14 + srl $14,$31,22 + xor $30,$15 + sll $15,$31,30 + xor $30,$14 + sw $9,4($29) # offload to ring buffer + xor $30,$15 # Sigma0(a) + + or $14,$31,$1 + and $15,$31,$1 + and $14,$2 + or $15,$14 # Maj(a,b,c) + addu $13,$16 # +=K[17] + addu $30,$15 + + addu $3,$13 + addu $30,$13 + lw $12,16($29) # prefetch from ring buffer + srl $16,$11,3 # Xupdate(18) + addu $10,$19 # +=X[i+9] + sll $15,$11,14 + srl $14,$11,7 + xor $16,$15 + sll $15,11 + xor $16,$14 + srl $14,$11,18 + xor $16,$15 + + srl $17,$8,10 + xor $16,$14 # sigma0(X[i+1]) + sll $15,$8,13 + addu $10,$16 + srl $14,$8,17 + xor $17,$15 + sll $15,2 + xor $17,$14 + srl $14,$8,19 + xor $17,$15 + + xor $17,$14 # sigma1(X[i+14]) + addu $10,$17 + addu $14,$10,$25 # 18 + srl $25,$3,6 + xor $17,$7,$24 + sll $16,$3,7 + and $17,$3 + srl $15,$3,11 + xor $25,$16 + sll $16,$3,21 + xor $25,$15 + srl $15,$3,25 + xor $25,$16 + sll $16,$3,26 + xor $25,$15 + xor $17,$24 # Ch(e,f,g) + xor $15,$16,$25 # Sigma1(e) + + srl $25,$30,2 + addu $14,$17 + lw $17,72($6) # K[18] + sll $16,$30,10 + addu $14,$15 + srl $15,$30,13 + xor $25,$16 + sll $16,$30,19 + xor $25,$15 + srl $15,$30,22 + xor $25,$16 + sll $16,$30,30 + xor $25,$15 + sw $10,8($29) # offload to ring buffer + xor $25,$16 # Sigma0(a) + + or $15,$30,$31 + and $16,$30,$31 + and $15,$1 + or $16,$15 # Maj(a,b,c) + addu $14,$17 # +=K[18] + addu $25,$16 + + addu $2,$14 + addu $25,$14 + lw $13,20($29) # prefetch from ring buffer + srl $17,$12,3 # Xupdate(19) + addu $11,$20 # +=X[i+9] + sll $16,$12,14 + srl $15,$12,7 + xor $17,$16 + sll $16,11 + xor $17,$15 + srl $15,$12,18 + xor $17,$16 + + srl $18,$9,10 + xor $17,$15 # sigma0(X[i+1]) + sll $16,$9,13 + addu $11,$17 + srl $15,$9,17 + xor $18,$16 + sll $16,2 + xor $18,$15 + srl $15,$9,19 + xor $18,$16 + + xor $18,$15 # sigma1(X[i+14]) + addu $11,$18 + addu $15,$11,$24 # 19 + srl $24,$2,6 + xor $18,$3,$7 + sll $17,$2,7 + and $18,$2 + srl $16,$2,11 + xor $24,$17 + sll $17,$2,21 + xor $24,$16 + srl $16,$2,25 + xor $24,$17 + sll $17,$2,26 + xor $24,$16 + xor $18,$7 # Ch(e,f,g) + xor $16,$17,$24 # Sigma1(e) + + srl $24,$25,2 + addu $15,$18 + lw $18,76($6) # K[19] + sll $17,$25,10 + addu $15,$16 + srl $16,$25,13 + xor $24,$17 + sll $17,$25,19 + xor $24,$16 + srl $16,$25,22 + xor $24,$17 + sll $17,$25,30 + xor $24,$16 + sw $11,12($29) # offload to ring buffer + xor $24,$17 # Sigma0(a) + + or $16,$25,$30 + and $17,$25,$30 + and $16,$31 + or $17,$16 # Maj(a,b,c) + addu $15,$18 # +=K[19] + addu $24,$17 + + addu $1,$15 + addu $24,$15 + lw $14,24($29) # prefetch from ring buffer + srl $18,$13,3 # Xupdate(20) + addu $12,$21 # +=X[i+9] + sll $17,$13,14 + srl $16,$13,7 + xor $18,$17 + sll $17,11 + xor $18,$16 + srl $16,$13,18 + xor $18,$17 + + srl $19,$10,10 + xor $18,$16 # sigma0(X[i+1]) + sll $17,$10,13 + addu $12,$18 + srl $16,$10,17 + xor $19,$17 + sll $17,2 + xor $19,$16 + srl $16,$10,19 + xor $19,$17 + + xor $19,$16 # sigma1(X[i+14]) + addu $12,$19 + addu $16,$12,$7 # 20 + srl $7,$1,6 + xor $19,$2,$3 + sll $18,$1,7 + and $19,$1 + srl $17,$1,11 + xor $7,$18 + sll $18,$1,21 + xor $7,$17 + srl $17,$1,25 + xor $7,$18 + sll $18,$1,26 + xor $7,$17 + xor $19,$3 # Ch(e,f,g) + xor $17,$18,$7 # Sigma1(e) + + srl $7,$24,2 + addu $16,$19 + lw $19,80($6) # K[20] + sll $18,$24,10 + addu $16,$17 + srl $17,$24,13 + xor $7,$18 + sll $18,$24,19 + xor $7,$17 + srl $17,$24,22 + xor $7,$18 + sll $18,$24,30 + xor $7,$17 + sw $12,16($29) # offload to ring buffer + xor $7,$18 # Sigma0(a) + + or $17,$24,$25 + and $18,$24,$25 + and $17,$30 + or $18,$17 # Maj(a,b,c) + addu $16,$19 # +=K[20] + addu $7,$18 + + addu $31,$16 + addu $7,$16 + lw $15,28($29) # prefetch from ring buffer + srl $19,$14,3 # Xupdate(21) + addu $13,$22 # +=X[i+9] + sll $18,$14,14 + srl $17,$14,7 + xor $19,$18 + sll $18,11 + xor $19,$17 + srl $17,$14,18 + xor $19,$18 + + srl $20,$11,10 + xor $19,$17 # sigma0(X[i+1]) + sll $18,$11,13 + addu $13,$19 + srl $17,$11,17 + xor $20,$18 + sll $18,2 + xor $20,$17 + srl $17,$11,19 + xor $20,$18 + + xor $20,$17 # sigma1(X[i+14]) + addu $13,$20 + addu $17,$13,$3 # 21 + srl $3,$31,6 + xor $20,$1,$2 + sll $19,$31,7 + and $20,$31 + srl $18,$31,11 + xor $3,$19 + sll $19,$31,21 + xor $3,$18 + srl $18,$31,25 + xor $3,$19 + sll $19,$31,26 + xor $3,$18 + xor $20,$2 # Ch(e,f,g) + xor $18,$19,$3 # Sigma1(e) + + srl $3,$7,2 + addu $17,$20 + lw $20,84($6) # K[21] + sll $19,$7,10 + addu $17,$18 + srl $18,$7,13 + xor $3,$19 + sll $19,$7,19 + xor $3,$18 + srl $18,$7,22 + xor $3,$19 + sll $19,$7,30 + xor $3,$18 + sw $13,20($29) # offload to ring buffer + xor $3,$19 # Sigma0(a) + + or $18,$7,$24 + and $19,$7,$24 + and $18,$25 + or $19,$18 # Maj(a,b,c) + addu $17,$20 # +=K[21] + addu $3,$19 + + addu $30,$17 + addu $3,$17 + lw $16,32($29) # prefetch from ring buffer + srl $20,$15,3 # Xupdate(22) + addu $14,$23 # +=X[i+9] + sll $19,$15,14 + srl $18,$15,7 + xor $20,$19 + sll $19,11 + xor $20,$18 + srl $18,$15,18 + xor $20,$19 + + srl $21,$12,10 + xor $20,$18 # sigma0(X[i+1]) + sll $19,$12,13 + addu $14,$20 + srl $18,$12,17 + xor $21,$19 + sll $19,2 + xor $21,$18 + srl $18,$12,19 + xor $21,$19 + + xor $21,$18 # sigma1(X[i+14]) + addu $14,$21 + addu $18,$14,$2 # 22 + srl $2,$30,6 + xor $21,$31,$1 + sll $20,$30,7 + and $21,$30 + srl $19,$30,11 + xor $2,$20 + sll $20,$30,21 + xor $2,$19 + srl $19,$30,25 + xor $2,$20 + sll $20,$30,26 + xor $2,$19 + xor $21,$1 # Ch(e,f,g) + xor $19,$20,$2 # Sigma1(e) + + srl $2,$3,2 + addu $18,$21 + lw $21,88($6) # K[22] + sll $20,$3,10 + addu $18,$19 + srl $19,$3,13 + xor $2,$20 + sll $20,$3,19 + xor $2,$19 + srl $19,$3,22 + xor $2,$20 + sll $20,$3,30 + xor $2,$19 + sw $14,24($29) # offload to ring buffer + xor $2,$20 # Sigma0(a) + + or $19,$3,$7 + and $20,$3,$7 + and $19,$24 + or $20,$19 # Maj(a,b,c) + addu $18,$21 # +=K[22] + addu $2,$20 + + addu $25,$18 + addu $2,$18 + lw $17,36($29) # prefetch from ring buffer + srl $21,$16,3 # Xupdate(23) + addu $15,$8 # +=X[i+9] + sll $20,$16,14 + srl $19,$16,7 + xor $21,$20 + sll $20,11 + xor $21,$19 + srl $19,$16,18 + xor $21,$20 + + srl $22,$13,10 + xor $21,$19 # sigma0(X[i+1]) + sll $20,$13,13 + addu $15,$21 + srl $19,$13,17 + xor $22,$20 + sll $20,2 + xor $22,$19 + srl $19,$13,19 + xor $22,$20 + + xor $22,$19 # sigma1(X[i+14]) + addu $15,$22 + addu $19,$15,$1 # 23 + srl $1,$25,6 + xor $22,$30,$31 + sll $21,$25,7 + and $22,$25 + srl $20,$25,11 + xor $1,$21 + sll $21,$25,21 + xor $1,$20 + srl $20,$25,25 + xor $1,$21 + sll $21,$25,26 + xor $1,$20 + xor $22,$31 # Ch(e,f,g) + xor $20,$21,$1 # Sigma1(e) + + srl $1,$2,2 + addu $19,$22 + lw $22,92($6) # K[23] + sll $21,$2,10 + addu $19,$20 + srl $20,$2,13 + xor $1,$21 + sll $21,$2,19 + xor $1,$20 + srl $20,$2,22 + xor $1,$21 + sll $21,$2,30 + xor $1,$20 + sw $15,28($29) # offload to ring buffer + xor $1,$21 # Sigma0(a) + + or $20,$2,$3 + and $21,$2,$3 + and $20,$7 + or $21,$20 # Maj(a,b,c) + addu $19,$22 # +=K[23] + addu $1,$21 + + addu $24,$19 + addu $1,$19 + lw $18,40($29) # prefetch from ring buffer + srl $22,$17,3 # Xupdate(24) + addu $16,$9 # +=X[i+9] + sll $21,$17,14 + srl $20,$17,7 + xor $22,$21 + sll $21,11 + xor $22,$20 + srl $20,$17,18 + xor $22,$21 + + srl $23,$14,10 + xor $22,$20 # sigma0(X[i+1]) + sll $21,$14,13 + addu $16,$22 + srl $20,$14,17 + xor $23,$21 + sll $21,2 + xor $23,$20 + srl $20,$14,19 + xor $23,$21 + + xor $23,$20 # sigma1(X[i+14]) + addu $16,$23 + addu $20,$16,$31 # 24 + srl $31,$24,6 + xor $23,$25,$30 + sll $22,$24,7 + and $23,$24 + srl $21,$24,11 + xor $31,$22 + sll $22,$24,21 + xor $31,$21 + srl $21,$24,25 + xor $31,$22 + sll $22,$24,26 + xor $31,$21 + xor $23,$30 # Ch(e,f,g) + xor $21,$22,$31 # Sigma1(e) + + srl $31,$1,2 + addu $20,$23 + lw $23,96($6) # K[24] + sll $22,$1,10 + addu $20,$21 + srl $21,$1,13 + xor $31,$22 + sll $22,$1,19 + xor $31,$21 + srl $21,$1,22 + xor $31,$22 + sll $22,$1,30 + xor $31,$21 + sw $16,32($29) # offload to ring buffer + xor $31,$22 # Sigma0(a) + + or $21,$1,$2 + and $22,$1,$2 + and $21,$3 + or $22,$21 # Maj(a,b,c) + addu $20,$23 # +=K[24] + addu $31,$22 + + addu $7,$20 + addu $31,$20 + lw $19,44($29) # prefetch from ring buffer + srl $23,$18,3 # Xupdate(25) + addu $17,$10 # +=X[i+9] + sll $22,$18,14 + srl $21,$18,7 + xor $23,$22 + sll $22,11 + xor $23,$21 + srl $21,$18,18 + xor $23,$22 + + srl $8,$15,10 + xor $23,$21 # sigma0(X[i+1]) + sll $22,$15,13 + addu $17,$23 + srl $21,$15,17 + xor $8,$22 + sll $22,2 + xor $8,$21 + srl $21,$15,19 + xor $8,$22 + + xor $8,$21 # sigma1(X[i+14]) + addu $17,$8 + addu $21,$17,$30 # 25 + srl $30,$7,6 + xor $8,$24,$25 + sll $23,$7,7 + and $8,$7 + srl $22,$7,11 + xor $30,$23 + sll $23,$7,21 + xor $30,$22 + srl $22,$7,25 + xor $30,$23 + sll $23,$7,26 + xor $30,$22 + xor $8,$25 # Ch(e,f,g) + xor $22,$23,$30 # Sigma1(e) + + srl $30,$31,2 + addu $21,$8 + lw $8,100($6) # K[25] + sll $23,$31,10 + addu $21,$22 + srl $22,$31,13 + xor $30,$23 + sll $23,$31,19 + xor $30,$22 + srl $22,$31,22 + xor $30,$23 + sll $23,$31,30 + xor $30,$22 + sw $17,36($29) # offload to ring buffer + xor $30,$23 # Sigma0(a) + + or $22,$31,$1 + and $23,$31,$1 + and $22,$2 + or $23,$22 # Maj(a,b,c) + addu $21,$8 # +=K[25] + addu $30,$23 + + addu $3,$21 + addu $30,$21 + lw $20,48($29) # prefetch from ring buffer + srl $8,$19,3 # Xupdate(26) + addu $18,$11 # +=X[i+9] + sll $23,$19,14 + srl $22,$19,7 + xor $8,$23 + sll $23,11 + xor $8,$22 + srl $22,$19,18 + xor $8,$23 + + srl $9,$16,10 + xor $8,$22 # sigma0(X[i+1]) + sll $23,$16,13 + addu $18,$8 + srl $22,$16,17 + xor $9,$23 + sll $23,2 + xor $9,$22 + srl $22,$16,19 + xor $9,$23 + + xor $9,$22 # sigma1(X[i+14]) + addu $18,$9 + addu $22,$18,$25 # 26 + srl $25,$3,6 + xor $9,$7,$24 + sll $8,$3,7 + and $9,$3 + srl $23,$3,11 + xor $25,$8 + sll $8,$3,21 + xor $25,$23 + srl $23,$3,25 + xor $25,$8 + sll $8,$3,26 + xor $25,$23 + xor $9,$24 # Ch(e,f,g) + xor $23,$8,$25 # Sigma1(e) + + srl $25,$30,2 + addu $22,$9 + lw $9,104($6) # K[26] + sll $8,$30,10 + addu $22,$23 + srl $23,$30,13 + xor $25,$8 + sll $8,$30,19 + xor $25,$23 + srl $23,$30,22 + xor $25,$8 + sll $8,$30,30 + xor $25,$23 + sw $18,40($29) # offload to ring buffer + xor $25,$8 # Sigma0(a) + + or $23,$30,$31 + and $8,$30,$31 + and $23,$1 + or $8,$23 # Maj(a,b,c) + addu $22,$9 # +=K[26] + addu $25,$8 + + addu $2,$22 + addu $25,$22 + lw $21,52($29) # prefetch from ring buffer + srl $9,$20,3 # Xupdate(27) + addu $19,$12 # +=X[i+9] + sll $8,$20,14 + srl $23,$20,7 + xor $9,$8 + sll $8,11 + xor $9,$23 + srl $23,$20,18 + xor $9,$8 + + srl $10,$17,10 + xor $9,$23 # sigma0(X[i+1]) + sll $8,$17,13 + addu $19,$9 + srl $23,$17,17 + xor $10,$8 + sll $8,2 + xor $10,$23 + srl $23,$17,19 + xor $10,$8 + + xor $10,$23 # sigma1(X[i+14]) + addu $19,$10 + addu $23,$19,$24 # 27 + srl $24,$2,6 + xor $10,$3,$7 + sll $9,$2,7 + and $10,$2 + srl $8,$2,11 + xor $24,$9 + sll $9,$2,21 + xor $24,$8 + srl $8,$2,25 + xor $24,$9 + sll $9,$2,26 + xor $24,$8 + xor $10,$7 # Ch(e,f,g) + xor $8,$9,$24 # Sigma1(e) + + srl $24,$25,2 + addu $23,$10 + lw $10,108($6) # K[27] + sll $9,$25,10 + addu $23,$8 + srl $8,$25,13 + xor $24,$9 + sll $9,$25,19 + xor $24,$8 + srl $8,$25,22 + xor $24,$9 + sll $9,$25,30 + xor $24,$8 + sw $19,44($29) # offload to ring buffer + xor $24,$9 # Sigma0(a) + + or $8,$25,$30 + and $9,$25,$30 + and $8,$31 + or $9,$8 # Maj(a,b,c) + addu $23,$10 # +=K[27] + addu $24,$9 + + addu $1,$23 + addu $24,$23 + lw $22,56($29) # prefetch from ring buffer + srl $10,$21,3 # Xupdate(28) + addu $20,$13 # +=X[i+9] + sll $9,$21,14 + srl $8,$21,7 + xor $10,$9 + sll $9,11 + xor $10,$8 + srl $8,$21,18 + xor $10,$9 + + srl $11,$18,10 + xor $10,$8 # sigma0(X[i+1]) + sll $9,$18,13 + addu $20,$10 + srl $8,$18,17 + xor $11,$9 + sll $9,2 + xor $11,$8 + srl $8,$18,19 + xor $11,$9 + + xor $11,$8 # sigma1(X[i+14]) + addu $20,$11 + addu $8,$20,$7 # 28 + srl $7,$1,6 + xor $11,$2,$3 + sll $10,$1,7 + and $11,$1 + srl $9,$1,11 + xor $7,$10 + sll $10,$1,21 + xor $7,$9 + srl $9,$1,25 + xor $7,$10 + sll $10,$1,26 + xor $7,$9 + xor $11,$3 # Ch(e,f,g) + xor $9,$10,$7 # Sigma1(e) + + srl $7,$24,2 + addu $8,$11 + lw $11,112($6) # K[28] + sll $10,$24,10 + addu $8,$9 + srl $9,$24,13 + xor $7,$10 + sll $10,$24,19 + xor $7,$9 + srl $9,$24,22 + xor $7,$10 + sll $10,$24,30 + xor $7,$9 + sw $20,48($29) # offload to ring buffer + xor $7,$10 # Sigma0(a) + + or $9,$24,$25 + and $10,$24,$25 + and $9,$30 + or $10,$9 # Maj(a,b,c) + addu $8,$11 # +=K[28] + addu $7,$10 + + addu $31,$8 + addu $7,$8 + lw $23,60($29) # prefetch from ring buffer + srl $11,$22,3 # Xupdate(29) + addu $21,$14 # +=X[i+9] + sll $10,$22,14 + srl $9,$22,7 + xor $11,$10 + sll $10,11 + xor $11,$9 + srl $9,$22,18 + xor $11,$10 + + srl $12,$19,10 + xor $11,$9 # sigma0(X[i+1]) + sll $10,$19,13 + addu $21,$11 + srl $9,$19,17 + xor $12,$10 + sll $10,2 + xor $12,$9 + srl $9,$19,19 + xor $12,$10 + + xor $12,$9 # sigma1(X[i+14]) + addu $21,$12 + addu $9,$21,$3 # 29 + srl $3,$31,6 + xor $12,$1,$2 + sll $11,$31,7 + and $12,$31 + srl $10,$31,11 + xor $3,$11 + sll $11,$31,21 + xor $3,$10 + srl $10,$31,25 + xor $3,$11 + sll $11,$31,26 + xor $3,$10 + xor $12,$2 # Ch(e,f,g) + xor $10,$11,$3 # Sigma1(e) + + srl $3,$7,2 + addu $9,$12 + lw $12,116($6) # K[29] + sll $11,$7,10 + addu $9,$10 + srl $10,$7,13 + xor $3,$11 + sll $11,$7,19 + xor $3,$10 + srl $10,$7,22 + xor $3,$11 + sll $11,$7,30 + xor $3,$10 + sw $21,52($29) # offload to ring buffer + xor $3,$11 # Sigma0(a) + + or $10,$7,$24 + and $11,$7,$24 + and $10,$25 + or $11,$10 # Maj(a,b,c) + addu $9,$12 # +=K[29] + addu $3,$11 + + addu $30,$9 + addu $3,$9 + lw $8,0($29) # prefetch from ring buffer + srl $12,$23,3 # Xupdate(30) + addu $22,$15 # +=X[i+9] + sll $11,$23,14 + srl $10,$23,7 + xor $12,$11 + sll $11,11 + xor $12,$10 + srl $10,$23,18 + xor $12,$11 + + srl $13,$20,10 + xor $12,$10 # sigma0(X[i+1]) + sll $11,$20,13 + addu $22,$12 + srl $10,$20,17 + xor $13,$11 + sll $11,2 + xor $13,$10 + srl $10,$20,19 + xor $13,$11 + + xor $13,$10 # sigma1(X[i+14]) + addu $22,$13 + addu $10,$22,$2 # 30 + srl $2,$30,6 + xor $13,$31,$1 + sll $12,$30,7 + and $13,$30 + srl $11,$30,11 + xor $2,$12 + sll $12,$30,21 + xor $2,$11 + srl $11,$30,25 + xor $2,$12 + sll $12,$30,26 + xor $2,$11 + xor $13,$1 # Ch(e,f,g) + xor $11,$12,$2 # Sigma1(e) + + srl $2,$3,2 + addu $10,$13 + lw $13,120($6) # K[30] + sll $12,$3,10 + addu $10,$11 + srl $11,$3,13 + xor $2,$12 + sll $12,$3,19 + xor $2,$11 + srl $11,$3,22 + xor $2,$12 + sll $12,$3,30 + xor $2,$11 + sw $22,56($29) # offload to ring buffer + xor $2,$12 # Sigma0(a) + + or $11,$3,$7 + and $12,$3,$7 + and $11,$24 + or $12,$11 # Maj(a,b,c) + addu $10,$13 # +=K[30] + addu $2,$12 + + addu $25,$10 + addu $2,$10 + lw $9,4($29) # prefetch from ring buffer + srl $13,$8,3 # Xupdate(31) + addu $23,$16 # +=X[i+9] + sll $12,$8,14 + srl $11,$8,7 + xor $13,$12 + sll $12,11 + xor $13,$11 + srl $11,$8,18 + xor $13,$12 + + srl $14,$21,10 + xor $13,$11 # sigma0(X[i+1]) + sll $12,$21,13 + addu $23,$13 + srl $11,$21,17 + xor $14,$12 + sll $12,2 + xor $14,$11 + srl $11,$21,19 + xor $14,$12 + + xor $14,$11 # sigma1(X[i+14]) + addu $23,$14 + addu $11,$23,$1 # 31 + srl $1,$25,6 + xor $14,$30,$31 + sll $13,$25,7 + and $14,$25 + srl $12,$25,11 + xor $1,$13 + sll $13,$25,21 + xor $1,$12 + srl $12,$25,25 + xor $1,$13 + sll $13,$25,26 + xor $1,$12 + xor $14,$31 # Ch(e,f,g) + xor $12,$13,$1 # Sigma1(e) + + srl $1,$2,2 + addu $11,$14 + lw $14,124($6) # K[31] + sll $13,$2,10 + addu $11,$12 + srl $12,$2,13 + xor $1,$13 + sll $13,$2,19 + xor $1,$12 + srl $12,$2,22 + xor $1,$13 + sll $13,$2,30 + xor $1,$12 + sw $23,60($29) # offload to ring buffer + xor $1,$13 # Sigma0(a) + + or $12,$2,$3 + and $13,$2,$3 + and $12,$7 + or $13,$12 # Maj(a,b,c) + addu $11,$14 # +=K[31] + addu $1,$13 + + addu $24,$11 + addu $1,$11 + lw $10,8($29) # prefetch from ring buffer + and $14,0xfff + li $15,2290 + .set noreorder + bne $14,$15,.L16_xx + add $6,16*4 # Ktbl+=16 + + lw $23,16*4($29) # restore pointer to the end of input + lw $8,0*4($4) + lw $9,1*4($4) + lw $10,2*4($4) + add $5,16*4 + lw $11,3*4($4) + addu $1,$8 + lw $12,4*4($4) + addu $2,$9 + lw $13,5*4($4) + addu $3,$10 + lw $14,6*4($4) + addu $7,$11 + lw $15,7*4($4) + addu $24,$12 + sw $1,0*4($4) + addu $25,$13 + sw $2,1*4($4) + addu $30,$14 + sw $3,2*4($4) + addu $31,$15 + sw $7,3*4($4) + sw $24,4*4($4) + sw $25,5*4($4) + sw $30,6*4($4) + sw $31,7*4($4) + + bnel $5,$23,.Loop + sub $6,192 # rewind $6 + + lw $31,128-1*4($29) + lw $30,128-2*4($29) + lw $23,128-3*4($29) + lw $22,128-4*4($29) + lw $21,128-5*4($29) + lw $20,128-6*4($29) + lw $19,128-7*4($29) + lw $18,128-8*4($29) + lw $17,128-9*4($29) + lw $16,128-10*4($29) + jr $31 + add $29,128 +.end sha256_block_data_order + +.rdata +.align 5 +K256: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +.asciiz "SHA256 for MIPS, CRYPTOGAMS by <appro@openssl.org>" +.align 5 + diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl new file mode 100644 index 0000000..ba5b250 --- /dev/null +++ b/crypto/sha/asm/sha512-mips.pl @@ -0,0 +1,455 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA2 block procedures for MIPS. + +# October 2010. +# +# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc- +# generated code in o32 build and ~55% in n32/64 build. SHA512 [which +# for now can only be compiled for MIPS64 ISA] improvement is modest +# ~17%, but it comes for free, because it's same instruction sequence. +# Improvement coefficients are for aligned input. + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +$pf = ($flavour =~ /nubi/i) ? $t0 : $t2; +# +# <appro@openssl.org> +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; + +for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } + +if ($output =~ /512/) { + $label="512"; + $SZ=8; + $LD="ld"; # load from memory + $ST="sd"; # store to memory + $SLL="dsll"; # shift left logical + $SRL="dsrl"; # shift right logical + $ADDU="daddu"; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=( 7, 1, 8); # right shift first + @sigma1=( 6,19,61); # right shift first + $lastK=0x817; + $rounds=80; +} else { + $label="256"; + $SZ=4; + $LD="lw"; # load from memory + $ST="sw"; # store to memory + $SLL="sll"; # shift left logical + $SRL="srl"; # shift right logical + $ADDU="addu"; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 3, 7,18); # right shift first + @sigma1=(10,17,19); # right shift first + $lastK=0x8f2; + $rounds=64; +} + +$MSB = $big_endian ? 0 : ($SZ-1); +$LSB = ($SZ-1)&~$MSB; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31)); +@X=map("\$$_",(8..23)); + +$ctx=$a0; +$inp=$a1; +$len=$a2; $Ktbl=$len; + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]); + +$code.=<<___ if ($i<15); + ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp) + ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp) +___ +$code.=<<___ if (!$big_endian && $i<16 && $SZ==4); + srl $tmp0,@X[0],24 # byte swap($i) + srl $tmp1,@X[0],8 + andi $tmp2,@X[0],0xFF00 + sll @X[0],@X[0],24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or @X[0],$tmp0 + or $tmp1,$tmp2 + or @X[0],$tmp1 +___ +$code.=<<___ if (!$big_endian && $i<16 && $SZ==8); + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + and $tmp1,@X[0],$tmp0 # byte swap($i) + dsrl $tmp2,@X[0],24 + dsll $tmp1,24 + and $tmp2,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + and $tmp2,@X[0],$tmp0 + dsrl @X[0],8 + dsll $tmp2,8 + and @X[0],$tmp0 + or $tmp1,$tmp2 + or @X[0],$tmp1 + dsrl $tmp1,@X[0],32 + dsll @X[0],32 + or @X[0],$tmp1 +___ +$code.=<<___; + $ADDU $T1,$X[0],$h # $i + $SRL $h,$e,@Sigma1[0] + xor $tmp2,$f,$g + $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]` + and $tmp2,$e + $SRL $tmp0,$e,@Sigma1[1] + xor $h,$tmp1 + $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]` + xor $h,$tmp0 + $SRL $tmp0,$e,@Sigma1[2] + xor $h,$tmp1 + $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]` + xor $h,$tmp0 + xor $tmp2,$g # Ch(e,f,g) + xor $tmp0,$tmp1,$h # Sigma1(e) + + $SRL $h,$a,@Sigma0[0] + $ADDU $T1,$tmp2 + $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i] + $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]` + $ADDU $T1,$tmp0 + $SRL $tmp0,$a,@Sigma0[1] + xor $h,$tmp1 + $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]` + xor $h,$tmp0 + $SRL $tmp0,$a,@Sigma0[2] + xor $h,$tmp1 + $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]` + xor $h,$tmp0 + $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer + xor $h,$tmp1 # Sigma0(a) + + or $tmp0,$a,$b + and $tmp1,$a,$b + and $tmp0,$c + or $tmp1,$tmp0 # Maj(a,b,c) + $ADDU $T1,$tmp2 # +=K[$i] + $ADDU $h,$tmp1 + + $ADDU $d,$T1 + $ADDU $h,$T1 +___ +$code.=<<___ if ($i>=13); + $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer +___ +} + +sub BODY_16_XX { +my $i=@_[0]; +my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]); + +$code.=<<___; + $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i) + $ADDU @X[0],@X[9] # +=X[i+9] + $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]` + $SRL $tmp0,@X[1],@sigma0[1] + xor $tmp2,$tmp1 + $SLL $tmp1,`@sigma0[2]-@sigma0[1]` + xor $tmp2,$tmp0 + $SRL $tmp0,@X[1],@sigma0[2] + xor $tmp2,$tmp1 + + $SRL $tmp3,@X[14],@sigma1[0] + xor $tmp2,$tmp0 # sigma0(X[i+1]) + $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]` + $ADDU @X[0],$tmp2 + $SRL $tmp0,@X[14],@sigma1[1] + xor $tmp3,$tmp1 + $SLL $tmp1,`@sigma1[2]-@sigma1[1]` + xor $tmp3,$tmp0 + $SRL $tmp0,@X[14],@sigma1[2] + xor $tmp3,$tmp1 + + xor $tmp3,$tmp0 # sigma1(X[i+14]) + $ADDU @X[0],$tmp3 +___ + &BODY_00_15(@_); +} + +$FRAMESIZE=16*$SZ+16*$SZREG; +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; + +$code.=<<___; +#ifdef OPENSSL_FIPSCANISTER +# include <openssl/fipssyms.h> +#endif + +.text +.set noat +#if !defined(__vxworks) || defined(__pic__) +.option pic2 +#endif + +.align 5 +.globl sha${label}_block_data_order +.ent sha${label}_block_data_order +sha${label}_block_data_order: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)` +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Ktbl + .cpsetup $pf,$zero,sha${label}_block_data_order +___ +$code.=<<___; + .set reorder + la $Ktbl,K${label} # PIC-ified 'load address' + + $LD $A,0*$SZ($ctx) # load context + $LD $B,1*$SZ($ctx) + $LD $C,2*$SZ($ctx) + $LD $D,3*$SZ($ctx) + $LD $E,4*$SZ($ctx) + $LD $F,5*$SZ($ctx) + $LD $G,6*$SZ($ctx) + $LD $H,7*$SZ($ctx) + + $PTR_ADD @X[15],$inp # pointer to the end of input + $REG_S @X[15],16*$SZ($sp) + b .Loop + +.align 5 +.Loop: + ${LD}l @X[0],$MSB($inp) + ${LD}r @X[0],$LSB($inp) +___ +for ($i=0;$i<16;$i++) +{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } +$code.=<<___; + b .L16_xx +.align 4 +.L16_xx: +___ +for (;$i<32;$i++) +{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } +$code.=<<___; + and @X[6],0xfff + li @X[7],$lastK + .set noreorder + bne @X[6],@X[7],.L16_xx + $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16 + + $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input + $LD @X[0],0*$SZ($ctx) + $LD @X[1],1*$SZ($ctx) + $LD @X[2],2*$SZ($ctx) + $PTR_ADD $inp,16*$SZ + $LD @X[3],3*$SZ($ctx) + $ADDU $A,@X[0] + $LD @X[4],4*$SZ($ctx) + $ADDU $B,@X[1] + $LD @X[5],5*$SZ($ctx) + $ADDU $C,@X[2] + $LD @X[6],6*$SZ($ctx) + $ADDU $D,@X[3] + $LD @X[7],7*$SZ($ctx) + $ADDU $E,@X[4] + $ST $A,0*$SZ($ctx) + $ADDU $F,@X[5] + $ST $B,1*$SZ($ctx) + $ADDU $G,@X[6] + $ST $C,2*$SZ($ctx) + $ADDU $H,@X[7] + $ST $D,3*$SZ($ctx) + $ST $E,4*$SZ($ctx) + $ST $F,5*$SZ($ctx) + $ST $G,6*$SZ($ctx) + $ST $H,7*$SZ($ctx) + + bnel $inp,@X[15],.Loop + $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl + + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end sha${label}_block_data_order + +.rdata +.align 5 +K${label}: +___ +if ($SZ==4) { +$code.=<<___; + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +___ +} else { +$code.=<<___; + .dword 0x428a2f98d728ae22, 0x7137449123ef65cd + .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .dword 0x3956c25bf348b538, 0x59f111f1b605d019 + .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .dword 0xd807aa98a3030242, 0x12835b0145706fbe + .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 + .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 + .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .dword 0x06ca6351e003826f, 0x142929670a0e6e70 + .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .dword 0x81c2c92e47edaee6, 0x92722c851482353b + .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .dword 0xd192e819d6ef5218, 0xd69906245565a910 + .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 + .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .dword 0x90befffa23631e28, 0xa4506cebde82bde9 + .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .dword 0xca273eceea26619c, 0xd186b8c721c0c207 + .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .dword 0x113f9804bef90dae, 0x1b710b35131c471b + .dword 0x28db77f523047d84, 0x32caab7b40c72493 + .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +___ +} +$code.=<<___; +.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>" +.align 5 + +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; |