| author | Brian Carlstrom <bdc@google.com> | 2012-03-15 15:27:15 -0700 |
|---|---|---|
| committer | Android Git Automerger <android-git-automerger@android.com> | 2012-03-15 15:27:15 -0700 |
| commit | 7f1d63479ce92a2a4a0874b007e49f8acb13a0d9 (patch) | |
| tree | 6ae4607f75c2c531a170270264b6c7d88306152a /patches | |
| parent | 21c841450af61d0a9119cdc863e93d019127bfe1 (diff) | |
| parent | db166823303559663b1c209e14b326160519c51c (diff) | |
am db166823: Merge "From 67b1ae72527c9e173ace98e805e8b9c090455873 Mon Sep 17 00:00:00 2001 Subject: [MIPS] MIPS assembler pack update"
* commit 'db166823303559663b1c209e14b326160519c51c':
From 67b1ae72527c9e173ace98e805e8b9c090455873 Mon Sep 17 00:00:00 2001 Subject: [MIPS] MIPS assembler pack update
Diffstat (limited to 'patches')
| -rw-r--r-- | patches/README | 4 |
| -rw-r--r-- | patches/crypto_Android.mk | 50 |
| -rw-r--r-- | patches/mips_asm.patch | 5461 |
| -rw-r--r-- | patches/ssl_Android.mk | 4 |
4 files changed, 5507 insertions, 12 deletions
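The reviewable surface of this change is small: patches/README gains a one-line description of the new patch, and crypto_Android.mk replaces the old arm/non-arm source split with explicit arm/mips/x86 branches, gating the MIPS assembly on little-endian targets. One thing worth noting in review: the new guard is spelled `ifneq (($TARGET_HAS_BIGENDIAN),true)`, which as written compares a literal string rather than expanding the variable; `$(TARGET_HAS_BIGENDIAN)` looks like what was intended. The bulk of the 5507 added lines is mips_asm.patch, which carries Andy Polyakov's Perl generators (aes-mips.pl, mips-mont.pl, mips.pl) for the AES, Montgomery-multiplication and bignum routines. Each generator takes an ABI "flavour" argument (o32, n32, 64, nubi32, nubi64) and picks 32- or 64-bit mnemonics accordingly. A minimal standalone sketch of that dispatch follows; it is illustrative only (the script name and printf output are made up), but the mnemonic table mirrors the patch:

```perl
#!/usr/bin/env perl
# Illustrative sketch only -- not part of the patch. Mirrors the
# flavour dispatch at the top of aes-mips.pl and mips-mont.pl:
# 64-bit flavours (64, n32) get doubleword pointer arithmetic and
# 8-byte register save slots; all others fall back to 32-bit forms.
use strict;
use warnings;

my $flavour = shift || "o32";   # o32, n32, 64, nubi32, nubi64

my ($PTR_ADD, $PTR_SUB, $REG_S, $REG_L, $SZREG);
if ($flavour =~ /64|n32/i) {
    # "dadd"/"dsub" incidentally work even on n32
    ($PTR_ADD, $PTR_SUB, $REG_S, $REG_L, $SZREG) = ("dadd", "dsub", "sd", "ld", 8);
} else {
    ($PTR_ADD, $PTR_SUB, $REG_S, $REG_L, $SZREG) = ("add", "sub", "sw", "lw", 4);
}

printf "%s: ptr=%s/%s save/load=%s/%s slot=%dB\n",
       $flavour, $PTR_ADD, $PTR_SUB, $REG_S, $REG_L, $SZREG;
```

Running it as `perl flavour.pl 64` versus `perl flavour.pl o32` shows the two mnemonic sets the generators emit into their register save/restore prologues.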
diff --git a/patches/README b/patches/README
index f7d886d..f70135c 100644
--- a/patches/README
+++ b/patches/README
@@ -33,3 +33,7 @@ Transport Layer Security (TLS) Next Protocol Negotiation Extension
 sha1_armv4_large.patch
 
 This patch eliminates memory stores to addresses below SP.
+
+mips_asm.patch
+
+MIPS assembly routines (AES, BN, SHA1, SHA256)
diff --git a/patches/crypto_Android.mk b/patches/crypto_Android.mk
index bde68fc..8090c12 100644
--- a/patches/crypto_Android.mk
+++ b/patches/crypto_Android.mk
@@ -1,13 +1,26 @@
 LOCAL_PATH:= $(call my-dir)
 
 arm_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
+mips_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM
+
 arm_src_files := \
 	aes/asm/aes-armv4.s \
 	bn/asm/armv4-mont.s \
+	bn/bn_asm.c \
 	sha/asm/sha1-armv4-large.s \
 	sha/asm/sha256-armv4.s \
 	sha/asm/sha512-armv4.s
-non_arm_src_files := aes/aes_core.c
+
+mips_src_files := \
+	aes/asm/aes-mips.s \
+	bn/asm/bn-mips.s \
+	bn/asm/mips-mont.s \
+	sha/asm/sha1-mips.s \
+	sha/asm/sha256-mips.s
+
+other_arch_src_files := \
+	aes/aes_core.c \
+	bn/bn_asm.c
 
 local_src_files := \
 	cryptlib.c \
@@ -131,7 +144,6 @@ local_src_files := \
 	bio/bss_null.c \
 	bio/bss_sock.c \
 	bn/bn_add.c \
-	bn/bn_asm.c \
 	bn/bn_blind.c \
 	bn/bn_const.c \
 	bn/bn_ctx.c \
@@ -506,7 +518,7 @@ local_c_flags := -DNO_WINDOWS_BRAINDEATH
 
 include $(CLEAR_VARS)
 include $(LOCAL_PATH)/../android-config.mk
-ifneq ($(TARGET_ARCH),x86)
+ifeq ($(TARGET_ARCH),arm)
 LOCAL_NDK_VERSION := 5
 LOCAL_SDK_VERSION := 9
 endif
@@ -517,8 +529,17 @@ LOCAL_C_INCLUDES += $(local_c_includes)
 ifeq ($(TARGET_ARCH),arm)
 	LOCAL_SRC_FILES += $(arm_src_files)
 	LOCAL_CFLAGS += $(arm_cflags)
-else
-	LOCAL_SRC_FILES += $(non_arm_src_files)
+endif
+ifeq ($(TARGET_ARCH),mips)
+	ifneq (($TARGET_HAS_BIGENDIAN),true)
+		LOCAL_SRC_FILES += $(mips_src_files)
+		LOCAL_CFLAGS += $(mips_cflags)
+	else
+		LOCAL_SRC_FILES += $(other_arch_src_files)
+	endif
+endif
+ifeq ($(TARGET_ARCH),x86)
+	LOCAL_SRC_FILES += $(other_arch_src_files)
 endif
 LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE:= libcrypto_static
@@ -529,7 +550,7 @@ include $(BUILD_STATIC_LIBRARY)
 
 include $(CLEAR_VARS)
 include $(LOCAL_PATH)/../android-config.mk
-ifneq ($(TARGET_ARCH),x86)
+ifeq ($(TARGET_ARCH),arm)
 LOCAL_NDK_VERSION := 5
 LOCAL_SDK_VERSION := 9
 # Use the NDK prebuilt libz and libdl.
@@ -544,8 +565,17 @@ LOCAL_C_INCLUDES += $(local_c_includes)
 ifeq ($(TARGET_ARCH),arm)
 	LOCAL_SRC_FILES += $(arm_src_files)
 	LOCAL_CFLAGS += $(arm_cflags)
-else
-	LOCAL_SRC_FILES += $(non_arm_src_files)
+endif
+ifeq ($(TARGET_ARCH),mips)
+	ifneq (($TARGET_HAS_BIGENDIAN),true)
+		LOCAL_SRC_FILES += $(mips_src_files)
+		LOCAL_CFLAGS += $(mips_cflags)
+	else
+		LOCAL_SRC_FILES += $(other_arch_src_files)
+	endif
+endif
+ifeq ($(TARGET_ARCH),x86)
+	LOCAL_SRC_FILES += $(other_arch_src_files)
 endif
 LOCAL_MODULE_TAGS := optional
 LOCAL_MODULE:= libcrypto
@@ -558,7 +588,7 @@ include $(LOCAL_PATH)/../android-config.mk
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
 LOCAL_C_INCLUDES += $(local_c_includes)
-LOCAL_SRC_FILES += $(non_arm_src_files)
+LOCAL_SRC_FILES += $(other_arch_src_files)
 LOCAL_STATIC_LIBRARIES += libz
 LOCAL_LDLIBS += -ldl
 LOCAL_MODULE_TAGS := optional
@@ -573,7 +603,7 @@ include $(LOCAL_PATH)/../android-config.mk
 LOCAL_SRC_FILES += $(local_src_files)
 LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
 LOCAL_C_INCLUDES += $(local_c_includes)
-LOCAL_SRC_FILES += $(non_arm_src_files)
+LOCAL_SRC_FILES += $(other_arch_src_files)
 LOCAL_STATIC_LIBRARIES += libz
 LOCAL_LDLIBS += -ldl
 LOCAL_MODULE_TAGS := optional
diff --git a/patches/mips_asm.patch b/patches/mips_asm.patch
new file mode 100644
index 0000000..68a80f1
--- /dev/null
+++ b/patches/mips_asm.patch
@@ -0,0 +1,5461 @@
+diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl
+new file mode 100644
+index 0000000..2ce6def
+--- /dev/null
++++ b/crypto/aes/asm/aes-mips.pl
+@@ -0,0 +1,1611 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++
++# AES for MIPS
++
++# October 2010
++#
++# Code uses 1K[+256B] S-box and on single-issue core [such as R5000]
++# spends ~68 cycles per byte processed with 128-bit key. This is ~16%
++# faster than gcc-generated code, which is not very impressive. But
++# recall that compressed S-box requires extra processing, namely
++# additional rotations. Rotations are implemented with lwl/lwr pairs,
++# which is normally used for loading unaligned data. Another cool
++# thing about this module is its endian neutrality, which means that
++# it processes data without ever changing byte order...
++
++######################################################################
++# There is a number of MIPS ABI in use, O32 and N32/64 are most
++# widely used. Then there is a new contender: NUBI. It appears that if
++# one picks the latter, it's possible to arrange code in ABI neutral
++# manner. Therefore let's stick to NUBI register layout:
++#
++($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
++($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
++($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
++#
++# The return value is placed in $a0. Following coding rules facilitate
++# interoperability:
++#
++# - never ever touch $tp, "thread pointer", former $gp;
++# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
++#   old code];
++# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
++#
++# For reference here is register layout for N32/64 MIPS ABIs:
++#
++# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
++# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
++# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
++# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
++#
++$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
++
++if ($flavour =~ /64|n32/i) {
++	$PTR_ADD="dadd";	# incidentally works even on n32
++	$PTR_SUB="dsub";	# incidentally works even on n32
++	$REG_S="sd";
++	$REG_L="ld";
++	$PTR_SLL="dsll";	# incidentally works even on n32
++	$SZREG=8;
++} else {
++	$PTR_ADD="add";
++	$PTR_SUB="sub";
++	$REG_S="sw";
++	$REG_L="lw";
++	$PTR_SLL="sll";
++	$SZREG=4;
++}
++$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
++#
++# <appro@openssl.org>
++#
++######################################################################
++
++$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
++
++for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
++open STDOUT,">$output";
++
++if (!defined($big_endian))
++{ $big_endian=(unpack('L',pack('N',1))==1); }
++
++while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
++open STDOUT,">$output";
++
++my ($MSB,$LSB)=(0,3);	# automatically converted to little-endian
++
++$code.=<<___;
++.text
++#ifdef OPENSSL_FIPSCANISTER
++# include <openssl/fipssyms.h>
++#endif
++
++#if !defined(__vxworks) || defined(__pic__)
++.option	pic2
++#endif
++.set	noat
++___
++
++{{{
++my $FRAMESIZE=16*$SZREG;
++my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
++
++my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
++my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
++my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
++my ($key0,$cnt)=($gp,$fp);
++
++# instuction ordering is "stolen" from output from MIPSpro assembler
++# invoked with -mips3 -O3 arguments...
++$code.=<<___; ++.align 5 ++.ent _mips_AES_encrypt ++_mips_AES_encrypt: ++ .frame $sp,0,$ra ++ .set reorder ++ lw $t0,0($key) ++ lw $t1,4($key) ++ lw $t2,8($key) ++ lw $t3,12($key) ++ lw $cnt,240($key) ++ $PTR_ADD $key0,$key,16 ++ ++ xor $s0,$t0 ++ xor $s1,$t1 ++ xor $s2,$t2 ++ xor $s3,$t3 ++ ++ sub $cnt,1 ++ _xtr $i0,$s1,16-2 ++.Loop_enc: ++ _xtr $i1,$s2,16-2 ++ _xtr $i2,$s3,16-2 ++ _xtr $i3,$s0,16-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lwl $t0,3($i0) # Te1[s1>>16] ++ lwl $t1,3($i1) # Te1[s2>>16] ++ lwl $t2,3($i2) # Te1[s3>>16] ++ lwl $t3,3($i3) # Te1[s0>>16] ++ lwr $t0,2($i0) # Te1[s1>>16] ++ lwr $t1,2($i1) # Te1[s2>>16] ++ lwr $t2,2($i2) # Te1[s3>>16] ++ lwr $t3,2($i3) # Te1[s0>>16] ++ ++ _xtr $i0,$s2,8-2 ++ _xtr $i1,$s3,8-2 ++ _xtr $i2,$s0,8-2 ++ _xtr $i3,$s1,8-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lwl $t4,2($i0) # Te2[s2>>8] ++ lwl $t5,2($i1) # Te2[s3>>8] ++ lwl $t6,2($i2) # Te2[s0>>8] ++ lwl $t7,2($i3) # Te2[s1>>8] ++ lwr $t4,1($i0) # Te2[s2>>8] ++ lwr $t5,1($i1) # Te2[s3>>8] ++ lwr $t6,1($i2) # Te2[s0>>8] ++ lwr $t7,1($i3) # Te2[s1>>8] ++ ++ _xtr $i0,$s3,0-2 ++ _xtr $i1,$s0,0-2 ++ _xtr $i2,$s1,0-2 ++ _xtr $i3,$s2,0-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lwl $t8,1($i0) # Te3[s3] ++ lwl $t9,1($i1) # Te3[s0] ++ lwl $t10,1($i2) # Te3[s1] ++ lwl $t11,1($i3) # Te3[s2] ++ lwr $t8,0($i0) # Te3[s3] ++ lwr $t9,0($i1) # Te3[s0] ++ lwr $t10,0($i2) # Te3[s1] ++ lwr $t11,0($i3) # Te3[s2] ++ ++ _xtr $i0,$s0,24-2 ++ _xtr $i1,$s1,24-2 ++ _xtr $i2,$s2,24-2 ++ _xtr $i3,$s3,24-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ xor $t0,$t4 ++ xor $t1,$t5 ++ xor $t2,$t6 ++ xor $t3,$t7 ++ lw $t4,0($i0) # Te0[s0>>24] ++ lw $t5,0($i1) # Te0[s1>>24] ++ lw $t6,0($i2) # Te0[s2>>24] ++ lw $t7,0($i3) # Te0[s3>>24] ++ ++ lw $s0,0($key0) ++ lw $s1,4($key0) ++ lw $s2,8($key0) ++ lw $s3,12($key0) ++ ++ xor $t0,$t8 ++ xor $t1,$t9 ++ xor $t2,$t10 ++ xor $t3,$t11 ++ ++ xor $t0,$t4 ++ xor $t1,$t5 ++ xor $t2,$t6 ++ xor $t3,$t7 ++ ++ sub $cnt,1 ++ $PTR_ADD $key0,16 ++ xor $s0,$t0 ++ xor $s1,$t1 ++ xor $s2,$t2 ++ xor $s3,$t3 ++ .set noreorder ++ bnez $cnt,.Loop_enc ++ _xtr $i0,$s1,16-2 ++ ++ .set reorder ++ _xtr $i1,$s2,16-2 ++ _xtr $i2,$s3,16-2 ++ _xtr $i3,$s0,16-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $t0,2($i0) # Te4[s1>>16] ++ lbu $t1,2($i1) # Te4[s2>>16] ++ lbu $t2,2($i2) # Te4[s3>>16] ++ lbu $t3,2($i3) # Te4[s0>>16] ++ ++ _xtr $i0,$s2,8-2 ++ _xtr $i1,$s3,8-2 ++ _xtr $i2,$s0,8-2 ++ _xtr $i3,$s1,8-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $t4,2($i0) # Te4[s2>>8] ++ lbu $t5,2($i1) # Te4[s3>>8] ++ lbu $t6,2($i2) # Te4[s0>>8] ++ lbu $t7,2($i3) # Te4[s1>>8] ++ ++ _xtr $i0,$s0,24-2 ++ _xtr $i1,$s1,24-2 ++ _xtr $i2,$s2,24-2 ++ _xtr $i3,$s3,24-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $t8,2($i0) # Te4[s0>>24] ++ lbu 
$t9,2($i1) # Te4[s1>>24] ++ lbu $t10,2($i2) # Te4[s2>>24] ++ lbu $t11,2($i3) # Te4[s3>>24] ++ ++ _xtr $i0,$s3,0-2 ++ _xtr $i1,$s0,0-2 ++ _xtr $i2,$s1,0-2 ++ _xtr $i3,$s2,0-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ ++ _ins $t0,16 ++ _ins $t1,16 ++ _ins $t2,16 ++ _ins $t3,16 ++ ++ _ins $t4,8 ++ _ins $t5,8 ++ _ins $t6,8 ++ _ins $t7,8 ++ ++ xor $t0,$t4 ++ xor $t1,$t5 ++ xor $t2,$t6 ++ xor $t3,$t7 ++ ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $t4,2($i0) # Te4[s3] ++ lbu $t5,2($i1) # Te4[s0] ++ lbu $t6,2($i2) # Te4[s1] ++ lbu $t7,2($i3) # Te4[s2] ++ ++ _ins $t8,24 ++ _ins $t9,24 ++ _ins $t10,24 ++ _ins $t11,24 ++ ++ lw $s0,0($key0) ++ lw $s1,4($key0) ++ lw $s2,8($key0) ++ lw $s3,12($key0) ++ ++ xor $t0,$t8 ++ xor $t1,$t9 ++ xor $t2,$t10 ++ xor $t3,$t11 ++ ++ _ins $t4,0 ++ _ins $t5,0 ++ _ins $t6,0 ++ _ins $t7,0 ++ ++ xor $t0,$t4 ++ xor $t1,$t5 ++ xor $t2,$t6 ++ xor $t3,$t7 ++ ++ xor $s0,$t0 ++ xor $s1,$t1 ++ xor $s2,$t2 ++ xor $s3,$t3 ++ ++ jr $ra ++.end _mips_AES_encrypt ++ ++.align 5 ++.globl AES_encrypt ++.ent AES_encrypt ++AES_encrypt: ++ .frame $sp,$FRAMESIZE,$ra ++ .mask $SAVED_REGS_MASK,-$SZREG ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification ++ .cpload $pf ++___ ++$code.=<<___; ++ $PTR_SUB $sp,$FRAMESIZE ++ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) ++ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) ++ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) ++ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) ++ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) ++ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) ++ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) ++ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) ++ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) ++ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue ++ $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) ++ $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) ++ $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) ++ $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) ++ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ++ .cplocal $Tbl ++ .cpsetup $pf,$zero,AES_encrypt ++___ ++$code.=<<___; ++ .set reorder ++ la $Tbl,AES_Te # PIC-ified 'load address' ++ ++ lwl $s0,0+$MSB($inp) ++ lwl $s1,4+$MSB($inp) ++ lwl $s2,8+$MSB($inp) ++ lwl $s3,12+$MSB($inp) ++ lwr $s0,0+$LSB($inp) ++ lwr $s1,4+$LSB($inp) ++ lwr $s2,8+$LSB($inp) ++ lwr $s3,12+$LSB($inp) ++ ++ bal _mips_AES_encrypt ++ ++ swr $s0,0+$LSB($out) ++ swr $s1,4+$LSB($out) ++ swr $s2,8+$LSB($out) ++ swr $s3,12+$LSB($out) ++ swl $s0,0+$MSB($out) ++ swl $s1,4+$MSB($out) ++ swl $s2,8+$MSB($out) ++ swl $s3,12+$MSB($out) ++ ++ .set noreorder ++ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) ++ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) ++ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) ++ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) ++ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) ++ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) ++ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) ++ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) ++ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) ++ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) ++ $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) ++ $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) ++ $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) ++ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) ++___ ++$code.=<<___; ++ jr $ra ++ $PTR_ADD $sp,$FRAMESIZE ++.end AES_encrypt ++___ ++ ++$code.=<<___; ++.align 5 ++.ent _mips_AES_decrypt ++_mips_AES_decrypt: ++ .frame $sp,0,$ra ++ .set reorder ++ lw $t0,0($key) ++ lw $t1,4($key) ++ lw $t2,8($key) 
++ lw $t3,12($key) ++ lw $cnt,240($key) ++ $PTR_ADD $key0,$key,16 ++ ++ xor $s0,$t0 ++ xor $s1,$t1 ++ xor $s2,$t2 ++ xor $s3,$t3 ++ ++ sub $cnt,1 ++ _xtr $i0,$s3,16-2 ++.Loop_dec: ++ _xtr $i1,$s0,16-2 ++ _xtr $i2,$s1,16-2 ++ _xtr $i3,$s2,16-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lwl $t0,3($i0) # Td1[s3>>16] ++ lwl $t1,3($i1) # Td1[s0>>16] ++ lwl $t2,3($i2) # Td1[s1>>16] ++ lwl $t3,3($i3) # Td1[s2>>16] ++ lwr $t0,2($i0) # Td1[s3>>16] ++ lwr $t1,2($i1) # Td1[s0>>16] ++ lwr $t2,2($i2) # Td1[s1>>16] ++ lwr $t3,2($i3) # Td1[s2>>16] ++ ++ _xtr $i0,$s2,8-2 ++ _xtr $i1,$s3,8-2 ++ _xtr $i2,$s0,8-2 ++ _xtr $i3,$s1,8-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lwl $t4,2($i0) # Td2[s2>>8] ++ lwl $t5,2($i1) # Td2[s3>>8] ++ lwl $t6,2($i2) # Td2[s0>>8] ++ lwl $t7,2($i3) # Td2[s1>>8] ++ lwr $t4,1($i0) # Td2[s2>>8] ++ lwr $t5,1($i1) # Td2[s3>>8] ++ lwr $t6,1($i2) # Td2[s0>>8] ++ lwr $t7,1($i3) # Td2[s1>>8] ++ ++ _xtr $i0,$s1,0-2 ++ _xtr $i1,$s2,0-2 ++ _xtr $i2,$s3,0-2 ++ _xtr $i3,$s0,0-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lwl $t8,1($i0) # Td3[s1] ++ lwl $t9,1($i1) # Td3[s2] ++ lwl $t10,1($i2) # Td3[s3] ++ lwl $t11,1($i3) # Td3[s0] ++ lwr $t8,0($i0) # Td3[s1] ++ lwr $t9,0($i1) # Td3[s2] ++ lwr $t10,0($i2) # Td3[s3] ++ lwr $t11,0($i3) # Td3[s0] ++ ++ _xtr $i0,$s0,24-2 ++ _xtr $i1,$s1,24-2 ++ _xtr $i2,$s2,24-2 ++ _xtr $i3,$s3,24-2 ++ and $i0,0x3fc ++ and $i1,0x3fc ++ and $i2,0x3fc ++ and $i3,0x3fc ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ ++ xor $t0,$t4 ++ xor $t1,$t5 ++ xor $t2,$t6 ++ xor $t3,$t7 ++ ++ ++ lw $t4,0($i0) # Td0[s0>>24] ++ lw $t5,0($i1) # Td0[s1>>24] ++ lw $t6,0($i2) # Td0[s2>>24] ++ lw $t7,0($i3) # Td0[s3>>24] ++ ++ lw $s0,0($key0) ++ lw $s1,4($key0) ++ lw $s2,8($key0) ++ lw $s3,12($key0) ++ ++ xor $t0,$t8 ++ xor $t1,$t9 ++ xor $t2,$t10 ++ xor $t3,$t11 ++ ++ xor $t0,$t4 ++ xor $t1,$t5 ++ xor $t2,$t6 ++ xor $t3,$t7 ++ ++ sub $cnt,1 ++ $PTR_ADD $key0,16 ++ xor $s0,$t0 ++ xor $s1,$t1 ++ xor $s2,$t2 ++ xor $s3,$t3 ++ .set noreorder ++ bnez $cnt,.Loop_dec ++ _xtr $i0,$s3,16-2 ++ ++ .set reorder ++ lw $t4,1024($Tbl) # prefetch Td4 ++ lw $t5,1024+32($Tbl) ++ lw $t6,1024+64($Tbl) ++ lw $t7,1024+96($Tbl) ++ lw $t8,1024+128($Tbl) ++ lw $t9,1024+160($Tbl) ++ lw $t10,1024+192($Tbl) ++ lw $t11,1024+224($Tbl) ++ ++ _xtr $i0,$s3,16 ++ _xtr $i1,$s0,16 ++ _xtr $i2,$s1,16 ++ _xtr $i3,$s2,16 ++ and $i0,0xff ++ and $i1,0xff ++ and $i2,0xff ++ and $i3,0xff ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $t0,1024($i0) # Td4[s3>>16] ++ lbu $t1,1024($i1) # Td4[s0>>16] ++ lbu $t2,1024($i2) # Td4[s1>>16] ++ lbu $t3,1024($i3) # Td4[s2>>16] ++ ++ _xtr $i0,$s2,8 ++ _xtr $i1,$s3,8 ++ _xtr $i2,$s0,8 ++ _xtr $i3,$s1,8 ++ and $i0,0xff ++ and $i1,0xff ++ and $i2,0xff ++ and $i3,0xff ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $t4,1024($i0) # Td4[s2>>8] ++ lbu $t5,1024($i1) # Td4[s3>>8] ++ lbu $t6,1024($i2) # Td4[s0>>8] ++ lbu $t7,1024($i3) # Td4[s1>>8] ++ ++ _xtr $i0,$s0,24 ++ _xtr $i1,$s1,24 ++ _xtr $i2,$s2,24 ++ _xtr $i3,$s3,24 ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $t8,1024($i0) # Td4[s0>>24] ++ 
lbu $t9,1024($i1) # Td4[s1>>24] ++ lbu $t10,1024($i2) # Td4[s2>>24] ++ lbu $t11,1024($i3) # Td4[s3>>24] ++ ++ _xtr $i0,$s1,0 ++ _xtr $i1,$s2,0 ++ _xtr $i2,$s3,0 ++ _xtr $i3,$s0,0 ++ ++ _ins $t0,16 ++ _ins $t1,16 ++ _ins $t2,16 ++ _ins $t3,16 ++ ++ _ins $t4,8 ++ _ins $t5,8 ++ _ins $t6,8 ++ _ins $t7,8 ++ ++ xor $t0,$t4 ++ xor $t1,$t5 ++ xor $t2,$t6 ++ xor $t3,$t7 ++ ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $t4,1024($i0) # Td4[s1] ++ lbu $t5,1024($i1) # Td4[s2] ++ lbu $t6,1024($i2) # Td4[s3] ++ lbu $t7,1024($i3) # Td4[s0] ++ ++ _ins $t8,24 ++ _ins $t9,24 ++ _ins $t10,24 ++ _ins $t11,24 ++ ++ lw $s0,0($key0) ++ lw $s1,4($key0) ++ lw $s2,8($key0) ++ lw $s3,12($key0) ++ ++ _ins $t4,0 ++ _ins $t5,0 ++ _ins $t6,0 ++ _ins $t7,0 ++ ++ ++ xor $t0,$t8 ++ xor $t1,$t9 ++ xor $t2,$t10 ++ xor $t3,$t11 ++ ++ xor $t0,$t4 ++ xor $t1,$t5 ++ xor $t2,$t6 ++ xor $t3,$t7 ++ ++ xor $s0,$t0 ++ xor $s1,$t1 ++ xor $s2,$t2 ++ xor $s3,$t3 ++ ++ jr $ra ++.end _mips_AES_decrypt ++ ++.align 5 ++.globl AES_decrypt ++.ent AES_decrypt ++AES_decrypt: ++ .frame $sp,$FRAMESIZE,$ra ++ .mask $SAVED_REGS_MASK,-$SZREG ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification ++ .cpload $pf ++___ ++$code.=<<___; ++ $PTR_SUB $sp,$FRAMESIZE ++ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) ++ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) ++ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) ++ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) ++ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) ++ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) ++ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) ++ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) ++ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) ++ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue ++ $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) ++ $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) ++ $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) ++ $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) ++ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ++ .cplocal $Tbl ++ .cpsetup $pf,$zero,AES_decrypt ++___ ++$code.=<<___; ++ .set reorder ++ la $Tbl,AES_Td # PIC-ified 'load address' ++ ++ lwl $s0,0+$MSB($inp) ++ lwl $s1,4+$MSB($inp) ++ lwl $s2,8+$MSB($inp) ++ lwl $s3,12+$MSB($inp) ++ lwr $s0,0+$LSB($inp) ++ lwr $s1,4+$LSB($inp) ++ lwr $s2,8+$LSB($inp) ++ lwr $s3,12+$LSB($inp) ++ ++ bal _mips_AES_decrypt ++ ++ swr $s0,0+$LSB($out) ++ swr $s1,4+$LSB($out) ++ swr $s2,8+$LSB($out) ++ swr $s3,12+$LSB($out) ++ swl $s0,0+$MSB($out) ++ swl $s1,4+$MSB($out) ++ swl $s2,8+$MSB($out) ++ swl $s3,12+$MSB($out) ++ ++ .set noreorder ++ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) ++ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) ++ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) ++ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) ++ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) ++ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) ++ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) ++ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) ++ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) ++ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) ++ $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) ++ $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) ++ $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) ++ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) ++___ ++$code.=<<___; ++ jr $ra ++ $PTR_ADD $sp,$FRAMESIZE ++.end AES_decrypt ++___ ++}}} ++ ++{{{ ++my $FRAMESIZE=8*$SZREG; ++my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
0xc000f008 : 0xc0000000; ++ ++my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3); ++my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); ++my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); ++my ($rcon,$cnt)=($gp,$fp); ++ ++$code.=<<___; ++.align 5 ++.ent _mips_AES_set_encrypt_key ++_mips_AES_set_encrypt_key: ++ .frame $sp,0,$ra ++ .set noreorder ++ beqz $inp,.Lekey_done ++ li $t0,-1 ++ beqz $key,.Lekey_done ++ $PTR_ADD $rcon,$Tbl,1024+256 ++ ++ .set reorder ++ lwl $rk0,0+$MSB($inp) # load 128 bits ++ lwl $rk1,4+$MSB($inp) ++ lwl $rk2,8+$MSB($inp) ++ lwl $rk3,12+$MSB($inp) ++ li $at,128 ++ lwr $rk0,0+$LSB($inp) ++ lwr $rk1,4+$LSB($inp) ++ lwr $rk2,8+$LSB($inp) ++ lwr $rk3,12+$LSB($inp) ++ .set noreorder ++ beq $bits,$at,.L128bits ++ li $cnt,10 ++ ++ .set reorder ++ lwl $rk4,16+$MSB($inp) # load 192 bits ++ lwl $rk5,20+$MSB($inp) ++ li $at,192 ++ lwr $rk4,16+$LSB($inp) ++ lwr $rk5,20+$LSB($inp) ++ .set noreorder ++ beq $bits,$at,.L192bits ++ li $cnt,8 ++ ++ .set reorder ++ lwl $rk6,24+$MSB($inp) # load 256 bits ++ lwl $rk7,28+$MSB($inp) ++ li $at,256 ++ lwr $rk6,24+$LSB($inp) ++ lwr $rk7,28+$LSB($inp) ++ .set noreorder ++ beq $bits,$at,.L256bits ++ li $cnt,7 ++ ++ b .Lekey_done ++ li $t0,-2 ++ ++.align 4 ++.L128bits: ++ .set reorder ++ srl $i0,$rk3,16 ++ srl $i1,$rk3,8 ++ and $i0,0xff ++ and $i1,0xff ++ and $i2,$rk3,0xff ++ srl $i3,$rk3,24 ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $i0,1024($i0) ++ lbu $i1,1024($i1) ++ lbu $i2,1024($i2) ++ lbu $i3,1024($i3) ++ ++ sw $rk0,0($key) ++ sw $rk1,4($key) ++ sw $rk2,8($key) ++ sw $rk3,12($key) ++ sub $cnt,1 ++ $PTR_ADD $key,16 ++ ++ _bias $i0,24 ++ _bias $i1,16 ++ _bias $i2,8 ++ _bias $i3,0 ++ ++ xor $rk0,$i0 ++ lw $i0,0($rcon) ++ xor $rk0,$i1 ++ xor $rk0,$i2 ++ xor $rk0,$i3 ++ xor $rk0,$i0 ++ ++ xor $rk1,$rk0 ++ xor $rk2,$rk1 ++ xor $rk3,$rk2 ++ ++ .set noreorder ++ bnez $cnt,.L128bits ++ $PTR_ADD $rcon,4 ++ ++ sw $rk0,0($key) ++ sw $rk1,4($key) ++ sw $rk2,8($key) ++ li $cnt,10 ++ sw $rk3,12($key) ++ li $t0,0 ++ sw $cnt,80($key) ++ b .Lekey_done ++ $PTR_SUB $key,10*16 ++ ++.align 4 ++.L192bits: ++ .set reorder ++ srl $i0,$rk5,16 ++ srl $i1,$rk5,8 ++ and $i0,0xff ++ and $i1,0xff ++ and $i2,$rk5,0xff ++ srl $i3,$rk5,24 ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $i0,1024($i0) ++ lbu $i1,1024($i1) ++ lbu $i2,1024($i2) ++ lbu $i3,1024($i3) ++ ++ sw $rk0,0($key) ++ sw $rk1,4($key) ++ sw $rk2,8($key) ++ sw $rk3,12($key) ++ sw $rk4,16($key) ++ sw $rk5,20($key) ++ sub $cnt,1 ++ $PTR_ADD $key,24 ++ ++ _bias $i0,24 ++ _bias $i1,16 ++ _bias $i2,8 ++ _bias $i3,0 ++ ++ xor $rk0,$i0 ++ lw $i0,0($rcon) ++ xor $rk0,$i1 ++ xor $rk0,$i2 ++ xor $rk0,$i3 ++ xor $rk0,$i0 ++ ++ xor $rk1,$rk0 ++ xor $rk2,$rk1 ++ xor $rk3,$rk2 ++ xor $rk4,$rk3 ++ xor $rk5,$rk4 ++ ++ .set noreorder ++ bnez $cnt,.L192bits ++ $PTR_ADD $rcon,4 ++ ++ sw $rk0,0($key) ++ sw $rk1,4($key) ++ sw $rk2,8($key) ++ li $cnt,12 ++ sw $rk3,12($key) ++ li $t0,0 ++ sw $cnt,48($key) ++ b .Lekey_done ++ $PTR_SUB $key,12*16 ++ ++.align 4 ++.L256bits: ++ .set reorder ++ srl $i0,$rk7,16 ++ srl $i1,$rk7,8 ++ and $i0,0xff ++ and $i1,0xff ++ and $i2,$rk7,0xff ++ srl $i3,$rk7,24 ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $i0,1024($i0) ++ lbu $i1,1024($i1) ++ lbu $i2,1024($i2) ++ lbu $i3,1024($i3) ++ ++ sw $rk0,0($key) ++ sw $rk1,4($key) ++ sw $rk2,8($key) ++ sw $rk3,12($key) ++ sw $rk4,16($key) ++ sw $rk5,20($key) ++ sw $rk6,24($key) ++ sw $rk7,28($key) ++ sub 
$cnt,1 ++ ++ _bias $i0,24 ++ _bias $i1,16 ++ _bias $i2,8 ++ _bias $i3,0 ++ ++ xor $rk0,$i0 ++ lw $i0,0($rcon) ++ xor $rk0,$i1 ++ xor $rk0,$i2 ++ xor $rk0,$i3 ++ xor $rk0,$i0 ++ ++ xor $rk1,$rk0 ++ xor $rk2,$rk1 ++ xor $rk3,$rk2 ++ beqz $cnt,.L256bits_done ++ ++ srl $i0,$rk3,24 ++ srl $i1,$rk3,16 ++ srl $i2,$rk3,8 ++ and $i3,$rk3,0xff ++ and $i1,0xff ++ and $i2,0xff ++ $PTR_ADD $i0,$Tbl ++ $PTR_ADD $i1,$Tbl ++ $PTR_ADD $i2,$Tbl ++ $PTR_ADD $i3,$Tbl ++ lbu $i0,1024($i0) ++ lbu $i1,1024($i1) ++ lbu $i2,1024($i2) ++ lbu $i3,1024($i3) ++ sll $i0,24 ++ sll $i1,16 ++ sll $i2,8 ++ ++ xor $rk4,$i0 ++ xor $rk4,$i1 ++ xor $rk4,$i2 ++ xor $rk4,$i3 ++ ++ xor $rk5,$rk4 ++ xor $rk6,$rk5 ++ xor $rk7,$rk6 ++ ++ $PTR_ADD $key,32 ++ .set noreorder ++ b .L256bits ++ $PTR_ADD $rcon,4 ++ ++.L256bits_done: ++ sw $rk0,32($key) ++ sw $rk1,36($key) ++ sw $rk2,40($key) ++ li $cnt,14 ++ sw $rk3,44($key) ++ li $t0,0 ++ sw $cnt,48($key) ++ $PTR_SUB $key,12*16 ++ ++.Lekey_done: ++ jr $ra ++ nop ++.end _mips_AES_set_encrypt_key ++ ++.globl AES_set_encrypt_key ++.ent AES_set_encrypt_key ++AES_set_encrypt_key: ++ .frame $sp,$FRAMESIZE,$ra ++ .mask $SAVED_REGS_MASK,-$SZREG ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification ++ .cpload $pf ++___ ++$code.=<<___; ++ $PTR_SUB $sp,$FRAMESIZE ++ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) ++ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue ++ $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) ++ $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) ++ $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) ++ $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) ++ $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ++ .cplocal $Tbl ++ .cpsetup $pf,$zero,AES_set_encrypt_key ++___ ++$code.=<<___; ++ .set reorder ++ la $Tbl,AES_Te # PIC-ified 'load address' ++ ++ bal _mips_AES_set_encrypt_key ++ ++ .set noreorder ++ move $a0,$t0 ++ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) ++ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) ++ $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) ++ $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) ++ $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) ++ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) ++___ ++$code.=<<___; ++ jr $ra ++ $PTR_ADD $sp,$FRAMESIZE ++.end AES_set_encrypt_key ++___ ++ ++my ($head,$tail)=($inp,$bits); ++my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); ++my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2); ++$code.=<<___; ++.align 5 ++.globl AES_set_decrypt_key ++.ent AES_set_decrypt_key ++AES_set_decrypt_key: ++ .frame $sp,$FRAMESIZE,$ra ++ .mask $SAVED_REGS_MASK,-$SZREG ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification ++ .cpload $pf ++___ ++$code.=<<___; ++ $PTR_SUB $sp,$FRAMESIZE ++ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) ++ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue ++ $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) ++ $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) ++ $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) ++ $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) ++ $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ++ .cplocal $Tbl ++ .cpsetup $pf,$zero,AES_set_decrypt_key ++___ ++$code.=<<___; ++ .set reorder ++ la $Tbl,AES_Te # PIC-ified 'load address' ++ ++ bal _mips_AES_set_encrypt_key ++ ++ bltz $t0,.Ldkey_done ++ ++ sll $at,$cnt,4 ++ $PTR_ADD $head,$key,0 ++ $PTR_ADD $tail,$key,$at 
++.align 4 ++.Lswap: ++ lw $rk0,0($head) ++ lw $rk1,4($head) ++ lw $rk2,8($head) ++ lw $rk3,12($head) ++ lw $rk4,0($tail) ++ lw $rk5,4($tail) ++ lw $rk6,8($tail) ++ lw $rk7,12($tail) ++ sw $rk0,0($tail) ++ sw $rk1,4($tail) ++ sw $rk2,8($tail) ++ sw $rk3,12($tail) ++ $PTR_ADD $head,16 ++ $PTR_SUB $tail,16 ++ sw $rk4,-16($head) ++ sw $rk5,-12($head) ++ sw $rk6,-8($head) ++ sw $rk7,-4($head) ++ bne $head,$tail,.Lswap ++ ++ lw $tp1,16($key) # modulo-scheduled ++ lui $x80808080,0x8080 ++ sub $cnt,1 ++ or $x80808080,0x8080 ++ sll $cnt,2 ++ $PTR_ADD $key,16 ++ lui $x1b1b1b1b,0x1b1b ++ nor $x7f7f7f7f,$zero,$x80808080 ++ or $x1b1b1b1b,0x1b1b ++.align 4 ++.Lmix: ++ and $m,$tp1,$x80808080 ++ and $tp2,$tp1,$x7f7f7f7f ++ srl $tp4,$m,7 ++ addu $tp2,$tp2 # tp2<<1 ++ subu $m,$tp4 ++ and $m,$x1b1b1b1b ++ xor $tp2,$m ++ ++ and $m,$tp2,$x80808080 ++ and $tp4,$tp2,$x7f7f7f7f ++ srl $tp8,$m,7 ++ addu $tp4,$tp4 # tp4<<1 ++ subu $m,$tp8 ++ and $m,$x1b1b1b1b ++ xor $tp4,$m ++ ++ and $m,$tp4,$x80808080 ++ and $tp8,$tp4,$x7f7f7f7f ++ srl $tp9,$m,7 ++ addu $tp8,$tp8 # tp8<<1 ++ subu $m,$tp9 ++ and $m,$x1b1b1b1b ++ xor $tp8,$m ++ ++ xor $tp9,$tp8,$tp1 ++ xor $tpe,$tp8,$tp4 ++ xor $tpb,$tp9,$tp2 ++ xor $tpd,$tp9,$tp4 ++ ++ _ror $tp1,$tpd,16 ++ xor $tpe,$tp2 ++ _ror $tp2,$tpd,-16 ++ xor $tpe,$tp1 ++ _ror $tp1,$tp9,8 ++ xor $tpe,$tp2 ++ _ror $tp2,$tp9,-24 ++ xor $tpe,$tp1 ++ _ror $tp1,$tpb,24 ++ xor $tpe,$tp2 ++ _ror $tp2,$tpb,-8 ++ xor $tpe,$tp1 ++ lw $tp1,4($key) # modulo-scheduled ++ xor $tpe,$tp2 ++ sub $cnt,1 ++ sw $tpe,0($key) ++ $PTR_ADD $key,4 ++ bnez $cnt,.Lmix ++ ++ li $t0,0 ++.Ldkey_done: ++ .set noreorder ++ move $a0,$t0 ++ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) ++ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) ++ $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) ++ $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) ++ $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) ++ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) ++___ ++$code.=<<___; ++ jr $ra ++ $PTR_ADD $sp,$FRAMESIZE ++.end AES_set_decrypt_key ++___ ++}}} ++ ++###################################################################### ++# Tables are kept in endian-neutral manner ++$code.=<<___; ++.rdata ++.align 6 ++AES_Te: ++.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 ++.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d ++.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd ++.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 ++.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 ++.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d ++.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 ++.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a ++.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d ++.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 ++.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb ++.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b ++.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 ++.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea ++.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 ++.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b ++.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c ++.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a ++.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 ++.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f ++.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 ++.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 ++.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 ++.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f ++.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 ++.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e ++.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 
++.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 ++.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 ++.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d ++.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 ++.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f ++.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e ++.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e ++.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 ++.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb ++.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d ++.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce ++.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e ++.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 ++.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 ++.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c ++.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f ++.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed ++.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 ++.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b ++.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 ++.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a ++.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a ++.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 ++.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 ++.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 ++.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 ++.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 ++.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 ++.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 ++.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe ++.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a ++.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc ++.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 ++.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 ++.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 ++.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a ++.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d ++.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 ++.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f ++.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 ++.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 ++.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 ++.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 ++.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 ++.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 ++.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 ++.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f ++.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e ++.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 ++.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 ++.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c ++.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 ++.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 ++.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 ++.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e ++.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a ++.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 ++.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e ++.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 ++.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 ++.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b ++.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 ++.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 ++.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 ++.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 ++.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa ++.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 ++.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e ++.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 ++.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 ++.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 ++.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 ++.byte 
0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 ++.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c ++.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 ++.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc ++.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 ++.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 ++.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa ++.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 ++.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 ++.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f ++.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 ++.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 ++.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 ++.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 ++.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 ++.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 ++.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 ++.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 ++.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 ++.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff ++.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a ++.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 ++.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 ++.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 ++.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 ++.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 ++.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 ++.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc ++.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a ++ ++.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 ++.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 ++.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 ++.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 ++.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc ++.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 ++.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a ++.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 ++.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 ++.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 ++.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b ++.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf ++.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 ++.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 ++.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 ++.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 ++.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 ++.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 ++.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 ++.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb ++.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c ++.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 ++.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 ++.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 ++.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 ++.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a ++.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e ++.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e ++.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 ++.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf ++.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 ++.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 ++ ++.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon ++.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 ++.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 ++.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 ++.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 ++ ++.align 6 ++AES_Td: ++.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 ++.byte 0x1a,0x17,0xa4,0xc3, 
0x3a,0x27,0x5e,0x96 ++.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 ++.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 ++.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 ++.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 ++.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 ++.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f ++.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 ++.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 ++.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 ++.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 ++.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 ++.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda ++.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 ++.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 ++.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 ++.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd ++.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 ++.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 ++.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 ++.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 ++.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 ++.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 ++.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 ++.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 ++.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 ++.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a ++.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 ++.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 ++.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 ++.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c ++.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 ++.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 ++.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 ++.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a ++.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 ++.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 ++.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa ++.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 ++.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d ++.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 ++.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 ++.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff ++.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 ++.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 ++.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 ++.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb ++.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 ++.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 ++.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 ++.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e ++.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 ++.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 ++.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 ++.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a ++.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f ++.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e ++.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 ++.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 ++.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 ++.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d ++.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad ++.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 ++.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c ++.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd ++.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc ++.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 ++.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc ++.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 ++.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 ++.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 ++.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 ++.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d 
++.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 ++.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 ++.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 ++.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 ++.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a ++.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef ++.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 ++.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 ++.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 ++.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 ++.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d ++.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 ++.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 ++.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 ++.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c ++.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 ++.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 ++.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b ++.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 ++.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 ++.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e ++.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 ++.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce ++.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 ++.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 ++.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 ++.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 ++.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 ++.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 ++.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f ++.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d ++.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf ++.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b ++.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f ++.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d ++.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e ++.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 ++.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 ++.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a ++.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 ++.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 ++.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c ++.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f ++.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf ++.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b ++.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 ++.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e ++.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f ++.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c ++.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 ++.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde ++.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 ++.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 ++.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 ++ ++.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 ++.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb ++.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 ++.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb ++.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d ++.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e ++.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 ++.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 ++.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 ++.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 ++.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda ++.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 ++.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a ++.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 ++.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 ++.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b ++.byte 0x3a, 0x91, 
0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea ++.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 ++.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 ++.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e ++.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 ++.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b ++.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 ++.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 ++.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 ++.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f ++.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d ++.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef ++.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 ++.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 ++.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 ++.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d ++___ ++ ++foreach (split("\n",$code)) { ++ s/\`([^\`]*)\`/eval $1/ge; ++ ++ # made-up _instructions, _xtr, _ins, _ror and _bias, cope ++ # with byte order dependencies... ++ if (/^\s+_/) { ++ s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/; ++ ++ s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/ ++ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) ++ : eval("24-$3"))/e or ++ s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ ++ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) ++ : eval("24-$3"))/e or ++ s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/ ++ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) ++ : eval("$3*-1"))/e or ++ s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ ++ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) ++ : eval("($3-16)&31"))/e; ++ ++ s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/ ++ sprintf("sll\t$1,$2,$3")/e or ++ s/srl\s+(\$[0-9]+),(\$[0-9]+),0/ ++ sprintf("and\t$1,$2,0xff")/e or ++ s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/; ++ } ++ ++ # convert lwl/lwr and swr/swl to little-endian order ++ if (!$big_endian && /^\s+[sl]w[lr]\s+/) { ++ s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/ ++ sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or ++ s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/ ++ sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e; ++ } ++ ++ print $_,"\n"; ++} ++ ++close STDOUT; +diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl +new file mode 100644 +index 0000000..b944a12 +--- /dev/null ++++ b/crypto/bn/asm/mips-mont.pl +@@ -0,0 +1,426 @@ ++#!/usr/bin/env perl ++# ++# ==================================================================== ++# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL ++# project. The module is, however, dual licensed under OpenSSL and ++# CRYPTOGAMS licenses depending on where you obtain it. For further ++# details see http://www.openssl.org/~appro/cryptogams/. ++# ==================================================================== ++ ++# This module doesn't present direct interest for OpenSSL, because it ++# doesn't provide better performance for longer keys, at least not on ++# in-order-execution cores. While 512-bit RSA sign operations can be ++# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and ++# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from ++# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA ++# verify:-( All comparisons are against bn_mul_mont-free assembler. ++# The module might be of interest to embedded system developers, as ++# the code is smaller than 1KB, yet offers >3x improvement on MIPS64 ++# and 75-30% [less for longer keys] on MIPS32 over compiler-generated ++# code. 
++ ++###################################################################### ++# There is a number of MIPS ABI in use, O32 and N32/64 are most ++# widely used. Then there is a new contender: NUBI. It appears that if ++# one picks the latter, it's possible to arrange code in ABI neutral ++# manner. Therefore let's stick to NUBI register layout: ++# ++($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); ++($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); ++($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); ++($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); ++# ++# The return value is placed in $a0. Following coding rules facilitate ++# interoperability: ++# ++# - never ever touch $tp, "thread pointer", former $gp; ++# - copy return value to $t0, former $v0 [or to $a0 if you're adapting ++# old code]; ++# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; ++# ++# For reference here is register layout for N32/64 MIPS ABIs: ++# ++# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); ++# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); ++# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); ++# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); ++# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); ++# ++$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 ++ ++if ($flavour =~ /64|n32/i) { ++ $PTR_ADD="dadd"; # incidentally works even on n32 ++ $PTR_SUB="dsub"; # incidentally works even on n32 ++ $REG_S="sd"; ++ $REG_L="ld"; ++ $SZREG=8; ++} else { ++ $PTR_ADD="add"; ++ $PTR_SUB="sub"; ++ $REG_S="sw"; ++ $REG_L="lw"; ++ $SZREG=4; ++} ++$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; ++# ++# <appro@openssl.org> ++# ++###################################################################### ++ ++while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} ++open STDOUT,">$output"; ++ ++if ($flavour =~ /64|n32/i) { ++ $LD="ld"; ++ $ST="sd"; ++ $MULTU="dmultu"; ++ $ADDU="daddu"; ++ $SUBU="dsubu"; ++ $BNSZ=8; ++} else { ++ $LD="lw"; ++ $ST="sw"; ++ $MULTU="multu"; ++ $ADDU="addu"; ++ $SUBU="subu"; ++ $BNSZ=4; ++} ++ ++# int bn_mul_mont( ++$rp=$a0; # BN_ULONG *rp, ++$ap=$a1; # const BN_ULONG *ap, ++$bp=$a2; # const BN_ULONG *bp, ++$np=$a3; # const BN_ULONG *np, ++$n0=$a4; # const BN_ULONG *n0, ++$num=$a5; # int num); ++ ++$lo0=$a6; ++$hi0=$a7; ++$lo1=$t1; ++$hi1=$t2; ++$aj=$s0; ++$bi=$s1; ++$nj=$s2; ++$tp=$s3; ++$alo=$s4; ++$ahi=$s5; ++$nlo=$s6; ++$nhi=$s7; ++$tj=$s8; ++$i=$s9; ++$j=$s10; ++$m1=$s11; ++ ++$FRAMESIZE=14; ++ ++$code=<<___; ++.text ++ ++.set noat ++.set noreorder ++ ++.align 5 ++.globl bn_mul_mont ++.ent bn_mul_mont ++bn_mul_mont: ++___ ++$code.=<<___ if ($flavour =~ /o32/i); ++ lw $n0,16($sp) ++ lw $num,20($sp) ++___ ++$code.=<<___; ++ slt $at,$num,4 ++ bnez $at,1f ++ li $t0,0 ++ slt $at,$num,17 # on in-order CPU ++ bnezl $at,bn_mul_mont_internal ++ nop ++1: jr $ra ++ li $a0,0 ++.end bn_mul_mont ++ ++.align 5 ++.ent bn_mul_mont_internal ++bn_mul_mont_internal: ++ .frame $fp,$FRAMESIZE*$SZREG,$ra ++ .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG ++ $PTR_SUB $sp,$FRAMESIZE*$SZREG ++ $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) ++ $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) ++ $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp) ++ $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp) ++ $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp) ++ $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp) ++ $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp) ++ $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp) ++ $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp) ++ $REG_S 
$s2,($FRAMESIZE-11)*$SZREG($sp) ++ $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp) ++ $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp) ++___ ++$code.=<<___; ++ move $fp,$sp ++ ++ .set reorder ++ $LD $n0,0($n0) ++ $LD $bi,0($bp) # bp[0] ++ $LD $aj,0($ap) # ap[0] ++ $LD $nj,0($np) # np[0] ++ ++ $PTR_SUB $sp,2*$BNSZ # place for two extra words ++ sll $num,`log($BNSZ)/log(2)` ++ li $at,-4096 ++ $PTR_SUB $sp,$num ++ and $sp,$at ++ ++ $MULTU $aj,$bi ++ $LD $alo,$BNSZ($ap) ++ $LD $nlo,$BNSZ($np) ++ mflo $lo0 ++ mfhi $hi0 ++ $MULTU $lo0,$n0 ++ mflo $m1 ++ ++ $MULTU $alo,$bi ++ mflo $alo ++ mfhi $ahi ++ ++ $MULTU $nj,$m1 ++ mflo $lo1 ++ mfhi $hi1 ++ $MULTU $nlo,$m1 ++ $ADDU $lo1,$lo0 ++ sltu $at,$lo1,$lo0 ++ $ADDU $hi1,$at ++ mflo $nlo ++ mfhi $nhi ++ ++ move $tp,$sp ++ li $j,2*$BNSZ ++.align 4 ++.L1st: ++ .set noreorder ++ $PTR_ADD $aj,$ap,$j ++ $PTR_ADD $nj,$np,$j ++ $LD $aj,($aj) ++ $LD $nj,($nj) ++ ++ $MULTU $aj,$bi ++ $ADDU $lo0,$alo,$hi0 ++ $ADDU $lo1,$nlo,$hi1 ++ sltu $at,$lo0,$hi0 ++ sltu $t0,$lo1,$hi1 ++ $ADDU $hi0,$ahi,$at ++ $ADDU $hi1,$nhi,$t0 ++ mflo $alo ++ mfhi $ahi ++ ++ $ADDU $lo1,$lo0 ++ sltu $at,$lo1,$lo0 ++ $MULTU $nj,$m1 ++ $ADDU $hi1,$at ++ addu $j,$BNSZ ++ $ST $lo1,($tp) ++ sltu $t0,$j,$num ++ mflo $nlo ++ mfhi $nhi ++ ++ bnez $t0,.L1st ++ $PTR_ADD $tp,$BNSZ ++ .set reorder ++ ++ $ADDU $lo0,$alo,$hi0 ++ sltu $at,$lo0,$hi0 ++ $ADDU $hi0,$ahi,$at ++ ++ $ADDU $lo1,$nlo,$hi1 ++ sltu $t0,$lo1,$hi1 ++ $ADDU $hi1,$nhi,$t0 ++ $ADDU $lo1,$lo0 ++ sltu $at,$lo1,$lo0 ++ $ADDU $hi1,$at ++ ++ $ST $lo1,($tp) ++ ++ $ADDU $hi1,$hi0 ++ sltu $at,$hi1,$hi0 ++ $ST $hi1,$BNSZ($tp) ++ $ST $at,2*$BNSZ($tp) ++ ++ li $i,$BNSZ ++.align 4 ++.Louter: ++ $PTR_ADD $bi,$bp,$i ++ $LD $bi,($bi) ++ $LD $aj,($ap) ++ $LD $alo,$BNSZ($ap) ++ $LD $tj,($sp) ++ ++ $MULTU $aj,$bi ++ $LD $nj,($np) ++ $LD $nlo,$BNSZ($np) ++ mflo $lo0 ++ mfhi $hi0 ++ $ADDU $lo0,$tj ++ $MULTU $lo0,$n0 ++ sltu $at,$lo0,$tj ++ $ADDU $hi0,$at ++ mflo $m1 ++ ++ $MULTU $alo,$bi ++ mflo $alo ++ mfhi $ahi ++ ++ $MULTU $nj,$m1 ++ mflo $lo1 ++ mfhi $hi1 ++ ++ $MULTU $nlo,$m1 ++ $ADDU $lo1,$lo0 ++ sltu $at,$lo1,$lo0 ++ $ADDU $hi1,$at ++ mflo $nlo ++ mfhi $nhi ++ ++ move $tp,$sp ++ li $j,2*$BNSZ ++ $LD $tj,$BNSZ($tp) ++.align 4 ++.Linner: ++ .set noreorder ++ $PTR_ADD $aj,$ap,$j ++ $PTR_ADD $nj,$np,$j ++ $LD $aj,($aj) ++ $LD $nj,($nj) ++ ++ $MULTU $aj,$bi ++ $ADDU $lo0,$alo,$hi0 ++ $ADDU $lo1,$nlo,$hi1 ++ sltu $at,$lo0,$hi0 ++ sltu $t0,$lo1,$hi1 ++ $ADDU $hi0,$ahi,$at ++ $ADDU $hi1,$nhi,$t0 ++ mflo $alo ++ mfhi $ahi ++ ++ $ADDU $lo0,$tj ++ addu $j,$BNSZ ++ $MULTU $nj,$m1 ++ sltu $at,$lo0,$tj ++ $ADDU $lo1,$lo0 ++ $ADDU $hi0,$at ++ sltu $t0,$lo1,$lo0 ++ $LD $tj,2*$BNSZ($tp) ++ $ADDU $hi1,$t0 ++ sltu $at,$j,$num ++ mflo $nlo ++ mfhi $nhi ++ $ST $lo1,($tp) ++ bnez $at,.Linner ++ $PTR_ADD $tp,$BNSZ ++ .set reorder ++ ++ $ADDU $lo0,$alo,$hi0 ++ sltu $at,$lo0,$hi0 ++ $ADDU $hi0,$ahi,$at ++ $ADDU $lo0,$tj ++ sltu $t0,$lo0,$tj ++ $ADDU $hi0,$t0 ++ ++ $LD $tj,2*$BNSZ($tp) ++ $ADDU $lo1,$nlo,$hi1 ++ sltu $at,$lo1,$hi1 ++ $ADDU $hi1,$nhi,$at ++ $ADDU $lo1,$lo0 ++ sltu $t0,$lo1,$lo0 ++ $ADDU $hi1,$t0 ++ $ST $lo1,($tp) ++ ++ $ADDU $lo1,$hi1,$hi0 ++ sltu $hi1,$lo1,$hi0 ++ $ADDU $lo1,$tj ++ sltu $at,$lo1,$tj ++ $ADDU $hi1,$at ++ $ST $lo1,$BNSZ($tp) ++ $ST $hi1,2*$BNSZ($tp) ++ ++ addu $i,$BNSZ ++ sltu $t0,$i,$num ++ bnez $t0,.Louter ++ ++ .set noreorder ++ $PTR_ADD $tj,$sp,$num # &tp[num] ++ move $tp,$sp ++ move $ap,$sp ++ li $hi0,0 # clear borrow bit ++ ++.align 4 ++.Lsub: $LD $lo0,($tp) ++ $LD $lo1,($np) ++ $PTR_ADD $tp,$BNSZ ++ $PTR_ADD $np,$BNSZ ++ $SUBU $lo1,$lo0,$lo1 # 
tp[i]-np[i] ++ sgtu $at,$lo1,$lo0 ++ $SUBU $lo0,$lo1,$hi0 ++ sgtu $hi0,$lo0,$lo1 ++ $ST $lo0,($rp) ++ or $hi0,$at ++ sltu $at,$tp,$tj ++ bnez $at,.Lsub ++ $PTR_ADD $rp,$BNSZ ++ ++ $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit ++ move $tp,$sp ++ $PTR_SUB $rp,$num # restore rp ++ not $hi1,$hi0 ++ ++ and $ap,$hi0,$sp ++ and $bp,$hi1,$rp ++ or $ap,$ap,$bp # ap=borrow?tp:rp ++ ++.align 4 ++.Lcopy: $LD $aj,($ap) ++ $PTR_ADD $ap,$BNSZ ++ $ST $zero,($tp) ++ $PTR_ADD $tp,$BNSZ ++ sltu $at,$tp,$tj ++ $ST $aj,($rp) ++ bnez $at,.Lcopy ++ $PTR_ADD $rp,$BNSZ ++ ++ li $a0,1 ++ li $t0,1 ++ ++ .set noreorder ++ move $sp,$fp ++ $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) ++ $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) ++ $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp) ++ $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp) ++ $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp) ++ $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp) ++ $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp) ++ $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp) ++ $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp) ++ $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp) ++ $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp) ++ $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp) ++___ ++$code.=<<___; ++ jr $ra ++ $PTR_ADD $sp,$FRAMESIZE*$SZREG ++.end bn_mul_mont_internal ++.rdata ++.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>" ++___ ++ ++$code =~ s/\`([^\`]*)\`/eval $1/gem; ++ ++print $code; ++close STDOUT; +diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl +new file mode 100644 +index 0000000..f04b3b9 +--- /dev/null ++++ b/crypto/bn/asm/mips.pl +@@ -0,0 +1,2585 @@ ++#!/usr/bin/env perl ++# ++# ==================================================================== ++# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL ++# project. ++# ++# Rights for redistribution and usage in source and binary forms are ++# granted according to the OpenSSL license. Warranty of any kind is ++# disclaimed. ++# ==================================================================== ++ ++ ++# July 1999 ++# ++# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. ++# ++# The module is designed to work with either of the "new" MIPS ABI(5), ++# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under ++# IRIX 5.x not only because it doesn't support new ABIs but also ++# because 5.x kernels put R4x00 CPU into 32-bit mode and all those ++# 64-bit instructions (daddu, dmultu, etc.) found below gonna only ++# cause illegal instruction exception:-( ++# ++# In addition the code depends on preprocessor flags set up by MIPSpro ++# compiler driver (either as or cc) and therefore (probably?) can't be ++# compiled by the GNU assembler. GNU C driver manages fine though... ++# I mean as long as -mmips-as is specified or is the default option, ++# because then it simply invokes /usr/bin/as which in turn takes ++# perfect care of the preprocessor definitions. Another neat feature ++# offered by the MIPSpro assembler is an optimization pass. This gave ++# me the opportunity to have the code looking more regular as all those ++# architecture dependent instruction rescheduling details were left to ++# the assembler. Cool, huh? ++# ++# Performance improvement is astonishing! 'apps/openssl speed rsa dsa' ++# goes way over 3 times faster! ++# ++# <appro@fy.chalmers.se> ++ ++# October 2010 ++# ++# Adapt the module even for 32-bit ABIs and other OSes. 
The former was ++# achieved by mechanical replacement of 64-bit arithmetic instructions ++# such as dmultu, daddu, etc. with their 32-bit counterparts and ++# adjusting offsets denoting multiples of BN_ULONG. Above mentioned ++# >3x performance improvement naturally does not apply to 32-bit code ++# [because there is no instruction 32-bit compiler can't use], one ++# has to content with 40-85% improvement depending on benchmark and ++# key length, more for longer keys. ++ ++$flavour = shift; ++while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} ++open STDOUT,">$output"; ++ ++if ($flavour =~ /64|n32/i) { ++ $LD="ld"; ++ $ST="sd"; ++ $MULTU="dmultu"; ++ $DIVU="ddivu"; ++ $ADDU="daddu"; ++ $SUBU="dsubu"; ++ $SRL="dsrl"; ++ $SLL="dsll"; ++ $BNSZ=8; ++ $PTR_ADD="daddu"; ++ $PTR_SUB="dsubu"; ++ $SZREG=8; ++ $REG_S="sd"; ++ $REG_L="ld"; ++} else { ++ $LD="lw"; ++ $ST="sw"; ++ $MULTU="multu"; ++ $DIVU="divu"; ++ $ADDU="addu"; ++ $SUBU="subu"; ++ $SRL="srl"; ++ $SLL="sll"; ++ $BNSZ=4; ++ $PTR_ADD="addu"; ++ $PTR_SUB="subu"; ++ $SZREG=4; ++ $REG_S="sw"; ++ $REG_L="lw"; ++ $code=".set mips2\n"; ++} ++ ++# Below is N32/64 register layout used in the original module. ++# ++($zero,$at,$v0,$v1)=map("\$$_",(0..3)); ++($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); ++($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); ++($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); ++($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); ++($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); ++# ++# No special adaptation is required for O32. NUBI on the other hand ++# is treated by saving/restoring ($v1,$t0..$t3). ++ ++$gp=$v1 if ($flavour =~ /nubi/i); ++ ++$minus4=$v1; ++ ++$code.=<<___; ++.rdata ++.asciiz "mips3.s, Version 1.2" ++.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" ++ ++.text ++.set noat ++ ++.align 5 ++.globl bn_mul_add_words ++.ent bn_mul_add_words ++bn_mul_add_words: ++ .set noreorder ++ bgtz $a2,bn_mul_add_words_internal ++ move $v0,$zero ++ jr $ra ++ move $a0,$v0 ++.end bn_mul_add_words ++ ++.align 5 ++.ent bn_mul_add_words_internal ++bn_mul_add_words_internal: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S $ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ .set reorder ++ li $minus4,-4 ++ and $ta0,$a2,$minus4 ++ $LD $t0,0($a1) ++ beqz $ta0,.L_bn_mul_add_words_tail ++ ++.L_bn_mul_add_words_loop: ++ $MULTU $t0,$a3 ++ $LD $t1,0($a0) ++ $LD $t2,$BNSZ($a1) ++ $LD $t3,$BNSZ($a0) ++ $LD $ta0,2*$BNSZ($a1) ++ $LD $ta1,2*$BNSZ($a0) ++ $ADDU $t1,$v0 ++ sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit ++ # values", but it seems to work fine ++ # even on 64-bit registers. 
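++
++			# In effect each iteration computes the C-like sketch
++			#	t = a[i]*w + r[i] + c	(double-width t)
++			#	r[i] = lo(t); c = hi(t)
++			# where lo/hi come from mflo/mfhi below and the two
++			# possible carry-outs are folded into the carry word.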
++ mflo $at ++ mfhi $t0 ++ $ADDU $t1,$at ++ $ADDU $v0,$t0 ++ $MULTU $t2,$a3 ++ sltu $at,$t1,$at ++ $ST $t1,0($a0) ++ $ADDU $v0,$at ++ ++ $LD $ta2,3*$BNSZ($a1) ++ $LD $ta3,3*$BNSZ($a0) ++ $ADDU $t3,$v0 ++ sltu $v0,$t3,$v0 ++ mflo $at ++ mfhi $t2 ++ $ADDU $t3,$at ++ $ADDU $v0,$t2 ++ $MULTU $ta0,$a3 ++ sltu $at,$t3,$at ++ $ST $t3,$BNSZ($a0) ++ $ADDU $v0,$at ++ ++ subu $a2,4 ++ $PTR_ADD $a0,4*$BNSZ ++ $PTR_ADD $a1,4*$BNSZ ++ $ADDU $ta1,$v0 ++ sltu $v0,$ta1,$v0 ++ mflo $at ++ mfhi $ta0 ++ $ADDU $ta1,$at ++ $ADDU $v0,$ta0 ++ $MULTU $ta2,$a3 ++ sltu $at,$ta1,$at ++ $ST $ta1,-2*$BNSZ($a0) ++ $ADDU $v0,$at ++ ++ ++ and $ta0,$a2,$minus4 ++ $ADDU $ta3,$v0 ++ sltu $v0,$ta3,$v0 ++ mflo $at ++ mfhi $ta2 ++ $ADDU $ta3,$at ++ $ADDU $v0,$ta2 ++ sltu $at,$ta3,$at ++ $ST $ta3,-$BNSZ($a0) ++ $ADDU $v0,$at ++ .set noreorder ++ bgtzl $ta0,.L_bn_mul_add_words_loop ++ $LD $t0,0($a1) ++ ++ beqz $a2,.L_bn_mul_add_words_return ++ nop ++ ++.L_bn_mul_add_words_tail: ++ .set reorder ++ $LD $t0,0($a1) ++ $MULTU $t0,$a3 ++ $LD $t1,0($a0) ++ subu $a2,1 ++ $ADDU $t1,$v0 ++ sltu $v0,$t1,$v0 ++ mflo $at ++ mfhi $t0 ++ $ADDU $t1,$at ++ $ADDU $v0,$t0 ++ sltu $at,$t1,$at ++ $ST $t1,0($a0) ++ $ADDU $v0,$at ++ beqz $a2,.L_bn_mul_add_words_return ++ ++ $LD $t0,$BNSZ($a1) ++ $MULTU $t0,$a3 ++ $LD $t1,$BNSZ($a0) ++ subu $a2,1 ++ $ADDU $t1,$v0 ++ sltu $v0,$t1,$v0 ++ mflo $at ++ mfhi $t0 ++ $ADDU $t1,$at ++ $ADDU $v0,$t0 ++ sltu $at,$t1,$at ++ $ST $t1,$BNSZ($a0) ++ $ADDU $v0,$at ++ beqz $a2,.L_bn_mul_add_words_return ++ ++ $LD $t0,2*$BNSZ($a1) ++ $MULTU $t0,$a3 ++ $LD $t1,2*$BNSZ($a0) ++ $ADDU $t1,$v0 ++ sltu $v0,$t1,$v0 ++ mflo $at ++ mfhi $t0 ++ $ADDU $t1,$at ++ $ADDU $v0,$t0 ++ sltu $at,$t1,$at ++ $ST $t1,2*$BNSZ($a0) ++ $ADDU $v0,$at ++ ++.L_bn_mul_add_words_return: ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ move $a0,$v0 ++.end bn_mul_add_words_internal ++ ++.align 5 ++.globl bn_mul_words ++.ent bn_mul_words ++bn_mul_words: ++ .set noreorder ++ bgtz $a2,bn_mul_words_internal ++ move $v0,$zero ++ jr $ra ++ move $a0,$v0 ++.end bn_mul_words ++ ++.align 5 ++.ent bn_mul_words_internal ++bn_mul_words_internal: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S $ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ .set reorder ++ li $minus4,-4 ++ and $ta0,$a2,$minus4 ++ $LD $t0,0($a1) ++ beqz $ta0,.L_bn_mul_words_tail ++ ++.L_bn_mul_words_loop: ++ $MULTU $t0,$a3 ++ $LD $t2,$BNSZ($a1) ++ $LD $ta0,2*$BNSZ($a1) ++ $LD $ta2,3*$BNSZ($a1) ++ mflo $at ++ mfhi $t0 ++ $ADDU $v0,$at ++ sltu $t1,$v0,$at ++ $MULTU $t2,$a3 ++ $ST $v0,0($a0) ++ $ADDU $v0,$t1,$t0 ++ ++ subu $a2,4 ++ $PTR_ADD $a0,4*$BNSZ ++ $PTR_ADD $a1,4*$BNSZ ++ mflo $at ++ mfhi $t2 ++ $ADDU $v0,$at ++ sltu $t3,$v0,$at ++ $MULTU $ta0,$a3 ++ $ST $v0,-3*$BNSZ($a0) ++ $ADDU $v0,$t3,$t2 ++ ++ mflo $at ++ mfhi $ta0 ++ $ADDU $v0,$at ++ sltu $ta1,$v0,$at ++ $MULTU $ta2,$a3 ++ $ST $v0,-2*$BNSZ($a0) ++ $ADDU $v0,$ta1,$ta0 ++ ++ and $ta0,$a2,$minus4 ++ mflo $at ++ mfhi $ta2 ++ $ADDU $v0,$at ++ sltu $ta3,$v0,$at ++ $ST $v0,-$BNSZ($a0) ++ $ADDU $v0,$ta3,$ta2 ++ .set noreorder ++ bgtzl $ta0,.L_bn_mul_words_loop ++ $LD $t0,0($a1) ++ ++ beqz 
$a2,.L_bn_mul_words_return ++ nop ++ ++.L_bn_mul_words_tail: ++ .set reorder ++ $LD $t0,0($a1) ++ $MULTU $t0,$a3 ++ subu $a2,1 ++ mflo $at ++ mfhi $t0 ++ $ADDU $v0,$at ++ sltu $t1,$v0,$at ++ $ST $v0,0($a0) ++ $ADDU $v0,$t1,$t0 ++ beqz $a2,.L_bn_mul_words_return ++ ++ $LD $t0,$BNSZ($a1) ++ $MULTU $t0,$a3 ++ subu $a2,1 ++ mflo $at ++ mfhi $t0 ++ $ADDU $v0,$at ++ sltu $t1,$v0,$at ++ $ST $v0,$BNSZ($a0) ++ $ADDU $v0,$t1,$t0 ++ beqz $a2,.L_bn_mul_words_return ++ ++ $LD $t0,2*$BNSZ($a1) ++ $MULTU $t0,$a3 ++ mflo $at ++ mfhi $t0 ++ $ADDU $v0,$at ++ sltu $t1,$v0,$at ++ $ST $v0,2*$BNSZ($a0) ++ $ADDU $v0,$t1,$t0 ++ ++.L_bn_mul_words_return: ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ move $a0,$v0 ++.end bn_mul_words_internal ++ ++.align 5 ++.globl bn_sqr_words ++.ent bn_sqr_words ++bn_sqr_words: ++ .set noreorder ++ bgtz $a2,bn_sqr_words_internal ++ move $v0,$zero ++ jr $ra ++ move $a0,$v0 ++.end bn_sqr_words ++ ++.align 5 ++.ent bn_sqr_words_internal ++bn_sqr_words_internal: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S $ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ .set reorder ++ li $minus4,-4 ++ and $ta0,$a2,$minus4 ++ $LD $t0,0($a1) ++ beqz $ta0,.L_bn_sqr_words_tail ++ ++.L_bn_sqr_words_loop: ++ $MULTU $t0,$t0 ++ $LD $t2,$BNSZ($a1) ++ $LD $ta0,2*$BNSZ($a1) ++ $LD $ta2,3*$BNSZ($a1) ++ mflo $t1 ++ mfhi $t0 ++ $ST $t1,0($a0) ++ $ST $t0,$BNSZ($a0) ++ ++ $MULTU $t2,$t2 ++ subu $a2,4 ++ $PTR_ADD $a0,8*$BNSZ ++ $PTR_ADD $a1,4*$BNSZ ++ mflo $t3 ++ mfhi $t2 ++ $ST $t3,-6*$BNSZ($a0) ++ $ST $t2,-5*$BNSZ($a0) ++ ++ $MULTU $ta0,$ta0 ++ mflo $ta1 ++ mfhi $ta0 ++ $ST $ta1,-4*$BNSZ($a0) ++ $ST $ta0,-3*$BNSZ($a0) ++ ++ ++ $MULTU $ta2,$ta2 ++ and $ta0,$a2,$minus4 ++ mflo $ta3 ++ mfhi $ta2 ++ $ST $ta3,-2*$BNSZ($a0) ++ $ST $ta2,-$BNSZ($a0) ++ ++ .set noreorder ++ bgtzl $ta0,.L_bn_sqr_words_loop ++ $LD $t0,0($a1) ++ ++ beqz $a2,.L_bn_sqr_words_return ++ nop ++ ++.L_bn_sqr_words_tail: ++ .set reorder ++ $LD $t0,0($a1) ++ $MULTU $t0,$t0 ++ subu $a2,1 ++ mflo $t1 ++ mfhi $t0 ++ $ST $t1,0($a0) ++ $ST $t0,$BNSZ($a0) ++ beqz $a2,.L_bn_sqr_words_return ++ ++ $LD $t0,$BNSZ($a1) ++ $MULTU $t0,$t0 ++ subu $a2,1 ++ mflo $t1 ++ mfhi $t0 ++ $ST $t1,2*$BNSZ($a0) ++ $ST $t0,3*$BNSZ($a0) ++ beqz $a2,.L_bn_sqr_words_return ++ ++ $LD $t0,2*$BNSZ($a1) ++ $MULTU $t0,$t0 ++ mflo $t1 ++ mfhi $t0 ++ $ST $t1,4*$BNSZ($a0) ++ $ST $t0,5*$BNSZ($a0) ++ ++.L_bn_sqr_words_return: ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ move $a0,$v0 ++ ++.end bn_sqr_words_internal ++ ++.align 5 ++.globl bn_add_words ++.ent bn_add_words ++bn_add_words: ++ .set noreorder ++ bgtz $a3,bn_add_words_internal ++ move $v0,$zero ++ jr $ra ++ move $a0,$v0 ++.end bn_add_words ++ ++.align 5 ++.ent bn_add_words_internal ++bn_add_words_internal: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S 
$ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ .set reorder ++ li $minus4,-4 ++ and $at,$a3,$minus4 ++ $LD $t0,0($a1) ++ beqz $at,.L_bn_add_words_tail ++ ++.L_bn_add_words_loop: ++ $LD $ta0,0($a2) ++ subu $a3,4 ++ $LD $t1,$BNSZ($a1) ++ and $at,$a3,$minus4 ++ $LD $t2,2*$BNSZ($a1) ++ $PTR_ADD $a2,4*$BNSZ ++ $LD $t3,3*$BNSZ($a1) ++ $PTR_ADD $a0,4*$BNSZ ++ $LD $ta1,-3*$BNSZ($a2) ++ $PTR_ADD $a1,4*$BNSZ ++ $LD $ta2,-2*$BNSZ($a2) ++ $LD $ta3,-$BNSZ($a2) ++ $ADDU $ta0,$t0 ++ sltu $t8,$ta0,$t0 ++ $ADDU $t0,$ta0,$v0 ++ sltu $v0,$t0,$ta0 ++ $ST $t0,-4*$BNSZ($a0) ++ $ADDU $v0,$t8 ++ ++ $ADDU $ta1,$t1 ++ sltu $t9,$ta1,$t1 ++ $ADDU $t1,$ta1,$v0 ++ sltu $v0,$t1,$ta1 ++ $ST $t1,-3*$BNSZ($a0) ++ $ADDU $v0,$t9 ++ ++ $ADDU $ta2,$t2 ++ sltu $t8,$ta2,$t2 ++ $ADDU $t2,$ta2,$v0 ++ sltu $v0,$t2,$ta2 ++ $ST $t2,-2*$BNSZ($a0) ++ $ADDU $v0,$t8 ++ ++ $ADDU $ta3,$t3 ++ sltu $t9,$ta3,$t3 ++ $ADDU $t3,$ta3,$v0 ++ sltu $v0,$t3,$ta3 ++ $ST $t3,-$BNSZ($a0) ++ $ADDU $v0,$t9 ++ ++ .set noreorder ++ bgtzl $at,.L_bn_add_words_loop ++ $LD $t0,0($a1) ++ ++ beqz $a3,.L_bn_add_words_return ++ nop ++ ++.L_bn_add_words_tail: ++ .set reorder ++ $LD $t0,0($a1) ++ $LD $ta0,0($a2) ++ $ADDU $ta0,$t0 ++ subu $a3,1 ++ sltu $t8,$ta0,$t0 ++ $ADDU $t0,$ta0,$v0 ++ sltu $v0,$t0,$ta0 ++ $ST $t0,0($a0) ++ $ADDU $v0,$t8 ++ beqz $a3,.L_bn_add_words_return ++ ++ $LD $t1,$BNSZ($a1) ++ $LD $ta1,$BNSZ($a2) ++ $ADDU $ta1,$t1 ++ subu $a3,1 ++ sltu $t9,$ta1,$t1 ++ $ADDU $t1,$ta1,$v0 ++ sltu $v0,$t1,$ta1 ++ $ST $t1,$BNSZ($a0) ++ $ADDU $v0,$t9 ++ beqz $a3,.L_bn_add_words_return ++ ++ $LD $t2,2*$BNSZ($a1) ++ $LD $ta2,2*$BNSZ($a2) ++ $ADDU $ta2,$t2 ++ sltu $t8,$ta2,$t2 ++ $ADDU $t2,$ta2,$v0 ++ sltu $v0,$t2,$ta2 ++ $ST $t2,2*$BNSZ($a0) ++ $ADDU $v0,$t8 ++ ++.L_bn_add_words_return: ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ move $a0,$v0 ++ ++.end bn_add_words_internal ++ ++.align 5 ++.globl bn_sub_words ++.ent bn_sub_words ++bn_sub_words: ++ .set noreorder ++ bgtz $a3,bn_sub_words_internal ++ move $v0,$zero ++ jr $ra ++ move $a0,$zero ++.end bn_sub_words ++ ++.align 5 ++.ent bn_sub_words_internal ++bn_sub_words_internal: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S $ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ .set reorder ++ li $minus4,-4 ++ and $at,$a3,$minus4 ++ $LD $t0,0($a1) ++ beqz $at,.L_bn_sub_words_tail ++ ++.L_bn_sub_words_loop: ++ $LD $ta0,0($a2) ++ subu $a3,4 ++ $LD $t1,$BNSZ($a1) ++ and $at,$a3,$minus4 ++ $LD $t2,2*$BNSZ($a1) ++ $PTR_ADD $a2,4*$BNSZ ++ $LD $t3,3*$BNSZ($a1) ++ $PTR_ADD $a0,4*$BNSZ ++ $LD $ta1,-3*$BNSZ($a2) ++ $PTR_ADD $a1,4*$BNSZ ++ $LD $ta2,-2*$BNSZ($a2) ++ $LD $ta3,-$BNSZ($a2) ++ sltu $t8,$t0,$ta0 ++ $SUBU $ta0,$t0,$ta0 ++ $SUBU $t0,$ta0,$v0 ++ sgtu $v0,$t0,$ta0 ++ $ST $t0,-4*$BNSZ($a0) ++ $ADDU $v0,$t8 ++ ++ sltu $t9,$t1,$ta1 ++ $SUBU $ta1,$t1,$ta1 ++ $SUBU $t1,$ta1,$v0 ++ sgtu $v0,$t1,$ta1 ++ $ST $t1,-3*$BNSZ($a0) ++ $ADDU $v0,$t9 ++ ++ ++ sltu $t8,$t2,$ta2 ++ $SUBU $ta2,$t2,$ta2 ++ $SUBU $t2,$ta2,$v0 ++ sgtu $v0,$t2,$ta2 ++ $ST 
$t2,-2*$BNSZ($a0) ++ $ADDU $v0,$t8 ++ ++ sltu $t9,$t3,$ta3 ++ $SUBU $ta3,$t3,$ta3 ++ $SUBU $t3,$ta3,$v0 ++ sgtu $v0,$t3,$ta3 ++ $ST $t3,-$BNSZ($a0) ++ $ADDU $v0,$t9 ++ ++ .set noreorder ++ bgtzl $at,.L_bn_sub_words_loop ++ $LD $t0,0($a1) ++ ++ beqz $a3,.L_bn_sub_words_return ++ nop ++ ++.L_bn_sub_words_tail: ++ .set reorder ++ $LD $t0,0($a1) ++ $LD $ta0,0($a2) ++ subu $a3,1 ++ sltu $t8,$t0,$ta0 ++ $SUBU $ta0,$t0,$ta0 ++ $SUBU $t0,$ta0,$v0 ++ sgtu $v0,$t0,$ta0 ++ $ST $t0,0($a0) ++ $ADDU $v0,$t8 ++ beqz $a3,.L_bn_sub_words_return ++ ++ $LD $t1,$BNSZ($a1) ++ subu $a3,1 ++ $LD $ta1,$BNSZ($a2) ++ sltu $t9,$t1,$ta1 ++ $SUBU $ta1,$t1,$ta1 ++ $SUBU $t1,$ta1,$v0 ++ sgtu $v0,$t1,$ta1 ++ $ST $t1,$BNSZ($a0) ++ $ADDU $v0,$t9 ++ beqz $a3,.L_bn_sub_words_return ++ ++ $LD $t2,2*$BNSZ($a1) ++ $LD $ta2,2*$BNSZ($a2) ++ sltu $t8,$t2,$ta2 ++ $SUBU $ta2,$t2,$ta2 ++ $SUBU $t2,$ta2,$v0 ++ sgtu $v0,$t2,$ta2 ++ $ST $t2,2*$BNSZ($a0) ++ $ADDU $v0,$t8 ++ ++.L_bn_sub_words_return: ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ move $a0,$v0 ++.end bn_sub_words_internal ++ ++.align 5 ++.globl bn_div_3_words ++.ent bn_div_3_words ++bn_div_3_words: ++ .set noreorder ++ move $a3,$a0 # we know that bn_div_words does not ++ # touch $a3, $ta2, $ta3 and preserves $a2 ++ # so that we can save two arguments ++ # and return address in registers ++ # instead of stack:-) ++ ++ $LD $a0,($a3) ++ move $ta2,$a1 ++ bne $a0,$a2,bn_div_3_words_internal ++ $LD $a1,-$BNSZ($a3) ++ li $v0,-1 ++ jr $ra ++ move $a0,$v0 ++.end bn_div_3_words ++ ++.align 5 ++.ent bn_div_3_words_internal ++bn_div_3_words_internal: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S $ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ .set reorder ++ move $ta3,$ra ++ bal bn_div_words ++ move $ra,$ta3 ++ $MULTU $ta2,$v0 ++ $LD $t2,-2*$BNSZ($a3) ++ move $ta0,$zero ++ mfhi $t1 ++ mflo $t0 ++ sltu $t8,$t1,$a1 ++.L_bn_div_3_words_inner_loop: ++ bnez $t8,.L_bn_div_3_words_inner_loop_done ++ sgeu $at,$t2,$t0 ++ seq $t9,$t1,$a1 ++ and $at,$t9 ++ sltu $t3,$t0,$ta2 ++ $ADDU $a1,$a2 ++ $SUBU $t1,$t3 ++ $SUBU $t0,$ta2 ++ sltu $t8,$t1,$a1 ++ sltu $ta0,$a1,$a2 ++ or $t8,$ta0 ++ .set noreorder ++ beqzl $at,.L_bn_div_3_words_inner_loop ++ $SUBU $v0,1 ++ .set reorder ++.L_bn_div_3_words_inner_loop_done: ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ move $a0,$v0 ++.end bn_div_3_words_internal ++ ++.align 5 ++.globl bn_div_words ++.ent bn_div_words ++bn_div_words: ++ .set noreorder ++ bnez $a2,bn_div_words_internal ++ li $v0,-1 # I would rather signal div-by-zero ++ # which can be done with 'break 7' ++ jr $ra ++ move $a0,$v0 ++.end bn_div_words ++ ++.align 5 ++.ent bn_div_words_internal ++bn_div_words_internal: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S $ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S 
$t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ move $v1,$zero ++ bltz $a2,.L_bn_div_words_body ++ move $t9,$v1 ++ $SLL $a2,1 ++ bgtz $a2,.-4 ++ addu $t9,1 ++ ++ .set reorder ++ negu $t1,$t9 ++ li $t2,-1 ++ $SLL $t2,$t1 ++ and $t2,$a0 ++ $SRL $at,$a1,$t1 ++ .set noreorder ++ bnezl $t2,.+8 ++ break 6 # signal overflow ++ .set reorder ++ $SLL $a0,$t9 ++ $SLL $a1,$t9 ++ or $a0,$at ++___ ++$QT=$ta0; ++$HH=$ta1; ++$DH=$v1; ++$code.=<<___; ++.L_bn_div_words_body: ++ $SRL $DH,$a2,4*$BNSZ # bits ++ sgeu $at,$a0,$a2 ++ .set noreorder ++ bnezl $at,.+8 ++ $SUBU $a0,$a2 ++ .set reorder ++ ++ li $QT,-1 ++ $SRL $HH,$a0,4*$BNSZ # bits ++ $SRL $QT,4*$BNSZ # q=0xffffffff ++ beq $DH,$HH,.L_bn_div_words_skip_div1 ++ $DIVU $zero,$a0,$DH ++ mflo $QT ++.L_bn_div_words_skip_div1: ++ $MULTU $a2,$QT ++ $SLL $t3,$a0,4*$BNSZ # bits ++ $SRL $at,$a1,4*$BNSZ # bits ++ or $t3,$at ++ mflo $t0 ++ mfhi $t1 ++.L_bn_div_words_inner_loop1: ++ sltu $t2,$t3,$t0 ++ seq $t8,$HH,$t1 ++ sltu $at,$HH,$t1 ++ and $t2,$t8 ++ sltu $v0,$t0,$a2 ++ or $at,$t2 ++ .set noreorder ++ beqz $at,.L_bn_div_words_inner_loop1_done ++ $SUBU $t1,$v0 ++ $SUBU $t0,$a2 ++ b .L_bn_div_words_inner_loop1 ++ $SUBU $QT,1 ++ .set reorder ++.L_bn_div_words_inner_loop1_done: ++ ++ $SLL $a1,4*$BNSZ # bits ++ $SUBU $a0,$t3,$t0 ++ $SLL $v0,$QT,4*$BNSZ # bits ++ ++ li $QT,-1 ++ $SRL $HH,$a0,4*$BNSZ # bits ++ $SRL $QT,4*$BNSZ # q=0xffffffff ++ beq $DH,$HH,.L_bn_div_words_skip_div2 ++ $DIVU $zero,$a0,$DH ++ mflo $QT ++.L_bn_div_words_skip_div2: ++ $MULTU $a2,$QT ++ $SLL $t3,$a0,4*$BNSZ # bits ++ $SRL $at,$a1,4*$BNSZ # bits ++ or $t3,$at ++ mflo $t0 ++ mfhi $t1 ++.L_bn_div_words_inner_loop2: ++ sltu $t2,$t3,$t0 ++ seq $t8,$HH,$t1 ++ sltu $at,$HH,$t1 ++ and $t2,$t8 ++ sltu $v1,$t0,$a2 ++ or $at,$t2 ++ .set noreorder ++ beqz $at,.L_bn_div_words_inner_loop2_done ++ $SUBU $t1,$v1 ++ $SUBU $t0,$a2 ++ b .L_bn_div_words_inner_loop2 ++ $SUBU $QT,1 ++ .set reorder ++.L_bn_div_words_inner_loop2_done: ++ ++ $SUBU $a0,$t3,$t0 ++ or $v0,$QT ++ $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it ++ $SRL $a2,$t9 # restore $a2 ++ ++ .set noreorder ++ move $a1,$v1 ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ move $a0,$v0 ++.end bn_div_words_internal ++___ ++undef $HH; undef $QT; undef $DH; ++ ++($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); ++($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); ++ ++($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 ++($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 ++ ++($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); ++ ++$code.=<<___; ++ ++.align 5 ++.globl bn_mul_comba8 ++.ent bn_mul_comba8 ++bn_mul_comba8: ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,12*$SZREG,$ra ++ .mask 0x803ff008,-$SZREG ++ $PTR_SUB $sp,12*$SZREG ++ $REG_S $ra,11*$SZREG($sp) ++ $REG_S $s5,10*$SZREG($sp) ++ $REG_S $s4,9*$SZREG($sp) ++ $REG_S $s3,8*$SZREG($sp) ++ $REG_S $s2,7*$SZREG($sp) ++ $REG_S $s1,6*$SZREG($sp) ++ $REG_S $s0,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___ if ($flavour !~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x003f0000,-$SZREG ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S 
$s5,5*$SZREG($sp) ++ $REG_S $s4,4*$SZREG($sp) ++ $REG_S $s3,3*$SZREG($sp) ++ $REG_S $s2,2*$SZREG($sp) ++ $REG_S $s1,1*$SZREG($sp) ++ $REG_S $s0,0*$SZREG($sp) ++___ ++$code.=<<___; ++ ++ .set reorder ++ $LD $a_0,0($a1) # If compiled with -mips3 option on ++ # R5000 box assembler barks on this ++ # 1ine with "should not have mult/div ++ # as last instruction in bb (R10K ++ # bug)" warning. If anybody out there ++ # has a clue about how to circumvent ++ # this do send me a note. ++ # <appro\@fy.chalmers.se> ++ ++ $LD $b_0,0($a2) ++ $LD $a_1,$BNSZ($a1) ++ $LD $a_2,2*$BNSZ($a1) ++ $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); ++ $LD $a_3,3*$BNSZ($a1) ++ $LD $b_1,$BNSZ($a2) ++ $LD $b_2,2*$BNSZ($a2) ++ $LD $b_3,3*$BNSZ($a2) ++ mflo $c_1 ++ mfhi $c_2 ++ ++ $LD $a_4,4*$BNSZ($a1) ++ $LD $a_5,5*$BNSZ($a1) ++ $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); ++ $LD $a_6,6*$BNSZ($a1) ++ $LD $a_7,7*$BNSZ($a1) ++ $LD $b_4,4*$BNSZ($a2) ++ $LD $b_5,5*$BNSZ($a2) ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); ++ $ADDU $c_3,$t_2,$at ++ $LD $b_6,6*$BNSZ($a2) ++ $LD $b_7,7*$BNSZ($a2) ++ $ST $c_1,0($a0) # r[0]=c1; ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $c_1,$c_3,$t_2 ++ $ST $c_2,$BNSZ($a0) # r[1]=c2; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $c_2,$c_1,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,2*$BNSZ($a0) # r[2]=c3; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $c_3,$c_2,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,3*$BNSZ($a0) # r[3]=c1; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $c_1,$c_3,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU 
$c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ $ST $c_2,4*$BNSZ($a0) # r[4]=c2; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $c_2,$c_1,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,5*$BNSZ($a0) # r[5]=c3; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $c_3,$c_2,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,6*$BNSZ($a0) # r[6]=c1; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $c_1,$c_3,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); ++ $ADDU 
$t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ $ST $c_2,7*$BNSZ($a0) # r[7]=c2; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $c_2,$c_1,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,8*$BNSZ($a0) # r[8]=c3; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $c_3,$c_2,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,9*$BNSZ($a0) # r[9]=c1; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu 
$at,$c_2,$t_1 ++ $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $c_1,$c_3,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ $ST $c_2,10*$BNSZ($a0) # r[10]=c2; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $c_2,$c_1,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,11*$BNSZ($a0) # r[11]=c3; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $c_3,$c_2,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,12*$BNSZ($a0) # r[12]=c1; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $c_1,$c_3,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ $ST $c_2,13*$BNSZ($a0) # r[13]=c2; ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ $ST $c_3,14*$BNSZ($a0) # r[14]=c3; ++ $ST $c_1,15*$BNSZ($a0) # r[15]=c1; ++ ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $s5,10*$SZREG($sp) ++ $REG_L $s4,9*$SZREG($sp) ++ $REG_L $s3,8*$SZREG($sp) ++ $REG_L $s2,7*$SZREG($sp) ++ $REG_L $s1,6*$SZREG($sp) ++ $REG_L $s0,5*$SZREG($sp) ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ jr $ra ++ $PTR_ADD $sp,12*$SZREG ++___ ++$code.=<<___ if ($flavour !~ /nubi/i); ++ $REG_L $s5,5*$SZREG($sp) ++ $REG_L $s4,4*$SZREG($sp) ++ $REG_L $s3,3*$SZREG($sp) ++ $REG_L $s2,2*$SZREG($sp) 
++ $REG_L $s1,1*$SZREG($sp) ++ $REG_L $s0,0*$SZREG($sp) ++ jr $ra ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++.end bn_mul_comba8 ++ ++.align 5 ++.globl bn_mul_comba4 ++.ent bn_mul_comba4 ++bn_mul_comba4: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S $ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ .set reorder ++ $LD $a_0,0($a1) ++ $LD $b_0,0($a2) ++ $LD $a_1,$BNSZ($a1) ++ $LD $a_2,2*$BNSZ($a1) ++ $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); ++ $LD $a_3,3*$BNSZ($a1) ++ $LD $b_1,$BNSZ($a2) ++ $LD $b_2,2*$BNSZ($a2) ++ $LD $b_3,3*$BNSZ($a2) ++ mflo $c_1 ++ mfhi $c_2 ++ $ST $c_1,0($a0) ++ ++ $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); ++ $ADDU $c_3,$t_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $c_1,$c_3,$t_2 ++ $ST $c_2,$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $c_2,$c_1,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,2*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $c_3,$c_2,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,3*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $c_1,$c_3,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ $ST $c_2,4*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $c_2,$c_1,$t_2 ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu 
$at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,5*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ $ST $c_1,6*$BNSZ($a0) ++ $ST $c_2,7*$BNSZ($a0) ++ ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ nop ++.end bn_mul_comba4 ++___ ++ ++($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); ++ ++$code.=<<___; ++ ++.align 5 ++.globl bn_sqr_comba8 ++.ent bn_sqr_comba8 ++bn_sqr_comba8: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S $ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ .set reorder ++ $LD $a_0,0($a1) ++ $LD $a_1,$BNSZ($a1) ++ $LD $a_2,2*$BNSZ($a1) ++ $LD $a_3,3*$BNSZ($a1) ++ ++ $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); ++ $LD $a_4,4*$BNSZ($a1) ++ $LD $a_5,5*$BNSZ($a1) ++ $LD $a_6,6*$BNSZ($a1) ++ $LD $a_7,7*$BNSZ($a1) ++ mflo $c_1 ++ mfhi $c_2 ++ $ST $c_1,0($a0) ++ ++ $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_1,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $c_3,$t_2,$at ++ $ST $c_2,$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_2,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,2*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_3,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_3,$at ++ $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,3*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_1,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_1,$at ++ $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU 
$c_1,$at ++ $ST $c_2,4*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_2,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_2,$at ++ $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); ++ $ADDU $c_2,$at ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,5*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_3,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_3,$at ++ $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_3,$at ++ $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,6*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_1,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_1,$at ++ $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_1,$at ++ $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_1,$at ++ $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ $ST $c_2,7*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_2,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu 
$at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_2,$at ++ $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_2,$at ++ $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,8*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_3,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_3,$at ++ $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_3,$at ++ $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,9*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_1,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_1,$at ++ $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ $ST $c_2,10*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_2,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_2,$at ++ $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,11*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_3,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ 
$ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,12*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_1,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ $ST $c_2,13*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ $ST $c_3,14*$BNSZ($a0) ++ $ST $c_1,15*$BNSZ($a0) ++ ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ nop ++.end bn_sqr_comba8 ++ ++.align 5 ++.globl bn_sqr_comba4 ++.ent bn_sqr_comba4 ++bn_sqr_comba4: ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ .frame $sp,6*$SZREG,$ra ++ .mask 0x8000f008,-$SZREG ++ .set noreorder ++ $PTR_SUB $sp,6*$SZREG ++ $REG_S $ra,5*$SZREG($sp) ++ $REG_S $t3,4*$SZREG($sp) ++ $REG_S $t2,3*$SZREG($sp) ++ $REG_S $t1,2*$SZREG($sp) ++ $REG_S $t0,1*$SZREG($sp) ++ $REG_S $gp,0*$SZREG($sp) ++___ ++$code.=<<___; ++ .set reorder ++ $LD $a_0,0($a1) ++ $LD $a_1,$BNSZ($a1) ++ $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); ++ $LD $a_2,2*$BNSZ($a1) ++ $LD $a_3,3*$BNSZ($a1) ++ mflo $c_1 ++ mfhi $c_2 ++ $ST $c_1,0($a0) ++ ++ $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_1,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $c_3,$t_2,$at ++ $ST $c_2,$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_2,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,2*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_3,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ mflo $t_1 ++ mfhi $t_2 ++ slt $at,$t_2,$zero ++ $ADDU $c_3,$at ++ $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); ++ $SLL $t_2,1 ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ sltu $at,$c_2,$t_2 ++ $ADDU $c_3,$at ++ $ST $c_1,3*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_1,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_2,$t_1 ++ sltu $at,$c_2,$t_1 ++ $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); ++ $ADDU 
$t_2,$at ++ $ADDU $c_3,$t_2 ++ sltu $at,$c_3,$t_2 ++ $ADDU $c_1,$at ++ $ST $c_2,4*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ slt $c_2,$t_2,$zero ++ $SLL $t_2,1 ++ $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); ++ slt $a2,$t_1,$zero ++ $ADDU $t_2,$a2 ++ $SLL $t_1,1 ++ $ADDU $c_3,$t_1 ++ sltu $at,$c_3,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_1,$t_2 ++ sltu $at,$c_1,$t_2 ++ $ADDU $c_2,$at ++ $ST $c_3,5*$BNSZ($a0) ++ ++ mflo $t_1 ++ mfhi $t_2 ++ $ADDU $c_1,$t_1 ++ sltu $at,$c_1,$t_1 ++ $ADDU $t_2,$at ++ $ADDU $c_2,$t_2 ++ $ST $c_1,6*$BNSZ($a0) ++ $ST $c_2,7*$BNSZ($a0) ++ ++ .set noreorder ++___ ++$code.=<<___ if ($flavour =~ /nubi/i); ++ $REG_L $t3,4*$SZREG($sp) ++ $REG_L $t2,3*$SZREG($sp) ++ $REG_L $t1,2*$SZREG($sp) ++ $REG_L $t0,1*$SZREG($sp) ++ $REG_L $gp,0*$SZREG($sp) ++ $PTR_ADD $sp,6*$SZREG ++___ ++$code.=<<___; ++ jr $ra ++ nop ++.end bn_sqr_comba4 ++___ ++print $code; ++close STDOUT; +diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl +new file mode 100644 +index 0000000..f1a702f +--- /dev/null ++++ b/crypto/sha/asm/sha1-mips.pl +@@ -0,0 +1,354 @@ ++#!/usr/bin/env perl ++ ++# ==================================================================== ++# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL ++# project. The module is, however, dual licensed under OpenSSL and ++# CRYPTOGAMS licenses depending on where you obtain it. For further ++# details see http://www.openssl.org/~appro/cryptogams/. ++# ==================================================================== ++ ++# SHA1 block procedure for MIPS. ++ ++# Performance improvement is 30% on unaligned input. The "secret" is ++# to deploy lwl/lwr pair to load unaligned input. One could have ++# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- ++# compatible subroutine. There is room for minor optimization on ++# little-endian platforms... ++ ++###################################################################### ++# There is a number of MIPS ABI in use, O32 and N32/64 are most ++# widely used. Then there is a new contender: NUBI. It appears that if ++# one picks the latter, it's possible to arrange code in ABI neutral ++# manner. Therefore let's stick to NUBI register layout: ++# ++($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); ++($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); ++($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); ++($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); ++# ++# The return value is placed in $a0. 
++
++######################################################################
++# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
++# widely used. Then there is a new contender: NUBI. It appears that if
++# one picks the latter, it's possible to arrange code in ABI neutral
++# manner. Therefore let's stick to NUBI register layout:
++#
++($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
++($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
++($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
++#
++# The return value is placed in $a0. The following coding rules
++# facilitate interoperability:
++#
++# - never ever touch $tp, "thread pointer", former $gp;
++# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
++# old code];
++# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
++#
++# For reference here is register layout for N32/64 MIPS ABIs:
++#
++# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
++# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
++# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
++# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
++#
++$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
++
++if ($flavour =~ /64|n32/i) {
++ $PTR_ADD="dadd"; # incidentally works even on n32
++ $PTR_SUB="dsub"; # incidentally works even on n32
++ $REG_S="sd";
++ $REG_L="ld";
++ $PTR_SLL="dsll"; # incidentally works even on n32
++ $SZREG=8;
++} else {
++ $PTR_ADD="add";
++ $PTR_SUB="sub";
++ $REG_S="sw";
++ $REG_L="lw";
++ $PTR_SLL="sll";
++ $SZREG=4;
++}
++#
++# <appro@openssl.org>
++#
++######################################################################
++
++$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
++
++for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
++open STDOUT,">$output";
++
++if (!defined($big_endian))
++ { $big_endian=(unpack('L',pack('N',1))==1); }
++
++# offsets of the Most and Least Significant Bytes
++$MSB=$big_endian?0:3;
++$LSB=3&~$MSB;
++
++@X=map("\$$_",(8..23)); # a4-a7,s0-s11
++
++$ctx=$a0;
++$inp=$a1;
++$num=$a2;
++$A="\$1";
++$B="\$2";
++$C="\$3";
++$D="\$7";
++$E="\$24"; @V=($A,$B,$C,$D,$E);
++$t0="\$25";
++$t1=$num; # $num is offloaded to stack
++$t2="\$30"; # fp
++$K="\$31"; # ra
++
++sub BODY_00_14 {
++my ($i,$a,$b,$c,$d,$e)=@_;
++my $j=$i+1;
++$code.=<<___ if (!$big_endian);
++ srl $t0,@X[$i],24 # byte swap($i)
++ srl $t1,@X[$i],8
++ andi $t2,@X[$i],0xFF00
++ sll @X[$i],@X[$i],24
++ andi $t1,0xFF00
++ sll $t2,$t2,8
++ or @X[$i],$t0
++ or $t1,$t2
++ or @X[$i],$t1
++___
++$code.=<<___;
++ lwl @X[$j],$j*4+$MSB($inp)
++ sll $t0,$a,5 # $i
++ addu $e,$K
++ lwr @X[$j],$j*4+$LSB($inp)
++ srl $t1,$a,27
++ addu $e,$t0
++ xor $t0,$c,$d
++ addu $e,$t1
++ sll $t2,$b,30
++ and $t0,$b
++ srl $b,$b,2
++ xor $t0,$d
++ addu $e,@X[$i]
++ or $b,$t2
++ addu $e,$t0
++___
++}
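On little-endian builds, BODY_00_14 prefixes each round with the nine-instruction byte swap generated above (two shift-plus-mask pairs, one 24-bit shift each way, three or merges). Modelled on a 32-bit value (an editorial sketch, not part of the patch):

    sub swap32 {
        my $x  = shift;
        my $t0 = ($x >> 24) & 0xff;        # srl $t0,@X[$i],24
        my $t1 = ($x >> 8) & 0xff00;       # srl $t1,@X[$i],8; andi $t1,0xFF00
        my $t2 = ($x & 0xff00) << 8;       # andi $t2,...,0xFF00; sll $t2,8
        $x = ($x << 24) & 0xffffffff;      # sll @X[$i],@X[$i],24
        return $x | $t0 | $t1 | $t2;       # the three or instructions
    }
    printf "%08x\n", swap32(0x11223344);   # prints 44332211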
++
++sub BODY_15_19 {
++my ($i,$a,$b,$c,$d,$e)=@_;
++my $j=$i+1;
++
++$code.=<<___ if (!$big_endian && $i==15);
++ srl $t0,@X[$i],24 # byte swap($i)
++ srl $t1,@X[$i],8
++ andi $t2,@X[$i],0xFF00
++ sll @X[$i],@X[$i],24
++ andi $t1,0xFF00
++ sll $t2,$t2,8
++ or @X[$i],$t0
++ or @X[$i],$t1
++ or @X[$i],$t2
++___
++$code.=<<___;
++ xor @X[$j%16],@X[($j+2)%16]
++ sll $t0,$a,5 # $i
++ addu $e,$K
++ srl $t1,$a,27
++ addu $e,$t0
++ xor @X[$j%16],@X[($j+8)%16]
++ xor $t0,$c,$d
++ addu $e,$t1
++ xor @X[$j%16],@X[($j+13)%16]
++ sll $t2,$b,30
++ and $t0,$b
++ srl $t1,@X[$j%16],31
++ addu @X[$j%16],@X[$j%16]
++ srl $b,$b,2
++ xor $t0,$d
++ or @X[$j%16],$t1
++ addu $e,@X[$i%16]
++ or $b,$t2
++ addu $e,$t0
++___
++}
++
++sub BODY_20_39 {
++my ($i,$a,$b,$c,$d,$e)=@_;
++my $j=$i+1;
++$code.=<<___ if ($i<79);
++ xor @X[$j%16],@X[($j+2)%16]
++ sll $t0,$a,5 # $i
++ addu $e,$K
++ srl $t1,$a,27
++ addu $e,$t0
++ xor @X[$j%16],@X[($j+8)%16]
++ xor $t0,$c,$d
++ addu $e,$t1
++ xor @X[$j%16],@X[($j+13)%16]
++ sll $t2,$b,30
++ xor $t0,$b
++ srl $t1,@X[$j%16],31
++ addu @X[$j%16],@X[$j%16]
++ srl $b,$b,2
++ addu $e,@X[$i%16]
++ or @X[$j%16],$t1
++ or $b,$t2
++ addu $e,$t0
++___
++$code.=<<___ if ($i==79);
++ lw @X[0],0($ctx)
++ sll $t0,$a,5 # $i
++ addu $e,$K
++ lw @X[1],4($ctx)
++ srl $t1,$a,27
++ addu $e,$t0
++ lw @X[2],8($ctx)
++ xor $t0,$c,$d
++ addu $e,$t1
++ lw @X[3],12($ctx)
++ sll $t2,$b,30
++ xor $t0,$b
++ lw @X[4],16($ctx)
++ srl $b,$b,2
++ addu $e,@X[$i%16]
++ or $b,$t2
++ addu $e,$t0
++___
++}
++
++sub BODY_40_59 {
++my ($i,$a,$b,$c,$d,$e)=@_;
++my $j=$i+1;
++$code.=<<___ if ($i<79);
++ xor @X[$j%16],@X[($j+2)%16]
++ sll $t0,$a,5 # $i
++ addu $e,$K
++ srl $t1,$a,27
++ addu $e,$t0
++ xor @X[$j%16],@X[($j+8)%16]
++ and $t0,$c,$d
++ addu $e,$t1
++ xor @X[$j%16],@X[($j+13)%16]
++ sll $t2,$b,30
++ addu $e,$t0
++ srl $t1,@X[$j%16],31
++ xor $t0,$c,$d
++ addu @X[$j%16],@X[$j%16]
++ and $t0,$b
++ srl $b,$b,2
++ or @X[$j%16],$t1
++ addu $e,@X[$i%16]
++ or $b,$t2
++ addu $e,$t0
++___
++}
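The four generators differ mainly in the boolean function folded into $e each round. The instruction sequences above rely on the usual identities -- Ch as ((c^d)&b)^d, and Maj split into two bitwise-disjoint terms, which is why BODY_40_59 can accumulate them with two separate addu instructions. In Perl terms (an editorial sketch, not part of the patch):

    sub Ch     { my ($b,$c,$d)=@_; (($c ^ $d) & $b) ^ $d }        # rounds 0..19
    sub Parity { my ($b,$c,$d)=@_; $b ^ $c ^ $d }                 # rounds 20..39, 60..79
    sub Maj    { my ($b,$c,$d)=@_; (($c ^ $d) & $b) ^ ($c & $d) } # rounds 40..59

    # MIPS32 has no rotate instruction; sll/srl/or build it, as above.
    sub rol32  { my ($x,$n)=@_; (($x << $n) & 0xffffffff) | ($x >> (32-$n)) }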
++
++$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
++$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
++
++$code=<<___;
++#ifdef OPENSSL_FIPSCANISTER
++# include <openssl/fipssyms.h>
++#endif
++
++.text
++
++.set noat
++.set noreorder
++.align 5
++.globl sha1_block_data_order
++.ent sha1_block_data_order
++sha1_block_data_order:
++ .frame $sp,$FRAMESIZE*$SZREG,$ra
++ .mask $SAVED_REGS_MASK,-$SZREG
++ .set noreorder
++ $PTR_SUB $sp,$FRAMESIZE*$SZREG
++ $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
++ $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
++ $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
++ $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
++ $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
++ $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
++ $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
++ $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
++ $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
++ $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
++ $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
++ $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
++ $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
++ $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
++ $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
++___
++$code.=<<___;
++ $PTR_SLL $num,6
++ $PTR_ADD $num,$inp
++ $REG_S $num,0($sp)
++ lw $A,0($ctx)
++ lw $B,4($ctx)
++ lw $C,8($ctx)
++ lw $D,12($ctx)
++ b .Loop
++ lw $E,16($ctx)
++.align 4
++.Loop:
++ .set reorder
++ lwl @X[0],$MSB($inp)
++ lui $K,0x5a82
++ lwr @X[0],$LSB($inp)
++ ori $K,0x7999 # K_00_19
++___
++for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
++for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
++$code.=<<___;
++ lui $K,0x6ed9
++ ori $K,0xeba1 # K_20_39
++___
++for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
++$code.=<<___;
++ lui $K,0x8f1b
++ ori $K,0xbcdc # K_40_59
++___
++for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
++$code.=<<___;
++ lui $K,0xca62
++ ori $K,0xc1d6 # K_60_79
++___
++for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
++$code.=<<___;
++ $PTR_ADD $inp,64
++ $REG_L $num,0($sp)
++
++ addu $A,$X[0]
++ addu $B,$X[1]
++ sw $A,0($ctx)
++ addu $C,$X[2]
++ addu $D,$X[3]
++ sw $B,4($ctx)
++ addu $E,$X[4]
++ sw $C,8($ctx)
++ sw $D,12($ctx)
++ sw $E,16($ctx)
++ .set noreorder
++ bne $inp,$num,.Loop
++ nop
++
++ .set noreorder
++ $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
++ $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
++ $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
++ $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
++ $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
++ $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
++ $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
++ $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
++ $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
++ $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);
++ $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
++ $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
++ $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
++ $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
++ $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
++___
++$code.=<<___;
++ jr $ra
++ $PTR_ADD $sp,$FRAMESIZE*$SZREG
++.end sha1_block_data_order
++.rdata
++.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
++___
++print $code;
++close STDOUT;
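A detail worth noting in the generator loops above: unshift(@V,pop(@V)) rotates the Perl-level register aliases once per round, so no move instructions are ever emitted to shuffle a..e -- the register that just received the new e value is simply called a in the next round. Its effect (an editorial sketch, not part of the patch):

    my @V = qw(A B C D E);
    for my $i (0 .. 2) {
        printf "round %d: a=%s e=%s\n", $i, $V[0], $V[4];
        unshift(@V, pop(@V));   # (a,b,c,d,e) -> (e,a,b,c,d)
    }
    # round 0: a=A e=E
    # round 1: a=E e=D
    # round 2: a=D e=C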
+diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl
+new file mode 100644
+index 0000000..ba5b250
+--- /dev/null
++++ b/crypto/sha/asm/sha512-mips.pl
+@@ -0,0 +1,455 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++
++# SHA2 block procedures for MIPS.
++
++# October 2010.
++#
++# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
++# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
++# for now can only be compiled for MIPS64 ISA] improvement is modest
++# ~17%, but it comes for free, because it's the same instruction
++# sequence. Improvement coefficients are for aligned input.
++
++######################################################################
++# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
++# widely used. Then there is a new contender: NUBI. It appears that if
++# one picks the latter, it's possible to arrange code in ABI neutral
++# manner. Therefore let's stick to NUBI register layout:
++#
++($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
++($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
++($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
++#
++# The return value is placed in $a0. The following coding rules
++# facilitate interoperability:
++#
++# - never ever touch $tp, "thread pointer", former $gp [o32 can be
++# excluded from the rule, because it's specified volatile];
++# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
++# old code];
++# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
++#
++# For reference here is register layout for N32/64 MIPS ABIs:
++#
++# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
++# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
++# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
++# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
++# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
++#
++$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
++
++if ($flavour =~ /64|n32/i) {
++ $PTR_ADD="dadd"; # incidentally works even on n32
++ $PTR_SUB="dsub"; # incidentally works even on n32
++ $REG_S="sd";
++ $REG_L="ld";
++ $PTR_SLL="dsll"; # incidentally works even on n32
++ $SZREG=8;
++} else {
++ $PTR_ADD="add";
++ $PTR_SUB="sub";
++ $REG_S="sw";
++ $REG_L="lw";
++ $PTR_SLL="sll";
++ $SZREG=4;
++}
++$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
++#
++# <appro@openssl.org>
++#
++######################################################################
++
++$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
++
++for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
++open STDOUT,">$output";
++
++if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
++
++if ($output =~ /512/) {
++ $label="512";
++ $SZ=8;
++ $LD="ld"; # load from memory
++ $ST="sd"; # store to memory
++ $SLL="dsll"; # shift left logical
++ $SRL="dsrl"; # shift right logical
++ $ADDU="daddu";
++ @Sigma0=(28,34,39);
++ @Sigma1=(14,18,41);
++ @sigma0=( 7, 1, 8); # right shift first
++ @sigma1=( 6,19,61); # right shift first
++ $lastK=0x817;
++ $rounds=80;
++} else {
++ $label="256";
++ $SZ=4;
++ $LD="lw"; # load from memory
++ $ST="sw"; # store to memory
++ $SLL="sll"; # shift left logical
++ $SRL="srl"; # shift right logical
++ $ADDU="addu";
++ @Sigma0=( 2,13,22);
++ @Sigma1=( 6,11,25);
++ @sigma0=( 3, 7,18); # right shift first
++ @sigma1=(10,17,19); # right shift first
++ $lastK=0x8f2;
++ $rounds=64;
++}
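Since the module restricts itself to plain MIPS32/MIPS64 instructions, every rotate in the SHA-2 Sigma functions is synthesized from an SRL/SLL pair joined by xor; the table above only swaps in the per-variant shift counts and word size. For the 32-bit variant this amounts to (an editorial sketch, not part of the patch):

    my @Sigma0 = (2, 13, 22);          # SHA-256 values from the table above
    sub rotr32 { my ($x,$n)=@_; ($x >> $n) | (($x << (32-$n)) & 0xffffffff) }
    sub Sigma0 {
        my $x = shift;
        return rotr32($x,$Sigma0[0]) ^ rotr32($x,$Sigma0[1]) ^ rotr32($x,$Sigma0[2]);
    }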
++
++$MSB = $big_endian ? 0 : ($SZ-1);
++$LSB = ($SZ-1)&~$MSB;
++
++@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
++@X=map("\$$_",(8..23));
++
++$ctx=$a0;
++$inp=$a1;
++$len=$a2; $Ktbl=$len;
++
++sub BODY_00_15 {
++my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
++my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
++
++$code.=<<___ if ($i<15);
++ ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
++ ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
++___
++$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
++ srl $tmp0,@X[0],24 # byte swap($i)
++ srl $tmp1,@X[0],8
++ andi $tmp2,@X[0],0xFF00
++ sll @X[0],@X[0],24
++ andi $tmp1,0xFF00
++ sll $tmp2,$tmp2,8
++ or @X[0],$tmp0
++ or $tmp1,$tmp2
++ or @X[0],$tmp1
++___
++$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
++ ori $tmp0,$zero,0xFF
++ dsll $tmp2,$tmp0,32
++ or $tmp0,$tmp2 # 0x000000FF000000FF
++ and $tmp1,@X[0],$tmp0 # byte swap($i)
++ dsrl $tmp2,@X[0],24
++ dsll $tmp1,24
++ and $tmp2,$tmp0
++ dsll $tmp0,8 # 0x0000FF000000FF00
++ or $tmp1,$tmp2
++ and $tmp2,@X[0],$tmp0
++ dsrl @X[0],8
++ dsll $tmp2,8
++ and @X[0],$tmp0
++ or $tmp1,$tmp2
++ or @X[0],$tmp1
++ dsrl $tmp1,@X[0],32
++ dsll @X[0],32
++ or @X[0],$tmp1
++___
++$code.=<<___;
++ $ADDU $T1,$X[0],$h # $i
++ $SRL $h,$e,@Sigma1[0]
++ xor $tmp2,$f,$g
++ $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
++ and $tmp2,$e
++ $SRL $tmp0,$e,@Sigma1[1]
++ xor $h,$tmp1
++ $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
++ xor $h,$tmp0
++ $SRL $tmp0,$e,@Sigma1[2]
++ xor $h,$tmp1
++ $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
++ xor $h,$tmp0
++ xor $tmp2,$g # Ch(e,f,g)
++ xor $tmp0,$tmp1,$h # Sigma1(e)
++
++ $SRL $h,$a,@Sigma0[0]
++ $ADDU $T1,$tmp2
++ $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
++ $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
++ $ADDU $T1,$tmp0
++ $SRL $tmp0,$a,@Sigma0[1]
++ xor $h,$tmp1
++ $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
++ xor $h,$tmp0
++ $SRL $tmp0,$a,@Sigma0[2]
++ xor $h,$tmp1
++ $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
++ xor $h,$tmp0
++ $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
++ xor $h,$tmp1 # Sigma0(a)
++
++ or $tmp0,$a,$b
++ and $tmp1,$a,$b
++ and $tmp0,$c
++ or $tmp1,$tmp0 # Maj(a,b,c)
++ $ADDU $T1,$tmp2 # +=K[$i]
++ $ADDU $h,$tmp1
++
++ $ADDU $d,$T1
++ $ADDU $h,$T1
++___
++$code.=<<___ if ($i>=13);
++ $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
++___
++}
++
++sub BODY_16_XX {
++my $i=@_[0];
++my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
++
++$code.=<<___;
++ $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
++ $ADDU @X[0],@X[9] # +=X[i+9]
++ $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
++ $SRL $tmp0,@X[1],@sigma0[1]
++ xor $tmp2,$tmp1
++ $SLL $tmp1,`@sigma0[2]-@sigma0[1]`
++ xor $tmp2,$tmp0
++ $SRL $tmp0,@X[1],@sigma0[2]
++ xor $tmp2,$tmp1
++
++ $SRL $tmp3,@X[14],@sigma1[0]
++ xor $tmp2,$tmp0 # sigma0(X[i+1])
++ $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
++ $ADDU @X[0],$tmp2
++ $SRL $tmp0,@X[14],@sigma1[1]
++ xor $tmp3,$tmp1
++ $SLL $tmp1,`@sigma1[2]-@sigma1[1]`
++ xor $tmp3,$tmp0
++ $SRL $tmp0,@X[14],@sigma1[2]
++ xor $tmp3,$tmp1
++
++ xor $tmp3,$tmp0 # sigma1(X[i+14])
++ $ADDU @X[0],$tmp3
++___
++ &BODY_00_15(@_);
++}
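BODY_16_XX above is the SHA-2 message expansion over a sliding 16-entry window; @X is rotated with push/shift just as @V is, so @X[0] always denotes W[i-16] and @X[1] W[i-15] at round $i. The recurrence it computes, written out for the 32-bit variant (an editorial sketch, not part of the patch; xupdate is a hypothetical helper name):

    sub rotr32 { my ($x,$n)=@_; ($x >> $n) | (($x << (32-$n)) & 0xffffffff) }
    sub sigma0 { my $x=shift; rotr32($x,7)  ^ rotr32($x,18) ^ ($x >> 3)  }  # @sigma0=(3,7,18)
    sub sigma1 { my $x=shift; rotr32($x,17) ^ rotr32($x,19) ^ ($x >> 10) }  # @sigma1=(10,17,19)

    # W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] (mod 2^32)
    sub xupdate {
        my ($W, $i) = @_;
        return (sigma1($W->[$i-2]) + $W->[$i-7]
              + sigma0($W->[$i-15]) + $W->[$i-16]) & 0xffffffff;
    }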
++
++$FRAMESIZE=16*$SZ+16*$SZREG;
++$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
++
++$code.=<<___;
++#ifdef OPENSSL_FIPSCANISTER
++# include <openssl/fipssyms.h>
++#endif
++
++.text
++.set noat
++#if !defined(__vxworks) || defined(__pic__)
++.option pic2
++#endif
++
++.align 5
++.globl sha${label}_block_data_order
++.ent sha${label}_block_data_order
++sha${label}_block_data_order:
++ .frame $sp,$FRAMESIZE,$ra
++ .mask $SAVED_REGS_MASK,-$SZREG
++ .set noreorder
++___
++$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
++ .cpload $pf
++___
++$code.=<<___;
++ $PTR_SUB $sp,$FRAMESIZE
++ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
++ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
++ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
++ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
++ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
++ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
++ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
++ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
++ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
++ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
++ $REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
++ $REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
++ $REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
++ $REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
++ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
++___
++$code.=<<___;
++ $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
++___
++$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
++ .cplocal $Ktbl
++ .cpsetup $pf,$zero,sha${label}_block_data_order
++___
++$code.=<<___;
++ .set reorder
++ la $Ktbl,K${label} # PIC-ified 'load address'
++
++ $LD $A,0*$SZ($ctx) # load context
++ $LD $B,1*$SZ($ctx)
++ $LD $C,2*$SZ($ctx)
++ $LD $D,3*$SZ($ctx)
++ $LD $E,4*$SZ($ctx)
++ $LD $F,5*$SZ($ctx)
++ $LD $G,6*$SZ($ctx)
++ $LD $H,7*$SZ($ctx)
++
++ $PTR_ADD @X[15],$inp # pointer to the end of input
++ $REG_S @X[15],16*$SZ($sp)
++ b .Loop
++
++.align 5
++.Loop:
++ ${LD}l @X[0],$MSB($inp)
++ ${LD}r @X[0],$LSB($inp)
++___
++for ($i=0;$i<16;$i++)
++{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
++$code.=<<___;
++ b .L16_xx
++.align 4
++.L16_xx:
++___
++for (;$i<32;$i++)
++{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
++$code.=<<___;
++ and @X[6],0xfff
++ li @X[7],$lastK
++ .set noreorder
++ bne @X[6],@X[7],.L16_xx
++ $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
++
++ $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
++ $LD @X[0],0*$SZ($ctx)
++ $LD @X[1],1*$SZ($ctx)
++ $LD @X[2],2*$SZ($ctx)
++ $PTR_ADD $inp,16*$SZ
++ $LD @X[3],3*$SZ($ctx)
++ $ADDU $A,@X[0]
++ $LD @X[4],4*$SZ($ctx)
++ $ADDU $B,@X[1]
++ $LD @X[5],5*$SZ($ctx)
++ $ADDU $C,@X[2]
++ $LD @X[6],6*$SZ($ctx)
++ $ADDU $D,@X[3]
++ $LD @X[7],7*$SZ($ctx)
++ $ADDU $E,@X[4]
++ $ST $A,0*$SZ($ctx)
++ $ADDU $F,@X[5]
++ $ST $B,1*$SZ($ctx)
++ $ADDU $G,@X[6]
++ $ST $C,2*$SZ($ctx)
++ $ADDU $H,@X[7]
++ $ST $D,3*$SZ($ctx)
++ $ST $E,4*$SZ($ctx)
++ $ST $F,5*$SZ($ctx)
++ $ST $G,6*$SZ($ctx)
++ $ST $H,7*$SZ($ctx)
++
++ bnel $inp,@X[15],.Loop
++ $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
++
++ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
++ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
++ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
++ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
++ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
++ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
++ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
++ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
++ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
++ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
++___
++$code.=<<___ if ($flavour =~ /nubi/i);
++ $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
++ $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
++ $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
++ $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
++ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
++___
++$code.=<<___;
++ jr $ra
++ $PTR_ADD $sp,$FRAMESIZE
++.end sha${label}_block_data_order
++
++.rdata
++.align 5
++K${label}:
++___
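Note that the block loop above carries no round counter: $Ktbl advances sixteen entries per .L16_xx pass, and the and/li/bne test appears to exit once the K constant just consumed matches $lastK in its low twelve bits -- 0xc67178f2 ends in 0x8f2 (SHA-256) and 0x6c44198c4a475817 ends in 0x817 (SHA-512), which is where the two $lastK values come from. A quick check (an editorial sketch, not part of the patch; the second line assumes 64-bit Perl integers):

    printf "%d\n", ((0xc67178f2 & 0xfff) == 0x8f2);           # SHA-256: 1
    printf "%d\n", ((0x6c44198c4a475817 & 0xfff) == 0x817);   # SHA-512: 1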
++if ($SZ==4) {
++$code.=<<___;
++ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
++ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
++ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
++ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
++ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
++ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
++ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
++ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
++ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
++ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
++ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
++ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
++ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
++ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
++ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
++ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
++___
++} else {
++$code.=<<___;
++ .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
++ .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
++ .dword 0x3956c25bf348b538, 0x59f111f1b605d019
++ .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
++ .dword 0xd807aa98a3030242, 0x12835b0145706fbe
++ .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
++ .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
++ .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
++ .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
++ .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
++ .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
++ .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
++ .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
++ .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
++ .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
++ .dword 0x06ca6351e003826f, 0x142929670a0e6e70
++ .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
++ .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
++ .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
++ .dword 0x81c2c92e47edaee6, 0x92722c851482353b
++ .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
++ .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
++ .dword 0xd192e819d6ef5218, 0xd69906245565a910
++ .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
++ .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
++ .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
++ .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
++ .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
++ .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
++ .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
++ .dword 0x90befffa23631e28, 0xa4506cebde82bde9
++ .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
++ .dword 0xca273eceea26619c, 0xd186b8c721c0c207
++ .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
++ .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
++ .dword 0x113f9804bef90dae, 0x1b710b35131c471b
++ .dword 0x28db77f523047d84, 0x32caab7b40c72493
++ .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
++ .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
++ .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
++___
++}
++$code.=<<___;
++.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
++.align 5
++
++___
++
++$code =~ s/\`([^\`]*)\`/eval $1/gem;
++print $code;
++close STDOUT;
diff --git a/patches/ssl_Android.mk b/patches/ssl_Android.mk
index 487aabb..40641a3 100644
--- a/patches/ssl_Android.mk
+++ b/patches/ssl_Android.mk
@@ -49,7 +49,7 @@ local_src_files:= \
include $(CLEAR_VARS)
include $(LOCAL_PATH)/../android-config.mk
-ifneq ($(TARGET_ARCH),x86)
+ifeq ($(TARGET_ARCH),arm)
LOCAL_NDK_VERSION := 5
LOCAL_SDK_VERSION := 9
endif
@@ -64,7 +64,7 @@ include $(BUILD_STATIC_LIBRARY)
include $(CLEAR_VARS)
include $(LOCAL_PATH)/../android-config.mk
-ifneq ($(TARGET_ARCH),x86)
+ifeq ($(TARGET_ARCH),arm)
LOCAL_NDK_VERSION := 5
LOCAL_SDK_VERSION := 9
endif