#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path, provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
# Directory part of the script's own path (including the trailing
# separator), used below to locate opensslconf.h.  Empty when the script
# was invoked without a path component, instead of reading a stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> [<output file>].
$flavour = shift;
$output = shift;

# Send the generated assembly to the requested file.  Three-argument open
# keeps the file name from being parsed as part of the mode, and a failed
# open is fatal rather than silently discarding everything printed later.
# Without an output argument the inherited STDOUT is left untouched.
if (defined($output) && length($output)) {
	open(STDOUT, ">", $output) or die "can't open $output: $!";
}

# ABI-dependent parameters: assembler level, pointer size, frame marker
# size, offset of the saved return pointer and the mnemonics used to
# save/restore registers.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	# Best effort: if opensslconf.h defines SIXTY_FOUR_BIT, switch to
	# 8-byte BN words and assembler level 2.0.  A missing or
	# unreadable header is deliberately not an error.
	if (open(my $conf, "<", "${dir}../../opensslconf.h")) {
	    while (<$conf>) {
		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
		    $BN_SZ=8;
		    $LEVEL="2.0";
		    last;
		}
	    }
	    close($conf);
	}
}

$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
				#                [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32;			# local variables

# General register assignment.
$tp="%r31";
$ti1="%r29";
$ti0="%r28";

$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22";	# passed through stack in 32-bit
$num="%r21";	# passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";

$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";

$fp="%r3";
$hi1="%r2";
$hi0="%r1";

$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s

# Floating-point register assignment ($fti shares a register with $fm0).
$fm0="%fr4";	$fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
$fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";

# Function header and standard prologue: save %r2 and %r3-%r10, allocate
# the frame and point $fp at its base.  Back-quoted expressions are left
# in the text as-is; they are presumably evaluated by a later pass over
# $code that is outside this part of the file.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
bn_mul_mont
	.PROC
	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	ldo	-$FRAME(%sp),$fp
___
# With 4-byte pointers n0 and num are fetched from the caller's frame.
$code.=<<___ if ($SIZE_T==4);
	ldw	`-$FRAME_MARKER-4`($fp),$n0
	ldw	`-$FRAME_MARKER-8`($fp),$num
	nop
	nop					; alignment
___
165392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($BN_SZ==4); 166392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom comiclr,<= 6,$num,%r0 ; are vectors long enough? 167392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom b L\$abort 168392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldi 0,%r28 ; signal "unhandled" 169392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add,ev %r0,$num,$num ; is $num even? 170392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom b L\$abort 171392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop 172392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $ap,$np,$ti1 173392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? 174392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom b L\$abort 175392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop 176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop ; alignment 177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop 178392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws 0($n0),${fn0} 180392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws,ma 4($bp),${fbi} ; bp[0] 181392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($BN_SZ==8); 183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom comib,> 3,$num,L\$abort ; are vectors long enough? 
184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldi 0,%r28 ; signal "unhandled" 185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $num,$num,$num ; I operate on 32-bit values 186392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws 4($n0),${fn0} ; only low part of n0 188392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws 4($bp),${fbi} ; bp[0] in flipped word order 189392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 191392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldds 0($ap),${fai} ; ap[0,1] 192392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldds 0($np),${fni} ; np[0,1] 193392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 194392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sh2addl $num,%r0,$arrsz 195392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldi 31,$hi0 196392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 36($arrsz),$hi1 ; space for tp[num+1] 197392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom andcm $hi1,$hi0,$hi1 ; align 198392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,%sp,%sp 199392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom $PUSH $fp,-$SIZE_T(%sp) 200392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+16`($fp),$xfer 202392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+32+4`($fp),$tp 203392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] 205392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] 206392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fn0},${fab0}R,${fm0} 207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $arrsz,$ap,$ap ; point at the end 209392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $arrsz,$np,$np 210392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subi 0,$arrsz,$idx ; j=0 211392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 8($idx),$idx ; j++++ 212392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 213392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 214392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 215392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) 216392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 217392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 218392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 219392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[2,3] 220392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[2,3] 221392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 222392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($BN_SZ==4); 223392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mtctl $hi0,%cr11 ; $hi0 still holds 31 224392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 225392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom b L\$parisc11 226392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop 227392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 228392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; # PA-RISC 2.0 code-path 229392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 230392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 
231392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -16($xfer),$ab0 232392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) 233392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 234392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,31,32,$hi0 235392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,63,32,$ab0 236392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -8($xfer),$nm0 237392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 238392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 8($idx),$idx ; j++++ 239392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab0,$nm0,$nm0 ; low part is discarded 240392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm0,31,32,$hi1 241392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 242392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$1st 243392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 244392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 245392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 0($xfer),$ab1 246392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 247392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ab1,$ab1 248392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,31,32,$hi0 249392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 8($xfer),$nm1 250392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 251392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,63,32,$ab1 252392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm1,$nm1 253392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[j,j+1] 254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[j,j+1] 
255392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab1,$nm1,$nm1 256392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm1,31,32,$hi1 257392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 258392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 259392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 260392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -16($xfer),$ab0 261392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) 262392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ab0,$ab0 263392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,31,32,$hi0 264392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -8($xfer),$nm0 265392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 266392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,63,32,$ab0 267392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm0,$nm0 268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nm1,-4($tp) ; tp[j-1] 269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab0,$nm0,$nm0 270392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw,ma $nm0,8($tp) ; tp[j-1] 271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,<> 8,$idx,L\$1st ; j++++ 272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm0,31,32,$hi1 273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 0($xfer),$ab1 277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ab1,$ab1 
279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,31,32,$hi0 280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 8($xfer),$nm1 281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,63,32,$ab1 283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm1,$nm1 284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -16($xfer),$ab0 285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab1,$nm1,$nm1 286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -8($xfer),$nm0 287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm1,31,32,$hi1 288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ab0,$ab0 290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,31,32,$hi0 291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nm1,-4($tp) ; tp[j-1] 292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,63,32,$ab0 293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm0,$nm0 294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 0($xfer),$ab1 295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab0,$nm0,$nm0 296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd,mb 8($xfer),$nm1 297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm0,31,32,$hi1 298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw,ma $nm0,8($tp) ; tp[j-1] 299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 300392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo -1($num),$num ; i-- 301392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subi 0,$arrsz,$idx ; j=0 302392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($BN_SZ==4); 
304392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws,ma 4($bp),${fbi} ; bp[1] 305392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 306392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($BN_SZ==8); 307392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws 0($bp),${fbi} ; bp[1] in flipped word order 308392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 309392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 310392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[0,1] 311392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[0,1] 312392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws 8($xfer),${fti}R ; tp[0] 313392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ab1,$ab1 314392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,31,32,$hi0 315392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,63,32,$ab1 316392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 8($idx),$idx ; j++++ 317392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 318392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm1,$nm1 320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab1,$nm1,$nm1 321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm1,31,32,$hi1 322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstws,mb ${fab0}L,-8($xfer) ; save high part 323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nm1,-4($tp) ; tp[j-1] 324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcpy,sgl %fr0,${fti}L ; zero high part 326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcpy,sgl %fr0,${fab0}L 
327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$hi0,$hi0 328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $hi0,31,32,$hi1 329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvxf,dbl,dbl ${fab0},${fab0} 331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi0,0($tp) 332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi1,4($tp) 333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fn0},${fab0}R,${fm0} 337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+32+4`($fp),$tp 338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$outer 339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) ; 33-bit value 342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[2] 344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[2] 345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 8($idx),$idx ; j++++ 346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -16($xfer),$ab0 ; 33-bit value 347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -8($xfer),$nm0 348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($xfer),$hi0 ; high part 
349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,31,32,$ti0 ; carry bit 353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,63,32,$ab0 354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti0,$hi0,$hi0 ; account carry bit 356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab0,$nm0,$nm0 ; low part is discarded 358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($tp),$ti1 ; tp[1] 359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm0,31,32,$hi1 360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) 361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$inner 364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 0($xfer),$ab1 367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ti1,$ti1 369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti1,$ab1,$ab1 370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 8($xfer),$nm1 371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom extrd,u $ab1,31,32,$hi0 373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,63,32,$ab1 374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[j,j+1] 375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[j,j+1] 376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm1,$nm1 377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab1,$nm1,$nm1 378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($tp),$ti0 ; tp[j] 379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nm1,-4($tp) ; tp[j-1] 380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -16($xfer),$ab0 384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) 385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ti0,$ti0 386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti0,$ab0,$ab0 387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -8($xfer),$nm0 388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,31,32,$hi0 390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm1,31,32,$hi1 391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 8($tp),$ti1 ; tp[j] 392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,63,32,$ab0 393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm0,$nm0 394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab0,$nm0,$nm0 395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw,ma $nm0,8($tp) ; tp[j-1] 
396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,<> 8,$idx,L\$inner ; j++++ 397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm0,31,32,$hi1 398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 0($xfer),$ab1 402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ti1,$ti1 404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti1,$ab1,$ab1 405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 8($xfer),$nm1 406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,31,32,$hi0 408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,63,32,$ab1 409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($tp),$ti0 ; tp[j] 410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm1,$nm1 411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab1,$nm1,$nm1 412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -16($xfer),$ab0 413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd -8($xfer),$nm0 414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm1,31,32,$hi1 415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ab0,$ab0 417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti0,$ab0,$ab0 418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nm1,-4($tp) ; tp[j-1] 419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,31,32,$hi0 420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom ldw 8($tp),$ti1 ; tp[j] 421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab0,63,32,$ab0 422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm0,$nm0 423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd 0($xfer),$ab1 424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab0,$nm0,$nm0 425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd,mb 8($xfer),$nm1 426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm0,31,32,$hi1 427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw,ma $nm0,8($tp) ; tp[j-1] 428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,= -1,$num,L\$outerdone ; i-- 430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subi 0,$arrsz,$idx ; j=0 431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($BN_SZ==4); 433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws,ma 4($bp),${fbi} ; bp[i] 434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($BN_SZ==8); 436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldi 12,$ti0 ; bp[i] in flipped word order 437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl,ev %r0,$num,$num 438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldi -4,$ti0 439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti0,$bp,$bp 440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws 0($bp),${fbi} 441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[0] 444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ab1,$ab1 445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
flddx $idx($np),${fni} ; np[0] 446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws 8($xfer),${fti}R ; tp[0] 447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti1,$ab1,$ab1 448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,31,32,$hi0 449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,63,32,$ab1 450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 8($idx),$idx ; j++++ 452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($tp),$ti0 ; tp[j] 455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm1,$nm1 457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstws,mb ${fab0}L,-8($xfer) ; save high part 458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab1,$nm1,$nm1 459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm1,31,32,$hi1 460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcpy,sgl %fr0,${fti}L ; zero high part 461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcpy,sgl %fr0,${fab0}L 462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nm1,-4($tp) ; tp[j-1] 463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvxf,dbl,dbl ${fab0},${fab0} 466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$hi0,$hi0 467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti0,$hi0,$hi0 
469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $hi0,31,32,$hi1 470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi0,0($tp) 472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi1,4($tp) 473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fn0},${fab0}R,${fm0} 474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom b L\$outer 476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+32+4`($fp),$tp 477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$outerdone 479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi0,$ab1,$ab1 480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti1,$ab1,$ab1 481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,31,32,$hi0 482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ab1,63,32,$ab1 483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($tp),$ti0 ; tp[j] 485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$nm1,$nm1 487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ab1,$nm1,$nm1 488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $nm1,31,32,$hi1 489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nm1,-4($tp) ; tp[j-1] 490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $hi1,$hi0,$hi0 492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $ti0,$hi0,$hi0 493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $hi0,31,32,$hi1 494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
stw $hi0,0($tp) 495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi1,4($tp) 496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+32`($fp),$tp 498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub %r0,%r0,%r0 ; clear borrow 499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($BN_SZ==4); 501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldws,ma 4($tp),$ti0 502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? 503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom b L\$sub_pa11 504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $tp,$arrsz,$tp 505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$sub 506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldwx $idx($np),$hi0 507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subb $ti0,$hi0,$hi1 508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldwx $idx($tp),$ti0 509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,<> 4,$idx,L\$sub 510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stws,ma $hi1,4($rp) 511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subb $ti0,%r0,$hi1 513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo -4($tp),$tp 514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($BN_SZ==8); 516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd,ma 8($tp),$ti0 517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$sub 518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd $idx($np),$hi0 519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shrpd $ti0,$ti0,32,$ti0 ; flip word order 
520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom std $ti0,-8($tp) ; save flipped value 521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub,db $ti0,$hi0,$hi1 522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd,ma 8($tp),$ti0 523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,<> 8,$idx,L\$sub 524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom std,ma $hi1,8($rp) 525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom extrd,u $ti0,31,32,$ti0 ; carry in flipped word order 527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub,db $ti0,%r0,$hi1 528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo -8($tp),$tp 529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and $tp,$hi1,$ap 532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom andcm $rp,$hi1,$bp 533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $ap,$bp,$np 534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub $rp,$arrsz,$rp ; rewind rp 536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subi 0,$arrsz,$idx 537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+32`($fp),$tp 538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$copy 539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldd $idx($np),$hi0 540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom std,ma %r0,8($tp) 541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,<> 8,$idx,.-8 ; L\$copy 542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom std,ma $hi0,8($rp) 543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromif ($BN_SZ==4) 
{ # PA-RISC 1.1 code-path 546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$ablo=$ab0; 547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$abhi=$ab1; 548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$nmlo0=$nm0; 549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$nmhi0=$nm1; 550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$nmlo1="%r9"; 551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$nmhi1="%r8"; 552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom b L\$done 555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop 556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .ALIGN 8 558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$parisc11 559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -12($xfer),$ablo 562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -16($xfer),$hi0 563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -4($xfer),$nmlo0 564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -8($xfer),$nmhi0 565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) 566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 8($idx),$idx ; j++++ 569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo0,$nmlo0 ; discarded 570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$hi1 571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($xfer),$ablo 
572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($xfer),$abhi 573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom nop 574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$1st_pa11 576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[j,j+1] 578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[j,j+1] 580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 12($xfer),$nmlo1 582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 8($xfer),$nmhi1 584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo1,$nmlo1 585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$nmhi1 587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo1,$nmlo1 589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -12($xfer),$ablo 590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$hi1 591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -16($xfer),$abhi 592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -4($xfer),$nmlo0 595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 
596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -8($xfer),$nmhi0 597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nmlo1,-4($tp) ; tp[j-1] 599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) 601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo0,$nmlo0 602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$nmhi0 604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($xfer),$abhi 605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo0,$nmlo0 606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($xfer),$ablo 607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stws,ma $nmlo0,8($tp) ; tp[j-1] 608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,<> 8,$idx,L\$1st_pa11 ; j++++ 609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$hi1 610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 8($xfer),$nmhi1 612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 12($xfer),$nmlo1 613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo1,$nmlo1 
620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -16($xfer),$abhi 621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$nmhi1 622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -12($xfer),$ablo 623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo1,$nmlo1 624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -8($xfer),$nmhi0 625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$hi1 626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -4($xfer),$nmlo0 627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nmlo1,-4($tp) ; tp[j-1] 630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($xfer),$abhi 632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo0,$nmlo0 633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($xfer),$ablo 634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$nmhi0 635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldws,mb 8($xfer),$nmhi1 636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo0,$nmlo0 637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($xfer),$nmlo1 638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$hi1 639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stws,ma $nmlo0,8($tp) ; tp[j-1] 640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo -1($num),$num ; i-- 642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subi 0,$arrsz,$idx ; j=0 643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws,ma 4($bp),${fbi} ; bp[1] 
645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[0,1] 646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[0,1] 647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws 8($xfer),${fti}R ; tp[0] 648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 8($idx),$idx ; j++++ 651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo1,$nmlo1 654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$nmhi1 655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo1,$nmlo1 656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$hi1 657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstws,mb ${fab0}L,-8($xfer) ; save high part 658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nmlo1,-4($tp) ; tp[j-1] 659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcpy,sgl %fr0,${fti}L ; zero high part 661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcpy,sgl %fr0,${fab0}L 662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$hi0,$hi0 663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,%r0,$hi1 664392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvxf,dbl,dbl ${fab0},${fab0} 666392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi0,0($tp) 667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi1,4($tp) 
668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fn0},${fab0}R,${fm0} 672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+32+4`($fp),$tp 673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$outer_pa11 674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 676392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) ; 33-bit value 677392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 678392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[2,3] 679392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[2,3] 680392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -16($xfer),$abhi ; carry bit actually 681392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 8($idx),$idx ; j++++ 682392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -12($xfer),$ablo 683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -8($xfer),$nmhi0 684392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -4($xfer),$nmlo0 685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($xfer),$hi0 ; high part 686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom addl $abhi,$hi0,$hi0 ; account carry bit 691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo0,$nmlo0 ; discarded 693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($tp),$ti1 ; tp[1] 694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$hi1 695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) 696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($xfer),$ablo 698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($xfer),$abhi 699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$inner_pa11 701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[j,j+1] 703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[j,j+1] 705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($tp),$ti0 ; tp[j] 707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$abhi 708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 12($xfer),$nmlo1 709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ti1,$ablo,$ablo 710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 8($xfer),$nmhi1 711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo1,$nmlo1 
714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$nmhi1 716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -12($xfer),$ablo 717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo1,$nmlo1 718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -16($xfer),$abhi 719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$hi1 720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 8($tp),$ti1 ; tp[j] 723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -4($xfer),$nmlo0 725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -8($xfer),$nmhi0 727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$abhi 728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nmlo1,-4($tp) ; tp[j-1] 729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ti0,$ablo,$ablo 730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab0},-16($xfer) 731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm0},-8($xfer) 733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo0,$nmlo0 734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($xfer),$ablo 735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$nmhi0 736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($xfer),$abhi 737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo0,$nmlo0 
738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stws,ma $nmlo0,8($tp) ; tp[j-1] 739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,<> 8,$idx,L\$inner_pa11 ; j++++ 740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$hi1 741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 12($xfer),$nmlo1 744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 8($xfer),$nmhi1 746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($tp),$ti0 ; tp[j] 748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$abhi 749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fab1},0($xfer) 750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ti1,$ablo,$ablo 751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstds ${fnm1},8($xfer) 752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -16($xfer),$abhi 754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo1,$nmlo1 755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -12($xfer),$ablo 756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$nmhi1 757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -8($xfer),$nmhi0 758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo1,$nmlo1 759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -4($xfer),$nmlo0 760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$hi1 761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nmlo1,-4($tp) ; tp[j-1] 764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$abhi 765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ti0,$ablo,$ablo 766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 8($tp),$ti1 ; tp[j] 767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 0($xfer),$abhi 769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo0,$nmlo0 770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($xfer),$ablo 771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$nmhi0 772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldws,mb 8($xfer),$nmhi1 773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo0,$nmlo0 774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($xfer),$nmlo1 775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi0,$hi1 776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stws,ma $nmlo0,8($tp) ; tp[j-1] 777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,= -1,$num,L\$outerdone_pa11; i-- 779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subi 0,$arrsz,$idx ; j=0 780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws,ma 4($bp),${fbi} ; bp[i] 782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($ap),${fai} ; ap[0] 783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$abhi 785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom flddx $idx($np),${fni} ; np[0] 
786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fldws 8($xfer),${fti}R ; tp[0] 787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ti1,$ablo,$ablo 788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo 8($idx),$idx ; j++++ 791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($tp),$ti0 ; tp[j] 794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo1,$nmlo1 796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$nmhi1 797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fstws,mb ${fab0}L,-8($xfer) ; save high part 798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo1,$nmlo1 799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$hi1 800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcpy,sgl %fr0,${fti}L ; zero high part 801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcpy,sgl %fr0,${fab0}L 802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nmlo1,-4($tp) ; tp[j-1] 803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvxf,dbl,dbl ${fab0},${fab0} 806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$hi0,$hi0 807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,%r0,$hi1 808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 
809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ti0,$hi0,$hi0 810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$hi1,$hi1 811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi0,0($tp) 813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi1,4($tp) 814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xmpyu ${fn0},${fab0}R,${fm0} 815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom b L\$outer_pa11 817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+32+4`($fp),$tp 818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$outerdone_pa11 820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi0,$ablo,$ablo 821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$abhi 822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ti1,$ablo,$ablo 823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$abhi,$hi0 824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw 4($tp),$ti0 ; tp[j] 826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$nmlo1,$nmlo1 828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$nmhi1 829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ablo,$nmlo1,$nmlo1 830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$nmhi1,$hi1 831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $nmlo1,-4($tp) ; tp[j-1] 832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $hi1,$hi0,$hi0 834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
Carlstrom addc %r0,%r0,$hi1 835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $ti0,$hi0,$hi0 836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addc %r0,$hi1,$hi1 837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi0,0($tp) 838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stw $hi1,4($tp) 839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+32+4`($fp),$tp 841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub %r0,%r0,%r0 ; clear borrow 842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldw -4($tp),$ti0 843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addl $tp,$arrsz,$tp 844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian CarlstromL\$sub_pa11 845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldwx $idx($np),$hi0 846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subb $ti0,$hi0,$hi1 847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldwx $idx($tp),$ti0 848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom addib,<> 4,$idx,L\$sub_pa11 849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stws,ma $hi1,4($rp) 850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subb $ti0,%r0,$hi1 852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo -4($tp),$tp 853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and $tp,$hi1,$ap 854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom andcm $rp,$hi1,$bp 855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom or $ap,$bp,$np 856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom sub $rp,$arrsz,$rp ; rewind rp 858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom subi 0,$arrsz,$idx 859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ldo `$LOCALS+32`($fp),$tp 860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian 
L\$copy_pa11
	ldwx		$idx($np),$hi0
	stws,ma		%r0,4($tp)
	addib,<>	4,$idx,L\$copy_pa11
	stws,ma		$hi0,4($rp)

	nop					; alignment
L\$done
___
}

# Function epilogue, appended to $code after the block closed above:
# loads 1 into %r28 ("handled"), resets %sp from $fp, reloads %r2 and
# %r4-%r10 from the frame and returns through %r2.  L$abort labels the
# return branch itself, i.e. it sits after the register reloads.
$code.=<<___;
	ldi		1,%r28			; signal "handled"
	ldo		$FRAME($fp),%sp		; destroy tp[num+1]

	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
#
# Every encoder below is called as $encoder->($mod,$args), where $mod is
# the completer text that followed the mnemonic (e.g. ",mb"; may be
# empty) and $args is the operand string.  Each returns one tab-led
# output line: "\t.WORD\t0x<opcode>\t; <original instruction>" when the
# operands match a format the encoder knows, otherwise the original
# instruction unchanged so that the assembler gets to deal with it.

# ldd: register-indexed form (format 4) and short-displacement form
# (format 5).
my $ldd = sub {
    my ($mod, $args) = @_;
    my $orig = "ldd$mod\t$args";

    # format 4: %rX(%rB),%rT
    if (my ($index, $base, $target) =
            $args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        my $opcode = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    # format 5: disp(%rB),%rT
    if (my ($disp, $base, $target) =
            $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
        # Only the low five bits of the displacement are encoded, with
        # bit 4 (the sign) moved below the other four.  The value is not
        # range-checked here.
        $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);   # encode offset
        $opcode |= (1<<5)  if ($mod =~ /^,m/);     # any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);    # ",mb" additionally
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};

# std: short-displacement form only (format 6).
my $std = sub {
    my ($mod, $args) = @_;
    my $orig = "std$mod\t$args";

    # format 6: %rS,disp(%rB)
    if (my ($source, $disp, $base) =
            $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {
        my $opcode = (0x03<<26)|($base<<21)|($source<<16)|(1<<12)|(0xB<<6);
        # Same five-bit, sign-in-lowest-bit layout as in ldd, but placed
        # in the lowest bits of the word.  Not range-checked.
        $opcode |= (($disp&0xF)<<1)|(($disp&0x10)>>4);      # encode offset
        $opcode |= (1<<5)  if ($mod =~ /^,m/);     # any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);    # ",mb" additionally
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};

# extrd: fixed-position form (format 15) and %sar-controlled form
# (format 12).
my $extrd = sub {
    my ($mod, $args) = @_;
    my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...

    # format 15: %rS,pos,len,%rT
    if (my ($source, $pos, $length, $target) =
            $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
        my $opcode = (0x36<<26)|($source<<21)|($target<<16);
        my $len = 32-$length;
        $opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);      # encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);           # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    # format 12: %rS,%sar,len,%rT
    if (my ($source, $length, $target) =
            $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
        my $opcode = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
        my $len = 32-$length;
        $opcode |= (($len&0x20)<<3)|($len&0x1f);           # encode len
        # Bit 13 is set when the completer contains ",=" or ",*=".
        $opcode |= (1<<13) if ($mod =~ /,\**=/);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};

# shrpd: fixed shift amount form (format 14).
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $orig = "shrpd$mod\t$args";

    # format 14: %rA,%rB,sa,%rT
    if (my ($first, $second, $shift, $target) =
            $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {
        my $opcode = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
        my $cpos = 63-$shift;
        $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);    # encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};

# sub: only the ",db" completer is encoded by hand; every other form of
# sub is passed through untouched.
my $sub = sub {
    my ($mod, $args) = @_;
    my $orig = "sub$mod\t$args";

    return "\t".$orig unless $mod eq ",db";

    if (my ($first, $second, $target) =
            $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
        my $opcode = (0x02<<26)|($second<<21)|($first<<16)|$target;
        $opcode |= (1<<10);     # e1
        $opcode |= (1<<8);      # e2
        $opcode |= (1<<5);      # d
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};

# Mnemonic -> encoder.  An explicit table instead of looking the closure
# up by variable name through a string eval on every output line.
my %encoder_for = (
    ldd   => $ldd,
    std   => $std,
    extrd => $extrd,
    shrpd => $shrpd,
    sub   => $sub,
);

# Rewrite one instruction.  $mnemonic is the lower-case opcode name,
# $mod whatever non-blank text was glued to it (completers), $args the
# operand field.  Mnemonics without an encoder are re-emitted as
# "\t$mnemonic$mod\t$args".
sub assemble {
    my ($mnemonic, $mod, $args) = @_;
    my $encoder = $encoder_for{$mnemonic};

    return $encoder ? $encoder->($mod, $args) : "\t$mnemonic$mod\t$args";
}

# Post-process the generated text line by line and write it to STDOUT.
foreach (split("\n",$code)) {
    # evaluate `...` arithmetic embedded in the template
    s/\`([^\`]*)\`/eval $1/ge;
    # flip word order in 64-bit mode...
    s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
    # assemble 2.0 instructions in 32-bit mode...
    s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e if ($BN_SZ==4);

    print $_,"\n";
}

# Buffered write errors are only reported by close; fail loudly rather
# than leave a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";