#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor to something else sha1-ia64.pl module processes one
# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
# byte.

# September 2010
#
# It was originally thought that it makes lesser sense to implement
# "528B" variant on Itanium 2 for following reason. Because number of
# functional units is naturally limited, it appeared impossible to
# implement "528B" loop in 4 cycles, only in 5. This would mean that
# theoretically performance improvement couldn't be more than 20%.
# But occasionally you prove yourself wrong:-) I figured out a way to
# fold couple of instructions and having freed yet another instruction
# slot by unrolling the loop... Resulting performance is 4.45 cycles
# per processed byte and 50% better than "256B" version. On original
# Itanium performance should remain the same as the "256B" version,
# i.e. ~8.5 cycles.

# ---------------------------------------------------------------------
# Command-line handling.
#
# The first argument, when present and true, names the file the
# generated assembly is written to; STDOUT is re-opened onto it.  Any
# remaining arguments are treated as compiler flags and are scanned
# below for the pointer-size and endianness selectors.
# ---------------------------------------------------------------------
$output=shift;
if ($output) {
    # Three-argument open: the file name is taken literally, so names
    # containing characters such as '&' or leading blanks cannot be
    # misread as part of the open mode.
    open STDOUT,'>',$output or die "can't open $output: $!";
}

# Returns 1 when $flag contains "+DD64" or "-mlp64", 0 otherwise.
# (Presumably these are the HP-UX compiler switches that select 64-bit
# pointers -- confirm against the build configuration.)
#
# The pattern is an alternation on purpose.  The previous spelling,
# /[\+DD|\-mlp]64/, was a character class and therefore matched ANY one
# of the characters + D | - m l p followed by "64" (e.g. "-m64" or
# "-DFOO_D64"), which could wrongly switch $ADDP to "add".
sub _is_lp64_flag {
    my ($flag)=@_;
    return ($flag =~ /(?:\+DD|-mlp)64/) ? 1 : 0;
}

# $ADDP is the instruction used to form addresses from the incoming
# pointer arguments: "addp4" on HP-UX unless a 64-bit flag is present,
# plain "add" everywhere else.
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if _is_lp64_flag($_); }
} else {
    $ADDP="add";
}

# Endianness: an explicit -DB_ENDIAN/-DL_ENDIAN wins (the later flag
# overrides the earlier one); otherwise fall back to the byte order of
# the machine running this script.
for (@ARGV) {
    $big_endian=1 if (/\-DB_ENDIAN/);
    $big_endian=0 if (/\-DL_ENDIAN/);
}
if (!defined($big_endian)) {
    $big_endian=(unpack('L',pack('N',1))==1);
}

# loop(LABEL, MASK_INP)
#
# Appends the software-pipelined "4-bit" multiplication loop to $code.
#   LABEL    - assembler label placed at the top of the loop and used as
#              the br.ctop target.
#   MASK_INP - when true, the two instructions that touch the input
#              stream (the ld1 from [inp] and the xor with in[1]) are
#              predicated on p63 instead of p16/p17, i.e. references to
#              inp are masked out.
# Returns nothing useful; the only effect is the text appended to $code.
#
# No prototype is declared: the sub takes arguments, and an empty
# prototype would only force callers to use the &loop(...) form.
sub loop {
    my $label=shift;
    my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17");	# mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
    $code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
    return;
}

# ---------------------------------------------------------------------
# gcm_gmult_4bit: single multiplication.
#
# $code is (re)initialised here with the register aliases shared by the
# generated code, followed by the function prologue and loop set-up.
# in0 is used as the Xi pointer and in1 as the Htbl pointer; rem_4bitp
# is derived from ip plus the link-time distance rem_4bit#-gcm_gmult_4bit#.
# ---------------------------------------------------------------------
$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16					// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
					};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
# gmult has no input stream, hence the inp references are masked.
loop(".Loop1",1);
# Epilogue: fold in the last pending Hhi, byte-reverse both halves with
# mux1 \@rev, store them back through Xi and restore pr/ar.lc.
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
# Symbolic register names that are interpolated into the generated
# gcm_ghash_4bit code.
($Xip,$Htbl,$inp,$len)=("in0","in1","in2","in3");
($rem_8bit,$mask0xff)=("loc0","loc1");
# When generating for big-endian, the sum/rum mnemonics are replaced by
# nop.m.
if ($big_endian) { ($sum,$rum)=("nop.m","nop.m"); }
else             { ($sum,$rum)=("sum","rum"); }

# load_htable(EXTRA...)
#
# Appends eight groups of four loads to $code: ld8 into the r16.. pair
# through r8/r9 and ldf8 into the f32.. pair through r10/r11, all with
# a post-increment of 16.  The register numbers are left as back-quoted
# arithmetic for a later pass to evaluate.
#
# Each EXTRA argument is a ready-made instruction line.  One is consumed
# per iteration whenever (iteration index + index of the last remaining
# argument) equals 7, so up to eight extras are slotted, one each, into
# the final iterations just before the bundle-closing "};;".
sub load_htable() {
    my @pending=@_;
    foreach my $i (0..7) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	if (($i+$#pending)==7) {
	    $code.=shift(@pending);
	}
	$code.="\t};;\n";
    }
}

Carlstrom$code.=<<___; 176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromprevsp=r3; 177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 178392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 32 179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.skip 16 // aligns loop body 180392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.global gcm_ghash_4bit# 181392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.proc gcm_ghash_4bit# 182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromgcm_ghash_4bit: 183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .prologue 184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; .save ar.pfs,prevfs 185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom alloc prevfs=ar.pfs,4,2,0,0 186392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .vframe prevsp 187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov prevsp=sp 188392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $rem_8bit=ip };; 189392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom .body 190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; $ADDP r8=0+0,$Htbl 191392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom $ADDP r9=0+8,$Htbl } 192392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; $ADDP r10=128+0,$Htbl 193392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom $ADDP r11=128+8,$Htbl };; 194392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 195392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom &load_htable( 196392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom " $ADDP $Xip=15,$Xip", # &Xi[15] 197392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom " $ADDP $len=$len,$inp", # &inp[len] 198392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom " $ADDP $inp=15,$inp", # &inp[15] 199392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom " mov $mask0xff=0xff", 200392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom " add sp=-512,sp", 
201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom " andcm sp=sp,$mask0xff", # align stack frame 202392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom " add r14=0,sp", 203392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom " add r15=8,sp"); 204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 205392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; $sum 1<<1 // go big-endian 206392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add r8=256+0,sp 207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add r9=256+8,sp } 208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; add r10=256+128+0,sp 209392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add r11=256+128+8,sp 210392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $len=-17,$len };; 211392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 212392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromfor($i=0;$i<8;$i++) { # generate first half of Hshr4[] 213392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); 214392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 215392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo 216392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom st8 [r9]=$rhi,16 // Htable[$i].hi 217392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shrp $rlo=$rhi,$rlo,4 }//;; 218392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo 219392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi 220392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr.u $rhi=$rhi,4 };; 221392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 222392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 
223392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 224392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom} 225392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 226392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 r16=[r8],16 // Htable[8].lo 227392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld8 r17=[r9],16 };; // Htable[8].hi 228392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 r18=[r8],16 // Htable[9].lo 229392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld8 r19=[r9],16 } // Htable[9].hi 230392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; rum 1<<5 // clear um.mfh 231392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shrp r16=r17,r16,4 };; 232392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 233392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromfor($i=0;$i<6;$i++) { # generate second half of Hshr4[] 234392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 235392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo 236392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi 237392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; 238392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 239392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 240392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } 241392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 242392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom} 243392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; 244392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; 245392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ 
.mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 246392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 247392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } 248392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; add $Htbl=256,sp // &Htable[0] 249392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit 250392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };; 251392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4 252392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4 253392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 255392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$in="r15"; 256392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom@xi=("r16","r17"); 257392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom@rem=("r18","r19"); 258392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); 259392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom($Atbl,$Btbl)=("r26","r27"); 260392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 261392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; # (p16) 262392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp-- 263392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- 264392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom cmp.eq p0,p6=r0,r0 };; // clear p6 265392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 266392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrompush (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers 
267392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; # (p16),(p17) 269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- 270392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] 271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mii; ld1 $in=[$inp],-1 //(p16) *inp-- 272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo 273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 32 275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LOOP: 276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; 277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6) st8 [$Xip]=$Zhi,13 278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zlo=$Zlo,$Zlo 279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo 280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrompush (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers 282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; # (p16),(p17),(p18) 284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi 285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo 286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] 287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi 288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo 289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo 291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi 292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld1 $in=[$inp],-1 } //(p16) *inp-- 293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) 294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi 295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi 297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- 298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) 299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff 300392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] 301392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 302392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrompush (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers 303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 304392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromfor ($i=1;$i<14;$i++) { 305392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Above and below fragments are derived from this one by removing 306392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# unsuitable (p??) instructions. 
307392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; # (p16),(p17),(p18),(p19) 308392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi 309392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo 310392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 311392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] 312392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo 313392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] 314392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi 315392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] 316392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo 317392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 318392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo 319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi 320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi 321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld1 $in=[$inp],-1 //(p16) *inp-- 322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) 324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi 
325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi 327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- 328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) 329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff 330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] 332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrompush (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers 334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom} 335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; # (p17),(p18),(p19) 337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi 338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo 339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] 341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo 342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] 343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi 
344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] 345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo 346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo 348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi 349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi 350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) 352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi 353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi 355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) 356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff 357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] 359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrompush (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers 361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom 
# Tail of the byte loop: assembly text for stages (p18) and (p19) only.
# Compared with the preceding "(p17),(p18),(p19)" heredoc, this one
#   - contains no (p17) work: nothing here reads or writes $xi[] or $in;
#   - loads $Blo/$Bhi directly through [$Btbl] (its own notes label them
#     Htable[nhi]) and has no "-256" post-decrement on $Btbl;
#   - in its (p18) part forms $rem[0] from Z.lo<<4 and shifts Z with
#     "shrp ...,4", while the (p19) part still does "shr.u $Zhi=$Zhi,8";
#   - writes every bundle with the .mfi template and two instructions.
# The heredoc is interpolating (<<___ without quotes), so $Zlo, $rem[0],
# etc. are expanded everywhere in the text, including after "//".
# $rem[0]/$rem[1] are whatever names currently sit first in @rem; the
# statement that follows this heredoc rotates @xi and @rem.
# NOTE(review): the note on the "and $rem[0]=$rem[0],$mask0xff" bundle still
# reads rem=(Z.lo^(Htable[nhi].lo<<4))&0xff, yet in this heredoc $rem[0] is
# produced by "shladd $rem[0]=$Zlo,4,r0" -- looks carried over from the
# earlier stages; confirm before relying on it.
# NOTE(review): the line below is a collapsed blame rendering (line number,
# commit hash and author fused in front of each original source line).
362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___; # (p18),(p19) 363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi 364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] 366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo 367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi 368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo 369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] 370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi 371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi 372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4 374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi 375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi 376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4) 377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff 378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48 379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___ 
# Advance the register-name windows: the first entries of @xi and @rem move
# to the back, so $xi[0]/$rem[0] in the next heredoc expand to the names
# that were $xi[1]/$rem[1] in the previous one.
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

# Last stage, (p19) only, followed by the loop-back and the function exit.
#  - p6 is set by "cmp.ltu p6,p0=$inp,$len"; the (p6)-predicated
#    instructions (tagged [p16]/[p17] in their notes) and the branch to
#    .LOOP execute only while p6 holds.
#  - $Xip is bumped by 9 (noted as &Xi.lo); Z.lo is stored through it with
#    a -8 post-decrement on every pass, Z.hi is stored once the branch
#    falls through.
#  - The exit path emits "$rum 1<<1" (noted as the return to little-endian),
#    restores sp from prevsp and returns through b0.
# The heredoc is interpolating, so "$xi[i]" inside a note is expanded by
# Perl as well.
$code.=<<___; # (p19)
{ .mmi; cmp.ltu p6,p0=$inp,$len
        add $inp=32,$inp
        shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
        xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
        add $Xip=9,$Xip };; // &Xi.lo
{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
{ .mmi; st8 [$Xip]=$Zlo,-8
(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
        shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
{ .mmi;
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
        xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
{ .mib;
(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
(p6) br.cond.dptk.many .LOOP };;

{ .mib; st8 [$Xip]=$Zhi };;
{ .mib; $rum 1<<1 // return to little-endian
        .restore sp
        mov sp=prevsp
        br.ret.sptk.many b0 };;
.endp gcm_ghash_4bit#
___

# Constant tables emitted after the function body.
#  - rem_4bit: 16 data8 entries, each a 16-bit constant shifted left by 48
#    (.size says 128 bytes).
#  - rem_8bit: 256 two-byte entries written with data1, high byte first
#    (.size says 512 bytes).  The stage code scales an index by 2 with
#    "shladd ...,1,$rem_8bit" and reads the entry with ld2; presumably
#    $rem_8bit holds this table's address -- set up outside this section.
# "\@" keeps Perl from interpolating an array inside the heredoc.
$code.=<<___;
.align 128
.type rem_4bit#,\@object
rem_4bit:
        data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size rem_4bit#,128
.type rem_8bit#,\@object
rem_8bit:
        data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
        data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
        data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
        data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
        data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
        data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
        data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
        data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
        data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
        data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
        data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
        data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
        data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
        data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
        data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
        data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
        data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
        data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
        data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
        data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
        data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
        data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
        data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
        data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
        data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
        data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
        data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
        data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
        data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
        data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
        data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
        data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size rem_8bit#,512
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

# When $big_endian is set, every "mux1 <ws> <operand>@rev" in the generated
# text is replaced by "nop.i <ws> 0x0"; the captured whitespace is reused.
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
# Each back-quoted span is replaced by the result of eval'ing its contents
# as Perl.  The text comes from this script itself, not from external input.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Output is buffered, so a failed write (full disk, closed pipe) is reported
# only when the handle is closed.  Fail loudly instead of exiting 0 with a
# truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";