1392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#!/usr/bin/env perl
2392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
3392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# ====================================================================
4392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# project. The module is, however, dual licensed under OpenSSL and
6392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CRYPTOGAMS licenses depending on where you obtain it. For further
7392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# details see http://www.openssl.org/~appro/cryptogams/.
8392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# ====================================================================
9392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
10392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# March 2010
11392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
12392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# The module implements "4-bit" GCM GHASH function and underlying
13392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# single multiplication operation in GF(2^128). "4-bit" means that it
14392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
15392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# GHASH performance was measured to be 6.67 cycles per processed byte
16392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# on Itanium 2, which is >90% better than Microsoft compiler generated
17392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# code. To anchor to something else sha1-ia64.pl module processes one
18392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
19392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# byte.
20392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
21392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# September 2010
22392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
23392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# It was originally thought that it makes lesser sense to implement
24392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# "528B" variant on Itanium 2 for following reason. Because number of
25392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# functional units is naturally limited, it appeared impossible to
26392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# implement "528B" loop in 4 cycles, only in 5. This would mean that
27392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# theoretically performance improvement couldn't be more than 20%.
28392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# But occasionally you prove yourself wrong:-) I figured out a way to
29392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# fold couple of instructions and having freed yet another instruction
30392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# slot by unrolling the loop... Resulting performance is 4.45 cycles
31392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# per processed byte and 50% better than "256B" version. On original
32392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Itanium performance should remain the same as the "256B" version,
33392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# i.e. ~8.5 cycles.
34392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Optional first argument names the output file.  When it is given (and
# true), STDOUT is reopened onto it so that whatever is printed later
# lands in that file.  The three-argument open keeps characters in the
# file name from being interpreted as part of the open mode.
$output=shift;
if ($output) {
    open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# $ADDP is the mnemonic used for pointer adjustments in the generated
# code.  It is "add" everywhere except on HP-UX, where it is "addp4"
# unless one of the remaining arguments contains +DD64 or -mlp64.
# (The previous pattern was a character class, so it matched any single
# one of the characters + D | - m l p followed by "64".)
$ADDP="add";
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
}

# Target byte order: an explicit -DB_ENDIAN / -DL_ENDIAN argument wins
# (the later of the two, if both appear in one argument, is L_ENDIAN).
for (@ARGV) {
    $big_endian=1 if (/\-DB_ENDIAN/);
    $big_endian=0 if (/\-DL_ENDIAN/);
}
# Without either flag, fall back to the byte order of the machine that
# runs this script: a network-order 1 reads back as 1 only when the
# native 32-bit layout is big-endian.
if (!defined($big_endian)) {
    $big_endian=(unpack('L',pack('N',1))==1);
}
45392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# loop(LABEL [, MASK_INP])
#
# Appends one copy of the modulo-scheduled "4-bit" multiplication loop
# to the global $code buffer.
#
#   LABEL     assembler label placed in front of the loop; the closing
#             br.ctop.sptk branches back to it.
#   MASK_INP  when true, the two instructions that touch inp/in[] are
#             predicated on p63 instead of p16/p17 ("mask references to
#             inp").  NOTE(review): presumably p63 is never set while
#             the loop runs, so those instructions become no-ops --
#             confirm against the caller's pr.rot/ar.ec setup.
#
# The sub is declared without a prototype: it takes arguments, and the
# former empty prototype "()" only worked because callers use &loop(...),
# which bypasses prototype checking.
sub loop {
    my ($label,$mask_inp)=@_;
    my ($p16,$p17)=$mask_inp?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}
88392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Start the output buffer ($code is assigned here, not appended to) with
# the assembler preamble, the register aliases shared by the routines,
# and the prologue of gcm_gmult_4bit.  The prologue loads Xi[15] and
# Xi[14], programs the rotating predicates / loop count / epilogue count
# and turns rem_4bitp (initially ip) into the address of rem_4bit.
$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16					// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
					};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
	# Loop body labelled .Loop1; the true second argument masks the
	# instructions that reference inp, which this routine does not have.
	# Called with & because loop is declared with an empty prototype.
	&loop	(".Loop1",1);
# Epilogue: fold in the last Hhi, byte-reverse both halves with mux1,
# store them back through Xi and restore pr / ar.lc before returning.
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___
######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
# Register names interpolated into the gcm_ghash_4bit code below: the
# four input registers in0..in3 and two locals, loc0 and loc1.
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
# Mnemonics interpolated where the generated code emits "$sum 1<<1"
# ("go big-endian").  When $big_endian is true both are replaced by
# nop.m, so no user-mask switch is emitted.
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
161392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# load_htable(INSN, ...)
#
# Appends eight pairs of bundles to $code.  Pair $i loads 16 bytes
# through r8/r9 into general registers r(16+2*$i) and r(16+2*$i+1), and
# 16 bytes through r10/r11 into floating-point registers f(32+2*$i) and
# f(32+2*$i+1); all four pointers are post-incremented by 16.  (The
# back-quoted expressions are emitted literally by this sub.)
#
# Each argument is the text of one extra instruction.  The arguments are
# consumed one per iteration and placed in the open third slot of the
# second bundle; with N arguments (N <= 8) they go into the last N
# iterations, because the test below first succeeds when $i == 8-N and
# keeps succeeding as both $i and the remaining count move by one.
#
# Declared without a prototype: the sub takes arguments, and the former
# empty prototype "()" only worked because callers use &load_htable(...).
sub load_htable {
    for my $i (0..7) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	$code.=shift	if (($i+$#_)==7);
	# close the second bundle and end the instruction group
	$code.="\t};;\n";
    }
}
174392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Prologue of gcm_ghash_4bit: save ar.pfs and sp, remember ip in
# $rem_8bit, and point r8/r9 at the first half of the caller's Htable
# and r10/r11 at offset 128 of it.
$code.=<<___;
prevsp=r3;

.align	32
.skip	16					// aligns loop body
.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,2,0,0
	.vframe	prevsp
	mov	prevsp=sp
	mov	$rem_8bit=ip		};;
	.body
{ .mfi;	$ADDP	r8=0+0,$Htbl
	$ADDP	r9=0+8,$Htbl		}
{ .mfi;	$ADDP	r10=128+0,$Htbl
	$ADDP	r11=128+8,$Htbl		};;
___
	# Emit the table loads; the eight strings are extra instructions
	# that load_htable places one per iteration next to the loads.
	# Together they set up the Xi/inp pointers and carve a 512-byte
	# stack frame with its low 8 address bits cleared.
	&load_htable(
	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
	"	$ADDP	$len=$len,$inp",	# &inp[len]
	"	$ADDP	$inp=15,$inp",		# &inp[15]
	"	mov	$mask0xff=0xff",
	"	add	sp=-512,sp",
	"	andcm	sp=sp,$mask0xff",	# align stack frame
	"	add	r14=0,sp",
	"	add	r15=8,sp");
# Re-point r8..r11 at offset 256 of the new stack frame, mirroring the
# 0 / 8 / 128 / 136 offsets used for the loads above.
$code.=<<___;
{ .mmi;	$sum	1<<1				// go big-endian
	add	r8=256+0,sp
	add	r9=256+8,sp		}
{ .mmi;	add	r10=256+128+0,sp
	add	r11=256+128+8,sp
	add	$len=-17,$len		};;
___
# For each of the eight register pairs r16/r17 .. r30/r31: store the
# pair through r8/r9 and the matching f-register pair through r10/r11,
# shift the 128-bit value held in the pair right by 4 bits (shrp for the
# low half, shr.u for the high half) and store the shifted value through
# r14/r15.  $i is deliberately the file-level variable, as in the
# neighbouring loops.
for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
# Register names are built with plain arithmetic; a string eval of a
# number adds nothing.
my ($rlo,$rhi)=("r".(16+2*$i),"r".(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16			// Htable[$i].lo
	st8	[r9]=$rhi,16			// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16		// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16		// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4		};;
{ .mmi;	st8	[r14]=$rlo,16			// Htable[$i].lo>>4
	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
___
}
# Second half of Hshr4[]: Htable[8..15] are read back through r8/r9 into
# r16..r31, shifted right by 4 bits and stored through r14/r15.  Loads,
# shifts and stores of neighbouring entries are interleaved, so the
# first entries are started before the loop and the last ones finished
# after it.
$code.=<<___;
{ .mmi;	ld8	r16=[r8],16			// Htable[8].lo
	ld8	r17=[r9],16		};;	// Htable[8].hi
{ .mmi;	ld8	r18=[r8],16			// Htable[9].lo
	ld8	r19=[r9],16		}	// Htable[9].hi
{ .mmi;	rum	1<<5				// clear um.mfh
	shrp	r16=r17,r16,4		};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16		// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16		// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
# $i is 6 here and is still interpolated below: r`16+2*$i`/r`16+2*$i+1`
# name the pair loaded as Htable[14] and r`18+2*$i`/r`18+2*$i+1` the
# pair loaded as Htable[15].  The comments on the final two stores
# therefore use `9+$i`.  This fragment also re-points $Htbl at offset
# 256 of the stack frame and turns $rem_8bit into the address of
# rem_8bit.
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp			// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`		// Htable[`9+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`9+$i`].hi>>4
___
254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Register assignment for the unrolled GHASH loop.  @xi and @rem each
# hold two names that are swapped ("rotated") after every emitted stage,
# so $xi[0]/$rem[0] in one stage is $xi[1]/$rem[1] in the next.
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

# Stage tags such as (p16) in the comments label which pipeline stage an
# instruction belongs to; this stage only fetches the first input and Xi
# bytes and, per its comment, clears p6.
$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0		};;	//	clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Second fill stage, followed by the .LOOP label: the bundle at .LOOP
# stores $Zhi through $Xip under p6 and zeroes $Zlo.
$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
# First stage that carries (p18) work: the Htable[nlo] and Htable[nhi]
# lookups start and Z is seeded from them.  It has no (p19) instructions
# yet, and $Zhi is set with mov rather than xor.
$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1		}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Steady state: 13 copies of the full four-stage fragment.  $i is only a
# counter and is not interpolated into the text; what differs between
# copies is the register names, because @xi and @rem are rotated at the
# end of every iteration.
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1			//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}
335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# First drain stage: the steady-state fragment with the (p16) loads of
# the next input and Xi bytes removed.
$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Append the chunk tagged (p18),(p19): no (p17) instructions are emitted
# here, i.e. no further Xi/input byte is combined.  Visible differences from
# the (p17),(p18),(p19) chunk before it:
#   - every bundle uses the .mfi template with two instructions written out;
#   - $Btbl is read without the -256 post-decrement, so $Blo/$Bhi come from
#     the address computed by the earlier "add $Btbl=...,$Htbl" itself;
#   - Z is shifted right by 4 (shrp ...,4) instead of 8;
#   - the remainder index is (Z.lo<<4) masked with $mask0xff.
# NOTE(review): the assembly comment on the final "and" still quotes the
# previous chunk's formula (Z.lo^(Htable[nhi].lo<<4)); the code in this chunk
# feeds it from "shladd $rem[0]=$Zlo,4,r0" only.
# After the heredoc the first element of @xi and of @rem is moved to the end
# again, so $rem[1] in the next chunk is the $rem[0] written here.
362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;	# (p18),(p19)
363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrompush (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Append the last chunk, tagged (p19), together with the loop-back test and
# the epilogue that closes gcm_ghash_4bit (.endp):
#   - cmp.ltu sets p6 from $inp < $len, and $inp is advanced by 32;
#   - the (p19) instructions finish Z: Z.hi>>=4, xor of $Blo/$Bhi into Z, and
#     xor of the rem_8bit entry (index scaled by 2, ld2, then <<48) into Z.hi;
#   - $Xip is advanced by 9 and Z.lo is stored through it with a -8
#     post-decrement;
#   - instructions predicated on p6 reload $in from [$inp], take two bytes
#     from Z.lo (and with $mask0xff; extr.u ...,8,8), rebuild $Atbl, and
#     branch to .LOOP, a label defined outside this view;
#   - on fall-through Z.hi is stored through $Xip, "$rum 1<<1" is issued, sp
#     is reloaded from prevsp and the function returns through b0.
# NOTE(review): $rum and prevsp are set up earlier in the file; the in-line
# comment says the $rum instruction returns to little-endian -- confirm there.
382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;	# (p19)
383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	cmp.ltu	p6,p0=$inp,$len
384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$inp=32,$inp
385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	$Xip=9,$Xip		};;	//	&Xi.lo
389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;	st8	[$Xip]=$Zlo,-8
395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mmi;
398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mib;
402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom(p6)	br.cond.dptk.many	.LOOP	};;
404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mib;	st8	[$Xip]=$Zhi		};;
406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ .mib;	$rum	1<<1				// return to little-endian
407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.restore	sp
408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	sp=prevsp
409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	br.ret.sptk.many	b0	};;
410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.endp	gcm_ghash_4bit#
411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Append the constant tables and the identification string, starting on a
# 128-byte boundary (.align 128):
#   rem_4bit - 16 data8 words, each a 16-bit constant shifted left by 48
#              (16 * 8 = 128 bytes, matching .size).
#   rem_8bit - 256 two-byte entries written as data1 pairs, high byte first
#              (32 rows * 8 pairs * 2 = 512 bytes, matching .size).  The code
#              above indexes it with "shladd rem,1,$rem_8bit", reads it with
#              ld2 and shifts the value left by 48 before xoring into Z.hi.
# The heredoc interpolates, so "@" is written as \@ in "\@object" and in the
# e-mail address to keep Perl from treating it as an array.
412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	128
414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	rem_4bit#,\@object
415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromrem_4bit:
416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	rem_4bit#,128
421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	rem_8bit#,\@object
422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromrem_8bit:
423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	rem_8bit#,512
456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromstringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Post-process the accumulated assembly text and write it to STDOUT.

# When $big_endian is set, every "mux1 <operands>@rev" instruction is replaced
# by "nop.i 0x0" (the whitespace after the mnemonic is preserved via $1).
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);

# Replace each `...` span with the result of evaluating its contents as Perl,
# so expressions embedded in the heredocs are folded before output.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;

# Buffered write errors (full disk, closed pipe) only surface at close;
# fail loudly instead of leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
464