ghash-ia64.pl revision 392aa7cc7d2b122614c5393c3e357da07fd07af3
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
15# GHASH performance was measured to be 6.67 cycles per processed byte
16# on Itanium 2, which is >90% better than Microsoft compiler generated
17# code. To anchor to something else sha1-ia64.pl module processes one
18# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
19# byte.
20
21# September 2010
22#
# It was originally thought that it made little sense to implement
# the "528B" variant on Itanium 2 for the following reason. Because the
# number of functional units is naturally limited, it appeared
# impossible to implement the "528B" loop in 4 cycles, only in 5. This
# would mean that theoretically the performance improvement couldn't
# be more than 20%. But occasionally you prove yourself wrong:-) I
# figured out a way to fold a couple of instructions and to free yet
# another instruction slot by unrolling the loop... The resulting
# performance is 4.45 cycles per processed byte, which is 50% better
# than the "256B" version. On the original Itanium, performance should
# remain the same as for the "256B" version, i.e. ~8.5 cycles.
34
# An optional first argument names the output file; STDOUT is reopened
# onto it.  The three-argument form of open keeps characters in the file
# name from being interpreted as part of the open mode.
$output=shift and (open STDOUT,">",$output or die "can't open $output: $!");

# On HP-UX addresses are formed with addp4 unless one of the 64-bit
# data-model flags (+DD64 or -mlp64) is found among the remaining
# arguments; everywhere else a plain add is used.  The pattern is an
# alternation, so only those two spellings select "add".
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else { $ADDP="add"; }

# -DB_ENDIAN / -DL_ENDIAN among the arguments decide the byte order;
# when neither is given, the byte order of the machine running this
# script is used instead.
for (@ARGV) {
    $big_endian=1 if (/-DB_ENDIAN/);
    $big_endian=0 if (/-DL_ENDIAN/);
}
$big_endian=(unpack('L',pack('N',1))==1) if (!defined($big_endian));
45
# loop(LABEL, MASK_INP) appends the modulo-scheduled GHASH inner loop to
# $code.
#   LABEL    - assembler label emitted at the top of the loop; it is also
#              the target of the closing br.ctop.
#   MASK_INP - when true, the two instructions that reference the input
#              stream (the ld1 from inp and the xor with in[1]) are
#              predicated on p63 instead of p16/p17.
# No prototype is declared: the routine takes arguments, and an empty
# prototype would reject any call written without a leading &.
sub loop {
my ($label,$mask_inp)=@_;
my ($p16,$p17)=$mask_inp?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}
88
89$code=<<___;
90.explicit
91.text
92
93prevfs=r2;	prevlc=r3;	prevpr=r8;
94mask0xf0=r21;
95rem=r22;	rem_4bitp=r23;
96Xi=r24;		Htbl=r25;
97inp=r26;	end=r27;
98Hhi=r28;	Hlo=r29;
99Zhi=r30;	Zlo=r31;
100
101.align	128
102.skip	16					// aligns loop body
103.global	gcm_gmult_4bit#
104.proc	gcm_gmult_4bit#
105gcm_gmult_4bit:
106	.prologue
107{ .mmi;	.save	ar.pfs,prevfs
108	alloc	prevfs=ar.pfs,2,6,0,8
109	$ADDP	Xi=15,in0			// &Xi[15]
110	mov	rem_4bitp=ip		}
111{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
112	.save	ar.lc,prevlc
113	mov	prevlc=ar.lc
114	.save	pr,prevpr
115	mov	prevpr=pr		};;
116
117	.body
118	.rotr	in[3],xi[3],Hi[2]
119
120{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
121	mov	mask0xf0=0xf0
122	brp.loop.imp	.Loop1,.Lend1-16};;
123{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
124					};;
125{ .mii;	shladd	Hi[1]=xi[2],4,r0
126	mov	pr.rot=0x7<<16
127	mov	ar.lc=13		};;
128{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
129	mov	ar.ec=3
130	xor	Zlo=Zlo,Zlo		};;
131{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
132	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
133	xor	Zhi=Zhi,Zhi		};;
134___
135	&loop	(".Loop1",1);
136$code.=<<___;
137.Lend1:
138{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
139{ .mib;	mux1	Zlo=Zlo,\@rev		};;
140{ .mib;	mux1	Zhi=Zhi,\@rev		};;
141{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
142	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
143{ .mib;	st8	[Hlo]=Zlo
144	mov	pr=prevpr,0x1ffff	};;
145{ .mib;	st8	[Hhi]=Zhi
146	mov	ar.lc=prevlc
147	br.ret.sptk.many	b0	};;
148.endp	gcm_gmult_4bit#
149___
150
151######################################################################
# "528B" (well, "512B" actually) streamed GHASH
153#
# Names used by the streamed routine's templates: the four incoming
# argument registers followed by two local registers.
($Xip,$Htbl,$inp,$len)=("in0","in1","in2","in3");
($rem_8bit,$mask0xff)=("loc0","loc1");
# $sum/$rum expand to the real sum/rum mnemonics on little-endian
# builds and to nop.m on big-endian ones.
if ($big_endian)	{ ($sum,$rum)=("nop.m","nop.m"); }
else			{ ($sum,$rum)=("sum","rum"); }
161
# load_htable(INSTR, ...) appends eight pairs of bundles to $code that
# load Htable[0..7] into r16-r31 and Htable[8..15] into f32-f47 through
# the post-incremented pointers r8-r11.
#
# Each INSTR is a ready-made instruction line that is dropped into the
# otherwise empty third slot of the second bundle of an iteration.  The
# instructions are right-aligned over the eight iterations: with eight
# of them every iteration receives one, with fewer the leading
# iterations receive none, and with more than eight none are used.
#
# No prototype is declared: the routine takes arguments, and an empty
# prototype would reject any call written without a leading &.
sub load_htable {
    my @fill=@_;
    my $first=8-@fill;		# iteration that receives $fill[0]
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	$code.=$fill[$i-$first]	if ($first>=0 && $i>=$first);
	$code.="\t};;\n";
    }
}
174
175$code.=<<___;
176prevsp=r3;
177
178.align	32
179.skip	16					// aligns loop body
180.global	gcm_ghash_4bit#
181.proc	gcm_ghash_4bit#
182gcm_ghash_4bit:
183	.prologue
184{ .mmi;	.save	ar.pfs,prevfs
185	alloc	prevfs=ar.pfs,4,2,0,0
186	.vframe	prevsp
187	mov	prevsp=sp
188	mov	$rem_8bit=ip		};;
189	.body
190{ .mfi;	$ADDP	r8=0+0,$Htbl
191	$ADDP	r9=0+8,$Htbl		}
192{ .mfi;	$ADDP	r10=128+0,$Htbl
193	$ADDP	r11=128+8,$Htbl		};;
194___
195	&load_htable(
196	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
197	"	$ADDP	$len=$len,$inp",	# &inp[len]
198	"	$ADDP	$inp=15,$inp",		# &inp[15]
199	"	mov	$mask0xff=0xff",
200	"	add	sp=-512,sp",
201	"	andcm	sp=sp,$mask0xff",	# align stack frame
202	"	add	r14=0,sp",
203	"	add	r15=8,sp");
204$code.=<<___;
205{ .mmi;	$sum	1<<1				// go big-endian
206	add	r8=256+0,sp
207	add	r9=256+8,sp		}
208{ .mmi;	add	r10=256+128+0,sp
209	add	r11=256+128+8,sp
210	add	$len=-17,$len		};;
211___
# Generate the first half of Hshr4[]. For each of the eight integer
# register pairs r16/r17 .. r30/r31 the template stores the pair through
# r8/r9, stores the matching floating-point pair through r10/r11, shifts
# the 128-bit integer value right by four bits and stores the result
# through r14/r15.
for($i=0;$i<8;$i++) {
# Plain arithmetic yields the register numbers; no string eval needed.
my ($rlo,$rhi)=("r".(16+2*$i),"r".(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16			// Htable[$i].lo
	st8	[r9]=$rhi,16			// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16		// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16		// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4		};;
{ .mmi;	st8	[r14]=$rlo,16			// Htable[$i].lo>>4
	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
___
}
225$code.=<<___;
226{ .mmi;	ld8	r16=[r8],16			// Htable[8].lo
227	ld8	r17=[r9],16		};;	// Htable[8].hi
228{ .mmi;	ld8	r18=[r8],16			// Htable[9].lo
229	ld8	r19=[r9],16		}	// Htable[9].hi
230{ .mmi;	rum	1<<5				// clear um.mfh
231	shrp	r16=r17,r16,4		};;
232___
233for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
234$code.=<<___;
235{ .mmi;	ld8	r`20+2*$i`=[r8],16		// Htable[`10+$i`].lo
236	ld8	r`20+2*$i+1`=[r9],16		// Htable[`10+$i`].hi
237	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
238{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
239	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
240	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
241___
242}
243$code.=<<___;
244{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
245{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
246	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
247	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
248{ .mmi;	add	$Htbl=256,sp			// &Htable[0]
249	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
250	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
251{ .mmi;	st8	[r14]=r`18+2*$i`		// Htable[`8+$i`].lo>>4
252	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
253___
254
# Register names substituted into the unrolled streamed-loop templates.
# @xi and @rem each hold two names.
$in="r15";
@xi=map { "r$_" } 16,17;
@rem=map { "r$_" } 18,19;
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=map { "r$_" } 20..25;
($Atbl,$Btbl)=map { "r$_" } 26,27;
260
261$code.=<<___;	# (p16)
262{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
263	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
264	cmp.eq	p0,p6=r0,r0		};;	//	clear p6
265___
266push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
267
268$code.=<<___;	# (p16),(p17)
269{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
270	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
271{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
272	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
273	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
274.align	32
275.LOOP:
276{ .mmi;
277(p6)	st8	[$Xip]=$Zhi,13
278	xor	$Zlo=$Zlo,$Zlo
279	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
280___
281push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
282
283$code.=<<___;	# (p16),(p17),(p18)
284{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
285	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
286	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
287{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
288	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
289{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
290	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
291{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
292	ld1	$in=[$inp],-1		}	//(p16) *inp--
293{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
294	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
295	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
296{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
297	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
298	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
299{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
300	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
301___
302push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
303
304for ($i=1;$i<14;$i++) {
305# Above and below fragments are derived from this one by removing
306# unsuitable (p??) instructions.
307$code.=<<___;	# (p16),(p17),(p18),(p19)
308{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
309	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
310	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
311{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
312	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
313	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
314{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
315	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
316	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
317{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
318	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
319	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
320{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
321	ld1	$in=[$inp],-1			//(p16) *inp--
322	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
323{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
324	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
325	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
326{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
327	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
328	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
329{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
330	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
331	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
332___
333push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
334}
335
336$code.=<<___;	# (p17),(p18),(p19)
337{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
338	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
339	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
340{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
341	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
342	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
343{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
344	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
345	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
346{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
347	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
348	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
349{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
350	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
351{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
352	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
353	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
354{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
355	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
356{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
357	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
358	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
359___
360push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
361
362$code.=<<___;	# (p18),(p19)
363{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
364	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
365{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
366	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
367{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
368	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
369{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
370	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
371{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
372	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
373{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
374	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
375{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
376	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
377{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
378	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
379___
380push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
381
382$code.=<<___;	# (p19)
383{ .mmi;	cmp.ltu	p6,p0=$inp,$len
384	add	$inp=32,$inp
385	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
386{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
387	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
388	add	$Xip=9,$Xip		};;	//	&Xi.lo
389{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
390(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
391(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
392{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
393(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
394{ .mmi;	st8	[$Xip]=$Zlo,-8
395(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
396	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
397{ .mmi;
398(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
399	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
400(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
401{ .mib;
402(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
403(p6)	br.cond.dptk.many	.LOOP	};;
404
405{ .mib;	st8	[$Xip]=$Zhi		};;
406{ .mib;	$rum	1<<1				// return to little-endian
407	.restore	sp
408	mov	sp=prevsp
409	br.ret.sptk.many	b0	};;
410.endp	gcm_ghash_4bit#
411___
412$code.=<<___;
413.align	128
414.type	rem_4bit#,\@object
415rem_4bit:
416        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
417        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
418        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
419        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
420.size	rem_4bit#,128
421.type	rem_8bit#,\@object
422rem_8bit:
423	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
424	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
425	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
426	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
427	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
428	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
429	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
430	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
431	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
432	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
433	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
434	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
435	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
436	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
437	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
438	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
439	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
440	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
441	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
442	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
443	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
444	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
445	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
446	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
447	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
448	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
449	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
450	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
451	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
452	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
453	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
454	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
455.size	rem_8bit#,512
456stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
457___
458
# On big-endian builds every "mux1 ...,@rev" is replaced with
# "nop.i 0x0", preserving the whitespace that followed the mnemonic.
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
# Replace each `...` span in the templates with the value of the Perl
# expression it contains.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code or die "error writing output: $!";
# Buffered write errors may only surface at close; ignoring them could
# let a truncated assembly file pass for a complete one.
close STDOUT or die "error closing STDOUT: $!";
464