#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because the
# setup is always virtualized, with a possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).

# Command line: <flavour> [<output>].  Both values are passed on verbatim
# to ppc-xlate.pl, which this script pipes its generated text through.
$flavour=shift;
$output =shift;

# Word-size dependent mnemonics and ABI constants.  The flavour string must
# contain either "64" or "32"; anything else is rejected.
# ($LRSAVE, $POP and $PUSH are set for completeness; nothing in this file
# references them.)
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
	$UCMP="cmpld";
	$SHRI="srdi";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
	$UCMP="cmplw";
	$SHRI="srwi";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

# Directory part of this script's path (including the trailing separator).
# Use the match result directly instead of reading $1 afterwards, so that a
# script invoked without a directory component yields an empty prefix rather
# than an undefined/stale capture.
$dir=($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on goes through the translator.
# Low-precedence "or" is required: with "||" the die was attached to the
# (always true) command string and a failed open went unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Register allocation shared by all routines generated below.  The names are
# interpolated into the assembly heredocs, so they must stay in file scope.

# General-purpose registers r3..r6 carry the routine arguments.
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

# v0..v3:   product accumulators (low/middle/high) and the current input.
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
# v4..v12:  zero, scratch, reduction constant, H and its halves, and the
#           byte-swap mask used on the "le?" (little-endian only) lines.
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
# v13..v19: second accumulator set, second input and H^2 with its halves.
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
# r12 keeps the caller's VRSAVE (SPR 256) while a routine runs.
my $vrsave="r12";

# gcm_init_p8: reads the hash key H from the address in r4 and writes the
# pre-computed table to the address in r3.  Table layout (byte offsets):
#
#	0x00		reduction constant (0xc2...)
#	0x10,0x20,0x30	twisted H:  low half, full value, high half
#	0x40,0x50,0x60	H^2:        low half, full value, high half
#	0x70,0x80,0x90	H^3:        low half, full value, high half
#	0xa0,0xb0,0xc0	H^4:        low half, full value, high half
#
# The caller's VRSAVE is parked in $vrsave and restored before return.
# This first heredoc computes and stores H and H^2; the scoped heredoc that
# follows computes H^3 and H^4 in two interleaved instruction streams
# (the second stream is indented by one extra space).
$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
.align	5
.gcm_init_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90
___
{
# $Hl/$H/$Hh have already been stored to the table above, so their registers
# are reused as scratch for the second (H^4) stream, and later hold H^3.
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vsldoi		$t4,$Xm1,$zero,8
	 vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	 vxor		$Xl1,$Xl1,$t4
	 vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	 vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	 vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	 vpmsumd	$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	 vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	 vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	 vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	 vsldoi		$H2l,$zero,$H2,8
	 vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	 stvx_u		$H2l,r8,r3		# save H^4
	 stvx_u		$H2,r9,r3
	 stvx_u		$H2h,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8
___
}
# gcm_gmult_p8: arguments $Xip (r3) and $Htbl (r4).  Loads Xi and the H
# entries of the pre-computed table, multiplies Xi by H with vpmsumd, runs
# the two-phase reduction and stores the result back to Xi.
#
# gcm_ghash_p8: arguments $Xip (r3), $Htbl (r4), $inp (r5), $len (r6).
# $len is a byte count: it is compared with 64 and consumed in steps of 16.
# Inputs of 64 bytes or more branch to Lgcm_ghash_p8_4x, which is generated
# by the next heredoc and finishes the routine itself.  Shorter inputs are
# handled here: Loop_2x folds two blocks per iteration using H^2 and H,
# Lshort handles a single remaining block with H alone, and Leven writes Xi.
# r8/r9/r10 are advanced to 0x40/0x50/0x60 in the prologue; both the 2x path
# and the 4x entry rely on those values to address H^2.
#
# Lines prefixed with "le?" / "be?" are endian-specific; the post-processing
# loop at the end of this file either strips the prefix or comments the
# line out, depending on the flavour.  On little-endian builds they
# byte-reverse Xi and the input blocks through $lemask.
$code.=<<___;
.globl	.gcm_gmult_p8
.align	5
.gcm_gmult_p8:
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
.align	5
.gcm_ghash_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	li		r8,0x40
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	li		r9,0x50
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	li		r10,0x60
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	${UCMP}i	$len,64
	bge		Lgcm_ghash_p8_4x

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subic.		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	beq		Lshort

	lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,16
	lvx_u		$H2, r9,$Htbl
	add		r9,$inp,$len		# end of input
	lvx_u		$H2h,r10,$Htbl
	be?b		Loop_2x

.align	5
Loop_2x:
	lvx_u		$IN1,0,$inp
	le?vperm	$IN1,$IN1,$IN1,$lemask

	 subic		$len,$len,32
	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
	 vpmsumd	$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
	 subfe		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
	 vpmsumd	$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
	 vpmsumd	$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
	 add		$inp,$inp,r0

	vxor		$Xl,$Xl,$Xl1
	vxor		$Xm,$Xm,$Xm1

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vxor		$Xh,$Xh,$Xh1
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,r8,$inp
	 addi		$inp,$inp,32

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	$UCMP		r9,$inp
	bgt		Loop_2x			# done yet?

	cmplwi		$len,0
	bne		Leven

Lshort:
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh

Leven:
	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
___
# 4x aggregated path of gcm_ghash_p8, entered by branch when len >= 64.
#
# State inherited from the gcm_ghash_p8 prologue: Xi in $Xl, H in
# $Hl/$H/$Hh, the reduction constant in $xC2, $zero cleared, $lemask set up
# on little-endian builds, caller's VRSAVE in $vrsave, and r8/r9/r10 equal
# to 0x40/0x50/0x60.  The first lvsl and the H^2 load below depend on those
# r8/r9 values.
#
# The path allocates a $FRAME-byte stack frame, saves v20-v31 in it, stores
# the caller's VRSAVE at $FRAME-4 and restores everything at Ldone_4x.
#
# $len is converted to a count of 16-byte blocks.  "subic. $len,$len,8"
# accounts for the four blocks just loaded and tests whether four more are
# available; Loop_4x then consumes four blocks per pass.  After Ltail_4x,
# "addic. $len,$len,4" yields the number of left-over blocks (0..3).
# Lone/Ltwo/Lthree prepare the matching power of H in the H^4 registers and
# re-enter Ltail_4x with $len preset to -4, so the second pass falls through
# to Ldone_4x.
{
# v20..v31 are callee-saved; they hold the extra accumulators, inputs and
# H^3/H^4 needed for four blocks in flight.
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN;
# Register reuse: $Hl/$Hh are overwritten with the merged H^2:H halves, and
# the H^2 half registers serve as the vperm control vectors.
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);

$code.=<<___;
.align	5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
	$STU		$sp,-$FRAME($sp)
	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	stvx		v20,r10,$sp
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	li		r10,0x60
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
	mtspr		256,r0			# preserve all AltiVec registers

	lvsl		$t0,0,r8		# 0x0001..0e0f
	#lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,0x70
	lvx_u		$H2, r9,$Htbl
	li		r9,0x80
	vspltisb	$t1,8			# 0x0808..0808
	#lvx_u		$H2h,r10,$Htbl
	li		r10,0x90
	lvx_u		$H3l,r8,$Htbl		# load H^3
	li		r8,0xa0
	lvx_u		$H3, r9,$Htbl
	li		r9,0xb0
	lvx_u		$H3h,r10,$Htbl
	li		r10,0xc0
	lvx_u		$H4l,r8,$Htbl		# load H^4
	li		r8,0x10
	lvx_u		$H4, r9,$Htbl
	li		r9,0x20
	lvx_u		$H4h,r10,$Htbl
	li		r10,0x30

	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f

	$SHRI		$len,$len,4		# this allows to use sign bit
						# as carry
	lvx_u		$IN0,0,$inp		# load input
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,8
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask

	vxor		$Xh,$IN0,$Xl

	 vpmsumd	$Xl1,$IN1,$H3l
	 vpmsumd	$Xm1,$IN1,$H3
	 vpmsumd	$Xh1,$IN1,$H3h

	 vperm		$H21l,$H2,$H,$hiperm
	 vperm		$t0,$IN2,$IN3,$loperm
	 vperm		$H21h,$H2,$H,$loperm
	 vperm		$t1,$IN2,$IN3,$hiperm
	 vpmsumd	$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	 vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
	 vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	 vxor		$Xm2,$Xm2,$Xm1
	 vxor		$Xl3,$Xl3,$Xl1
	 vxor		$Xm3,$Xm3,$Xm2
	 vxor		$Xh3,$Xh3,$Xh1

	blt		Ltail_4x

Loop_4x:
	lvx_u		$IN0,0,$inp
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,4
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
	 vpmsumd	$Xl1,$IN1,$H3l
	 vpmsumd	$Xm1,$IN1,$H3
	 vpmsumd	$Xh1,$IN1,$H3h

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3
	vxor		$Xh,$Xh,$Xh3
	 vperm		$t0,$IN2,$IN3,$loperm
	 vperm		$t1,$IN2,$IN3,$hiperm

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	 vpmsumd	$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
	 vpmsumd	$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	 vpmsumd	$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
	vpmsumd		$Xl,$Xl,$xC2

	 vxor		$Xl3,$Xl3,$Xl1
	 vxor		$Xh3,$Xh3,$Xh1
	vxor		$Xh,$Xh,$IN0
	 vxor		$Xm2,$Xm2,$Xm1
	vxor		$Xh,$Xh,$t1
	 vxor		$Xm3,$Xm3,$Xm2
	vxor		$Xh,$Xh,$Xl
	bge		Loop_4x

Ltail_4x:
	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vxor		$Xh,$Xh,$Xh3
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	addic.		$len,$len,4
	beq		Ldone_4x

	lvx_u		$IN0,0,$inp
	${UCMP}i	$len,2
	li		$len,-4
	blt		Lone
	lvx_u		$IN1,r8,$inp
	beq		Ltwo

Lthree:
	lvx_u		$IN2,r9,$inp
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask

	vxor		$Xh,$IN0,$Xl
	vmr		$H4l,$H3l
	vmr		$H4, $H3
	vmr		$H4h,$H3h

	vperm		$t0,$IN1,$IN2,$loperm
	vperm		$t1,$IN1,$IN2,$hiperm
	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	vxor		$Xm3,$Xm3,$Xm2
	b		Ltail_4x

.align	4
Ltwo:
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask

	vxor		$Xh,$IN0,$Xl
	vperm		$t0,$zero,$IN1,$loperm
	vperm		$t1,$zero,$IN1,$hiperm

	vsldoi		$H4l,$zero,$H2,8
	vmr		$H4, $H2
	vsldoi		$H4h,$H2,$zero,8

	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi

	b		Ltail_4x

.align	4
Lone:
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vsldoi		$H4l,$zero,$H,8
	vmr		$H4, $H
	vsldoi		$H4h,$H,$zero,8

	vxor		$Xh,$IN0,$Xl
	vxor		$Xl3,$Xl3,$Xl3
	vxor		$Xm3,$Xm3,$Xm3
	vxor		$Xh3,$Xh3,$Xh3

	b		Ltail_4x

Ldone_4x:
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	mtspr		256,$vrsave
	lvx		v20,r10,$sp
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,0,4,0
	.long		0
___
}
# Close gcm_ghash_p8 and append the identification string.  The .size
# directive is emitted here, after the 4x path, because that path is part
# of gcm_ghash_p8.
# The "\@" keeps Perl from interpolating an array inside the heredoc.
$code.=<<___;
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Post-process the accumulated assembly line by line and print it to STDOUT,
# which at this point is the pipe into ppc-xlate.pl.

# The flavour does not change while emitting, so decide endianness once.
my $little_endian = ($flavour =~ /le$/);

foreach (split("\n",$code)) {
	# `expr` -> value of the Perl expression, e.g. frame offsets built
	# from $SIZE_T and $FRAME.
	s/\`([^\`]*)\`/eval $1/ge;

	# Endian-specific lines: keep the ones for the target endianness by
	# stripping their prefix, turn the prefix of the others into a comment.
	if ($little_endian) {	# little-endian
	    s/le\?//		or
	    s/be\?/#be#/;
	} else {
	    s/le\?/#le#/	or
	    s/be\?//;
	}
	print $_,"\n";
}

# Closing the pipe flushes the output and waits for ppc-xlate.pl; a false
# return means a write error or a non-zero exit of the translator, which
# must not be mistaken for a successful build.
close STDOUT or die "error closing STDOUT: $! (child status $?)"; # enforce flush
