#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).

# Command line: <flavour> <output>.  The flavour string selects the target
# word size; both arguments are handed on to ppc-xlate.pl further down.
$flavour=shift;
$output =shift;

# Word-size dependent parameters.  $SIZE_T feeds the stack-frame arithmetic;
# the remaining values are instruction mnemonics spliced into the assembly
# templates.  Any flavour that mentions neither 64 nor 32 is rejected.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
	$UCMP="cmpld";
	$SHRI="srdi";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
	$UCMP="cmplw";
	$SHRI="srwi";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

# Locate ppc-xlate.pl: first in the directory this script was started from,
# then in ../../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# From here on everything printed to STDOUT is piped through ppc-xlate.pl.
# The low-precedence "or" is deliberate: "||" would bind to the command
# string, which is always true, so a failed open would never reach die.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Register names used by the templates.  r3..r6 carry the four arguments
# (Xi pointer, H table pointer, input pointer, length).
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

# Vector register allocation, v0..v19.
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";	# general register holding the caller's VRSAVE value

# Template for .gcm_init_p8.
#
# The emitted routine loads H through r4, derives the "twisted" form of H
# and its powers, and stores a table through r3 at these byte offsets:
#
#   0x00        reduction constant (0xc2...)
#   0x10..0x30  H    as lo half / full / hi half
#   0x40..0x60  H^2  as lo half / full / hi half
#   0x70..0x90  H^3  as lo half / full / hi half
#   0xa0..0xc0  H^4  as lo half / full / hi half
#
# VRSAVE (SPR 256) is copied to $vrsave on entry, overwritten while the
# routine runs, and restored just before blr.
$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
.align	5
.gcm_init_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90
___
{
# H, H.lo and H.hi have already been written to the table above, so their
# registers double as extra temporaries for the second (H^4) lane before
# being overwritten with H^3 at the end of this section.
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vsldoi		$t4,$Xm1,$zero,8
	 vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	 vxor		$Xl1,$Xl1,$t4
	 vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	 vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	 vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	 vpmsumd	$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	 vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	 vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	 vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	 vsldoi		$H2l,$zero,$H2,8
	 vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	 stvx_u		$H2l,r8,r3		# save H^4
	 stvx_u		$H2,r9,r3
	 stvx_u		$H2h,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8
___
}
# Templates for .gcm_gmult_p8 and the short-input part of .gcm_ghash_p8.
#
# .gcm_gmult_p8: loads Xi through $Xip and the H entries of the table
# through $Htbl, multiplies them with a two-phase reduction, and writes
# the result back through $Xip.
#
# .gcm_ghash_p8: takes Xi, table, input pointer and length in
# $Xip/$Htbl/$inp/$len.  Lengths of 64 and above branch to
# Lgcm_ghash_p8_4x, which is emitted by the section that follows this one.
# Otherwise a single 16-byte block goes through Lshort, and longer inputs
# go through Loop_2x, which consumes two blocks per iteration using H^2
# and H.
#
# Instructions prefixed "le?" / "be?" are endian-conditional markers
# resolved by ppc-xlate.pl (NOTE(review): confirm against ppc-xlate.pl).
$code.=<<___;
.globl	.gcm_gmult_p8
.align	5
.gcm_gmult_p8:
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
.align	5
.gcm_ghash_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	li		r8,0x40
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	li		r9,0x50
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	li		r10,0x60
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	${UCMP}i	$len,64
	bge		Lgcm_ghash_p8_4x

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subic.		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	beq		Lshort

	lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,16
	lvx_u		$H2, r9,$Htbl
	add		r9,$inp,$len		# end of input
	lvx_u		$H2h,r10,$Htbl
	be?b		Loop_2x

.align	5
Loop_2x:
	lvx_u		$IN1,0,$inp
	le?vperm	$IN1,$IN1,$IN1,$lemask

	 subic		$len,$len,32
	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
	 vpmsumd	$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
	 subfe		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
	 vpmsumd	$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
	 vpmsumd	$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
	 add		$inp,$inp,r0

	vxor		$Xl,$Xl,$Xl1
	vxor		$Xm,$Xm,$Xm1

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vxor		$Xh,$Xh,$Xh1
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,r8,$inp
	 addi		$inp,$inp,32

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	$UCMP		r9,$inp
	bgt		Loop_2x			# done yet?

	cmplwi		$len,0
	bne		Leven

Lshort:
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh

Leven:
	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
___
374bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez{
375bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdezmy ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
376bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
377bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdezmy $IN0=$IN;
378bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdezmy ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
379bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
380bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez$code.=<<___;
381bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez.align	5
382bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez.gcm_ghash_p8_4x:
383bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven ValdezLgcm_ghash_p8_4x:
384bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	$STU		$sp,-$FRAME($sp)
385bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r10,`15+6*$SIZE_T`
386bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r11,`31+6*$SIZE_T`
387bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v20,r10,$sp
388bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
389bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v21,r11,$sp
390bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
391bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v22,r10,$sp
392bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
393bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v23,r11,$sp
394bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
395bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v24,r10,$sp
396bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
397bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v25,r11,$sp
398bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
399bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v26,r10,$sp
400bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
401bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v27,r11,$sp
402bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
403bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v28,r10,$sp
404bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
405bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v29,r11,$sp
406bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
407bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v30,r10,$sp
408bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r10,0x60
409bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx		v31,r11,$sp
410bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r0,-1
411bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
412bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	mtspr		256,r0			# preserve all AltiVec registers
413bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
414bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvsl		$t0,0,r8		# 0x0001..0e0f
415bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	#lvx_u		$H2l,r8,$Htbl		# load H^2
416bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r8,0x70
417bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$H2, r9,$Htbl
418bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r9,0x80
419bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vspltisb	$t1,8			# 0x0808..0808
420bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	#lvx_u		$H2h,r10,$Htbl
421bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r10,0x90
422bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$H3l,r8,$Htbl		# load H^3
423bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r8,0xa0
424bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$H3, r9,$Htbl
425bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r9,0xb0
426bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$H3h,r10,$Htbl
427bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r10,0xc0
428bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$H4l,r8,$Htbl		# load H^4
429bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r8,0x10
430bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$H4, r9,$Htbl
431bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r9,0x20
432bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$H4h,r10,$Htbl
433bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r10,0x30
434bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
435bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
436bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
437bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f
438bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
439bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	$SHRI		$len,$len,4		# this allows to use sign bit
440bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez						# as carry
441bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN0,0,$inp		# load input
442bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN1,r8,$inp
443bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	subic.		$len,$len,8
444bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN2,r9,$inp
445bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN3,r10,$inp
446bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		$inp,$inp,0x40
447bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN0,$IN0,$IN0,$lemask
448bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN1,$IN1,$IN1,$lemask
449bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN2,$IN2,$IN2,$lemask
450bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN3,$IN3,$IN3,$lemask
451bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
452bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$IN0,$Xl
453bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
454bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xl1,$IN1,$H3l
455bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xm1,$IN1,$H3
456bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xh1,$IN1,$H3h
457bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
458bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vperm		$H21l,$H2,$H,$hiperm
459bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vperm		$t0,$IN2,$IN3,$loperm
460bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vperm		$H21h,$H2,$H,$loperm
461bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vperm		$t1,$IN2,$IN3,$hiperm
462bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
463bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
464bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
465bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
466bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
467bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vxor		$Xm2,$Xm2,$Xm1
468bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vxor		$Xl3,$Xl3,$Xl1
469bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vxor		$Xm3,$Xm3,$Xm2
470bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vxor		$Xh3,$Xh3,$Xh1
471bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
472bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	blt		Ltail_4x
473bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
474bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven ValdezLoop_4x:
475bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN0,0,$inp
476bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN1,r8,$inp
477bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	subic.		$len,$len,4
478bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN2,r9,$inp
479bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN3,r10,$inp
480bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		$inp,$inp,0x40
481bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN1,$IN1,$IN1,$lemask
482bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN2,$IN2,$IN2,$lemask
483bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN3,$IN3,$IN3,$lemask
484bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN0,$IN0,$IN0,$lemask
485bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
486bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
487bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
488bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
489bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xl1,$IN1,$H3l
490bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xm1,$IN1,$H3
491bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xh1,$IN1,$H3h
492bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
493bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xl,$Xl,$Xl3
494bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xm,$Xm,$Xm3
495bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$Xh,$Xh3
496bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vperm		$t0,$IN2,$IN3,$loperm
497bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vperm		$t1,$IN2,$IN3,$hiperm
498bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
499bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
500bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
501bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
502bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
503bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$t0,$Xm,$zero,8
504bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$t1,$zero,$Xm,8
505bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xl,$Xl,$t0
506bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$Xh,$t1
507bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
508bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$Xl,$Xl,$Xl,8
509bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xl,$Xl,$t2
510bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
511bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
512bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
513bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
514bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xl,$Xl,$xC2
515bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
516bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vxor		$Xl3,$Xl3,$Xl1
517bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vxor		$Xh3,$Xh3,$Xh1
518bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$Xh,$IN0
519bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vxor		$Xm2,$Xm2,$Xm1
520bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$Xh,$t1
521bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vxor		$Xm3,$Xm3,$Xm2
522bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$Xh,$Xl
523bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	bge		Loop_4x
524bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
525bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven ValdezLtail_4x:
526bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
527bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
528bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
529bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
530bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xl,$Xl,$Xl3
531bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xm,$Xm,$Xm3
532bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
533bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
534bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
535bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$t0,$Xm,$zero,8
536bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$t1,$zero,$Xm,8
537bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	 vxor		$Xh,$Xh,$Xh3
538bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xl,$Xl,$t0
539bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$Xh,$t1
540bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
541bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$Xl,$Xl,$Xl,8
542bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xl,$Xl,$t2
543bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
544bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
545bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xl,$Xl,$xC2
546bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$t1,$t1,$Xh
547bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xl,$Xl,$t1
548bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
549bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addic.		$len,$len,4
550bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	beq		Ldone_4x
551bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
552bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN0,0,$inp
553bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	${UCMP}i	$len,2
554bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		$len,-4
555bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	blt		Lone
556bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN1,r8,$inp
557bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	beq		Ltwo
558bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
559bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven ValdezLthree:
560bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx_u		$IN2,r9,$inp
561bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN0,$IN0,$IN0,$lemask
562bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN1,$IN1,$IN1,$lemask
563bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN2,$IN2,$IN2,$lemask
564bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
565bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$IN0,$Xl
566bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vmr		$H4l,$H3l
567bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vmr		$H4, $H3
568bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vmr		$H4h,$H3h
569bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
570bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vperm		$t0,$IN1,$IN2,$loperm
571bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vperm		$t1,$IN1,$IN2,$hiperm
572bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
573bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
574bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
575bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
576bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
577bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xm3,$Xm3,$Xm2
578bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	b		Ltail_4x
579bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
580bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez.align	4
581bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven ValdezLtwo:
582bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN0,$IN0,$IN0,$lemask
583bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN1,$IN1,$IN1,$lemask
584bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
585bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$IN0,$Xl
586bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vperm		$t0,$zero,$IN1,$loperm
587bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vperm		$t1,$zero,$IN1,$hiperm
588bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
589bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$H4l,$zero,$H2,8
590bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vmr		$H4, $H2
591bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$H4h,$H2,$zero,8
592bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
593bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
594bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+2.hi
595bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi
596bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
597bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	b		Ltail_4x
598bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
599bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez.align	4
600bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven ValdezLone:
601bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$IN0,$IN0,$IN0,$lemask
602bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
603bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$H4l,$zero,$H,8
604bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vmr		$H4, $H
605bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vsldoi		$H4h,$H,$zero,8
606bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
607bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh,$IN0,$Xl
608bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xl3,$Xl3,$Xl3
609bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xm3,$Xm3,$Xm3
610bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	vxor		$Xh3,$Xh3,$Xh3
611bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
612bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	b		Ltail_4x
613bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
614bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven ValdezLdone_4x:
615bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	le?vperm	$Xl,$Xl,$Xl,$lemask
616bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	stvx_u		$Xl,0,$Xip		# write out Xi
617bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
618bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r10,`15+6*$SIZE_T`
619bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	li		r11,`31+6*$SIZE_T`
620bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	mtspr		256,$vrsave
621bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v20,r10,$sp
622bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
623bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v21,r11,$sp
624bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
625bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v22,r10,$sp
626bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
627bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v23,r11,$sp
628bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
629bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v24,r10,$sp
630bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
631bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v25,r11,$sp
632bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
633bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v26,r10,$sp
634bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
635bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v27,r11,$sp
636bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
637bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v28,r10,$sp
638bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r10,r10,32
639bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v29,r11,$sp
640bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		r11,r11,32
641bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v30,r10,$sp
642bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	lvx		v31,r11,$sp
643bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	addi		$sp,$sp,$FRAME
644bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	blr
645bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	.long		0
646bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	.byte		0,12,0x04,0,0x80,0,4,0
647bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez	.long		0
648bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez___
649bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez}
# Epilogue of the generated assembly, appended to $code line by line:
#   - .size sets the size of .gcm_ghash_p8 to the distance from its label to
#     the current location (".-.gcm_ghash_p8");
#   - .asciz embeds the module identification string;
#   - .align follows the string.
# The trailing empty element makes join() terminate the last line with "\n".
$code .= join("\n",
	".size\t.gcm_ghash_p8,.-.gcm_ghash_p8",
	"",
	'.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro@openssl.org>"',
	".align  2",
	"",
);
656bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
# Final pass over the generated assembly text: expand `...` expressions,
# resolve the le?/be? endianness markers for the requested flavour, and
# print every line to STDOUT.
my $little_endian = ($flavour =~ /le$/);	# flavour name ends in "le"
foreach (split("\n",$code)) {
	# Replace each `expr` span with the result of evaluating expr as Perl.
	s/\`([^\`]*)\`/eval $1/ge;

	# Only the first marker on a line is rewritten, and a be? marker is
	# looked at only when the line carried no le? marker.
	if ($little_endian) {
	    s/be\?/#be#/ unless s/le\?//;	# strip le?, else tag be? as #be#
	} else {
	    s/be\?// unless s/le\?/#le#/;	# tag le? as #le#, else strip be?
	}
	print $_,"\n";
}
669bb1ceac29bc7a18b94e3da78057dc41aa7071784Steven Valdez
# Closing STDOUT forces the buffered output to be flushed. Write errors (and,
# when STDOUT is a pipe, a failing reader process) are only reported by
# close(), so its result must be checked: otherwise a truncated assembly file
# would be produced while the script still exits with status 0.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
671