1392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#!/usr/bin/env perl
2392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
3392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom###################################################################
4392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom### AES-128 [originally in CTR mode]				###
5392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom### bitsliced implementation for Intel Core 2 processors	###
6392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom### requires support of SSE extensions up to SSSE3		###
7392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom### Author: Emilia Käsper and Peter Schwabe			###
8392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom### Date: 2009-03-19						###
9392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom### Public domain						###
10392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom###								###
11392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
12392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom### further information.					###
13392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom###################################################################
14392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
15392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# September 2011.
16392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
17392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Started as transliteration to "perlasm" the original code has
18392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# undergone following changes:
19392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
20392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - code was made position-independent;
21392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - rounds were folded into a loop resulting in >5x size reduction
22392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   from 12.5KB to 2.2KB;
# - above was possible thanks to mixcolumns() modification that
24392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   allowed to feed its output back to aesenc[last], this was
25392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   achieved at cost of two additional inter-registers moves;
26392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - some instruction reordering and interleaving;
27392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - this module doesn't implement key setup subroutine, instead it
28392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   relies on conversion of "conventional" key schedule as returned
29392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   by AES_set_encrypt_key (see discussion below);
30392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - first and last round keys are treated differently, which allowed
31392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   to skip one shiftrows(), reduce bit-sliced key schedule and
32392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#   speed-up conversion by 22%;
33392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# - support for 192- and 256-bit keys was added;
34392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
35392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Resulting performance in CPU cycles spent to encrypt one byte out
36392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# of 4096-byte buffer with 128-bit key is:
37392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
38392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#		Emilia's	this(*)		difference
39392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
40392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Core 2    	9.30		8.69		+7%
41392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Nehalem(**) 	7.63		6.98		+9%
42392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Atom	    	17.1		17.4		-2%(***)
43392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
44392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# (*)	Comparison is not completely fair, because "this" is ECB,
45392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	i.e. no extra processing such as counter values calculation
46392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	and xor-ing input as in Emilia's CTR implementation is
47392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	performed. However, the CTR calculations stand for not more
48392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	than 1% of total time, so comparison is *rather* fair.
49392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
50392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# (**)	Results were collected on Westmere, which is considered to
51392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	be equivalent to Nehalem for this code.
52392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
53392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# (***)	Slowdown on Atom is rather strange per se, because original
54392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	implementation has a number of 9+-bytes instructions, which
55392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	are bad for Atom front-end, and which I eliminated completely.
56392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	In attempt to address deterioration sbox() was tested in FP
57392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	SIMD "domain" (movaps instead of movdqa, xorps instead of
58392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	pxor, etc.). While it resulted in nominal 4% improvement on
#	Atom, it hurt Westmere by more than 2x factor.
60392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
61392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# As for key schedule conversion subroutine. Interface to OpenSSL
62392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# relies on per-invocation on-the-fly conversion. This naturally
63392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# has impact on performance, especially for short inputs. Conversion
64392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# function is:
66392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
67392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 		conversion	conversion/8x block
68a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom# Core 2	240		0.22
69a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom# Nehalem	180		0.20
70a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom# Atom		430		0.19
71392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
72392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# The ratio values mean that 128-byte blocks will be processed
73a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# etc. Then keep in mind that input sizes not divisible by 128 are
75392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# *effectively* slower, especially shortest ones, e.g. consecutive
76392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 144-byte blocks are processed 44% slower than one would expect,
77392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# it's still faster than ["hyper-threading-safe" code path in]
79392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# aes-x86_64.pl on all lengths above 64 bytes...
80392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
81392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# October 2011.
82392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
83392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Add decryption procedure. Performance in CPU cycles spent to decrypt
84392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# one byte out of 4096-byte buffer with 128-bit key is:
85392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
86392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Core 2	11.0
87392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Nehalem	9.16
88a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom# Atom		20.9
89392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
90392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# November 2011.
91392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
92392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# suboptimal, but XTS is meant to be used with larger blocks...
94392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
95392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#						<appro@openssl.org>
96392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Command line: [flavour] [output].  Both are handed on to the
# x86_64-xlate.pl translator, which receives everything this script
# prints on STDOUT.
$flavour = shift;
$output  = shift;
# A first argument containing a dot is taken to be the output file name,
# in which case no flavour was given.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected by an nasm/masm/mingw64 flavour or an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all generated code through the translator.  The interpreter,
# translator and output paths are quoted so that directories containing
# spaces survive the shell, and a failure to start the pipe is fatal
# instead of silently producing no output.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";

# Default register names for the public entry points.  Only four
# registers are listed, so $ivp starts out undefined here.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
115392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($key,$rounds,$const)=("%rax","%r10d","%r11");
116392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
sub Sbox {
# Chains InBasisChange, Inv_GF256 and OutBasisChange over eight
# bit-plane registers.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
#
# Arguments 0..7 are the bit planes; arguments 8..15 are the scratch
# registers (t0-t3 followed by s0-s3) forwarded to Inv_GF256.
    my @plane   = @_[0..7];
    my @scratch = @_[8..15];

    InBasisChange(@plane);
    # Inv_GF256 receives the planes in the order InBasisChange leaves them.
    Inv_GF256(@plane[6,5,0,3,7,1,4,2], @scratch);
    OutBasisChange(@plane[7,1,4,2,6,5,0,3]);
}
127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
sub InBasisChange {
# Appends to $code a fixed sequence of thirteen pxor instructions over
# the eight registers passed as arguments.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
    my ($b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7) = @_;

    $code .= <<___;
	pxor	$b6, $b5
	pxor	$b1, $b2
	pxor	$b0, $b3
	pxor	$b2, $b6
	pxor 	$b0, $b5

	pxor	$b3, $b6
	pxor	$b7, $b3
	pxor	$b5, $b7
	pxor	$b4, $b3
	pxor	$b5, $b4
	pxor	$b1, $b3

	pxor	$b7, $b2
	pxor	$b5, $b1
___
}
150392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
sub OutBasisChange {
# Appends to $code a fixed sequence of eleven pxor instructions over
# the eight registers passed as arguments.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
    my @reg = @_[0..7];

    # Each entry is [source index, destination index]; the groups are
    # separated by one empty line in the emitted text.
    my @groups = (
        [ [6,0], [4,1], [0,2], [6,4], [1,6] ],
        [ [5,1], [3,5], [7,3], [5,7], [5,2] ],
        [ [7,4] ],
    );

    my @chunks;
    for my $group (@groups) {
        my $text = '';
        for my $pair (@$group) {
            my ($src, $dst) = @$pair;
            $text .= "\tpxor\t$reg[$src], $reg[$dst]\n";
        }
        push @chunks, $text;
    }

    $code .= join("\n", @chunks);
}
171392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
sub InvSbox {
# Chains InvInBasisChange, Inv_GF256 and InvOutBasisChange over eight
# bit-plane registers.
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
#
# Arguments 0..7 are the bit planes; arguments 8..15 are the scratch
# registers (t0-t3 followed by s0-s3) forwarded to Inv_GF256.
    my @plane   = @_[0..7];
    my @scratch = @_[8..15];

    InvInBasisChange(@plane);
    Inv_GF256(@plane[5,1,2,6,3,7,0,4], @scratch);
    InvOutBasisChange(@plane[3,7,0,4,5,1,2,6]);
}
182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
sub InvInBasisChange {		# OutBasisChange in reverse
# Appends to $code the pxor sequence of OutBasisChange in reverse order.
# The eight register arguments are first permuted with
# [5,1,2,6,3,7,0,4], so @b[N] below refers to the permuted position.
my @b=@_[5,1,2,6,3,7,0,4];
# The here-document statement is terminated explicitly, like in every
# sibling subroutine, so appending another statement stays safe.
$code.=<<___;
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}
201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
sub InvOutBasisChange {		# InBasisChange in reverse
# Appends to $code the pxor sequence of InBasisChange in reverse order.
# The eight register arguments are permuted with [2,5,7,3,6,1,0,4]
# while unpacking, so $bN below names the permuted position N.
    my ($b0, $b1, $b2, $b3, $b4, $b5, $b6, $b7) = @_[2,5,7,3,6,1,0,4];

    $code .= <<___;
	pxor	$b5, $b1
	pxor	$b7, $b2

	pxor	$b1, $b3
	pxor	$b5, $b4
	pxor	$b5, $b7
	pxor	$b4, $b3
	 pxor 	$b0, $b5
	pxor	$b7, $b3
	 pxor	$b2, $b6
	 pxor	$b1, $b2
	pxor	$b3, $b6

	pxor	$b0, $b3
	pxor	$b6, $b5
___
}
222392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
# Appends eight instructions to $code.  Only $x0, $x1 and the
# temporary $t0 are written; $y0 and $y1 are read only.
    my ($x0, $x1, $y0, $y1, $t0) = @_;

    my @insn = (
        "\tmovdqa\t$y0, $t0",
        "\tpxor \t$y1, $t0",
        "\tpand\t$x0, $t0",
        "\tpxor\t$x1, $x0",
        "\tpand\t$y0, $x1",
        "\tpand\t$y1, $x0",
        "\tpxor\t$x1, $x0",
        "\tpxor\t$t0, $x1",
    );

    $code .= join("\n", @insn) . "\n";
}
239392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
# Same operands as Mul_GF4 (x0-x1, y0-y1, temporary t0); the emitted
# sequence differs from Mul_GF4 only in its last two pxor instructions.
    my ($x0, $x1, $y0, $y1, $t0) = @_;

    # [mnemonic, source, destination] in emission order.
    my @steps = (
        [ 'movdqa', $y0, $t0 ],
        [ 'pxor',   $y1, $t0 ],
        [ 'pand',   $x0, $t0 ],
        [ 'pxor',   $x1, $x0 ],
        [ 'pand',   $y0, $x1 ],
        [ 'pand',   $y1, $x0 ],
        [ 'pxor',   $x0, $x1 ],
        [ 'pxor',   $t0, $x0 ],
    );

    $code .= join('', map { sprintf("\t%s\t%s, %s\n", @$_) } @steps);
}
254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
# The first five arguments (x0,x1,y0,y1,t0) feed the Mul_GF4_N
# sequence, the last five (x2,x3,y2,y3,t1) the Mul_GF4 sequence.
# The two sequences are emitted alternately, one instruction each;
# the second one carries an extra leading space in the output.
    my ($x0, $x1, $y0, $y1, $t0,
        $x2, $x3, $y2, $y3, $t1) = @_;

    my @scaled = (			# Mul_GF4_N on x0,x1 / y0,y1
        "\tmovdqa\t$y0, $t0",
        "\tpxor\t$y1, $t0",
        "\tpand\t$x0, $t0",
        "\tpxor\t$x1, $x0",
        "\tpand\t$y0, $x1",
        "\tpand\t$y1, $x0",
        "\tpxor\t$x0, $x1",
        "\tpxor\t$t0, $x0",
    );
    my @plain = (			# Mul_GF4 on x2,x3 / y2,y3
        "\t movdqa\t$y2, $t1",
        "\t pxor \t$y3, $t1",
        "\t pand\t$x2, $t1",
        "\t pxor\t$x3, $x2",
        "\t pand\t$y2, $x3",
        "\t pand\t$y3, $x2",
        "\t pxor\t$x3, $x2",
        "\t pxor\t$t1, $x3",
    );

    $code .= join('', map { "$scaled[$_]\n$plain[$_]\n" } 0 .. $#scaled);
}
sub Mul_GF16_2 {
# Arguments: 0..7 operand registers (two groups of four, x0-x3 and
# x4-x7), 8..11 the shared second operand y0-y3, 12..15 temporaries.
# Each group is combined with y through one Mul_GF4 and one
# Mul_GF4_N_GF4 call plus the surrounding xor glue.
# NOTE(review): the name suggests two GF(16) multiplications by the
# same y; that reading is inferred from the name and call pattern.
# y0/y1 are xored with y2/y3 for the first group and xored back before
# the final Mul_GF4, so y holds its incoming value on exit.
    my @op  = @_[0..7];
    my @mul = @_[8..11];
    my @tmp = @_[12..15];

    # --- first group: op[0..3] ---
    $code .= <<___;
	movdqa	$op[0], $tmp[0]
	movdqa	$op[1], $tmp[1]
___
    Mul_GF4($op[0], $op[1], $mul[0], $mul[1], $tmp[2]);
    $code .= <<___;
	pxor	$op[2], $tmp[0]
	pxor	$op[3], $tmp[1]
	pxor	$mul[2], $mul[0]
	pxor	$mul[3], $mul[1]
___
    Mul_GF4_N_GF4($tmp[0], $tmp[1], $mul[0], $mul[1], $tmp[3],
                  $op[2],  $op[3],  $mul[2], $mul[3], $tmp[2]);

    # --- fold first group, start second group: op[4..7] ---
    $code .= <<___;
	pxor	$tmp[0], $op[0]
	pxor	$tmp[0], $op[2]
	pxor	$tmp[1], $op[1]
	pxor	$tmp[1], $op[3]

	movdqa	$op[4], $tmp[0]
	movdqa	$op[5], $tmp[1]
	pxor	$op[6], $tmp[0]
	pxor	$op[7], $tmp[1]
___
    Mul_GF4_N_GF4($tmp[0], $tmp[1], $mul[0], $mul[1], $tmp[3],
                  $op[6],  $op[7],  $mul[2], $mul[3], $tmp[2]);

    # Undo the earlier xor so mul[0]/mul[1] are back to their input values.
    $code .= <<___;
	pxor	$mul[2], $mul[0]
	pxor	$mul[3], $mul[1]
___
    Mul_GF4($op[4], $op[5], $mul[0], $mul[1], $tmp[3]);
    $code .= <<___;
	pxor	$tmp[0], $op[4]
	pxor	$tmp[0], $op[6]
	pxor	$tmp[1], $op[5]
	pxor	$tmp[1], $op[7]
___
}
320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromsub Inv_GF256 {
321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#;********************************************************************
322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#;********************************************************************
324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @x=@_[0..7];
325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @t=@_[8..11];
326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @s=@_[12..15];
327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# direct optimizations from hardware
328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[4], @t[3]
330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[5], @t[2]
331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[1], @t[1]
332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[7], @s[1]
333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[0], @s[0]
334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@x[6], @t[3]
336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@x[7], @t[2]
337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@x[3], @t[1]
338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@t[3], @s[2]
339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@x[6], @s[1]
340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@t[2], @t[0]
341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@x[2], @s[0]
342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@t[3], @s[3]
343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	@t[1], @t[2]
345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	@s[0], @t[3]
346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[0], @s[3]
347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@s[0], @s[2]
348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[1], @s[0]
349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@t[1], @t[0]
350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@s[0], @s[3]
351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[3], @s[0]
352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@x[2], @s[0]
353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@s[0], @s[1]
354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[1], @t[3]
355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[1], @t[2]
356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[4], @s[1]
357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[1], @s[0]
358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@x[5], @s[1]
359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@x[0], @s[0]
360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@s[1], @t[1]
361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@s[0], @s[1]
362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	@s[0], @t[1]
363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[1], @t[0]
364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[3], @t[3]
365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[2], @t[2]
366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[3], @t[1]
367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[7], @s[0]
368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[2], @t[0]
369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[6], @s[1]
370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[2], @t[1]
371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[5], @s[2]
372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@x[3], @s[0]
373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[4], @s[3]
374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@x[2], @s[1]
375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@x[1], @s[2]
376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	@x[0], @s[3]
377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[0], @t[3]
378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[1], @t[2]
379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[2], @t[1]
380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[3], @t[0]
381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# new smaller inversion
385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@t[3], @s[0]
387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@t[1], @t[3]
388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[2], @s[0]
389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@t[0], @s[2]
391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@s[0], @s[3]
392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[3], @s[2]
393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@s[2], @s[3]
394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@t[1], @s[1]
396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[2], @s[3]
397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[0], @s[1]
398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[2], @t[3]
400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@t[3], @s[1]
402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@s[2], @t[2]
404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[0], @s[1]
405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[1], @t[2]
407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[1], @t[1]
408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@t[0], @t[2]
410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[2], @s[2]
412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@t[2], @t[1]
413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	@s[3], @s[2]
415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@s[0], @s[2]
417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# output in s3, s2, s1, t1
419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# AES linear components
429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# ShiftRows(@regs[0..7], $mask)
#
# Appends to $code a sequence that, for each of the eight registers passed
# in, XORs in the 16-byte value found at 0x00,0x10,...,0x70 off $key and
# then applies pshufb with $mask as the shuffle control.  The pxor for one
# register is interleaved with the pshufb of the previous one.  The emitted
# code finishes by advancing $key by 0x80, so a subsequent call consumes the
# next eight 16-byte values.
#
#   @_[0..7]   registers operated on (modified by the emitted code)
#   last arg   register holding the pshufb mask
sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;		# mask is always the final argument
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}
453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# MixColumns(@x[0..7], @t[0..7])
#
# Appends the bitsliced MixColumns step to $code.
#
#   @_[0..7]    state registers; every one of them is overwritten
#   @_[8..15]   scratch registers; every one of them is overwritten
#
# pshufd with 0x93 is annotated below as a rotation by 32 bits and pshufd
# with 0x4E as a rotation by 64 bits.  The two streams (one-space-indented
# and not) are interleaved in the emitted code; the statement order is
# deliberate and must not be rearranged.
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	 pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	 pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	 pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	 pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	 pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	 pxor	@t[6], @x[6]
	 pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	 pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	 pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	 pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	 pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	 pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	 pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	 pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	 pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]

	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	 movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
}
508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# InvMixColumns(@x[0..7], @t[0..7])
#
# Appends the bitsliced InvMixColumns step to $code.  The emitted code is
# organised in four sections, labelled in the assembly comments as the
# multiplications by 0x0e, 0x0b, 0x0d and 0x09.
#
#   @_[0..7]    state registers (overwritten)
#   @_[8..15]   scratch registers (overwritten)
#
# After the 0x0e section the state registers are referred to through @y,
# which is @x reindexed as [7,5,0,2,1,3,4,6].  The last section folds the
# @y values into @t, and the final eight movdqa instructions copy @t[0..7]
# into @XMM[0..7], so the result is always delivered in @XMM[0..7]
# regardless of which registers were passed as @x.
#
# Several scratch registers are temporarily "clobbered" and later
# "restored" by XORing the same value twice (see the inline notes); the
# instruction order therefore must not be rearranged.
sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
# From here on the state registers are addressed in this permuted order.
my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}
662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# aesenc(@b[0..7], @t[0..7])				-- not used
#
# Appends one full round to $code: loads the 16-byte constant at
# 0x30($const) (labelled .LSR) into the first scratch register, then emits
# ShiftRows, Sbox and MixColumns in that order.  MixColumns receives the
# state registers in the order [0,1,4,6,3,7,2,5].
#
#   @_[0..7]    state registers
#   @_[8..15]   scratch registers
sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}
673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# aesenclast(@b[0..7], @t[0..7])			-- not used
#
# Appends a final round (no MixColumns) to $code: loads the 16-byte
# constant at 0x40($const) (labelled .LSRM0) into the first scratch
# register, emits ShiftRows and Sbox, and then XORs the eight 16-byte
# values at 0x00..0x70($key) into the state registers taken in the order
# [0,1,4,6,3,7,2,5].
#
#   @_[0..7]    state registers
#   @_[8..15]   scratch registers
sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___;
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}
693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# swapmove($a, $b, $n, $mask, $t)
#
# Appends to $code the seven-instruction sequence
#
#	t  = b
#	b  = ((b >> n) ^ a) & mask
#	a ^= b
#	b  = (b << n) ^ t
#
# i.e. the bits of $a selected by $mask are exchanged with the bits of $b
# selected by ($mask << $n).  The shifts are psrlq/psllq, so they operate
# on each 64-bit half of the register independently.
#
#   $a, $b   registers whose bits are exchanged
#   $n       shift distance (emitted as an immediate)
#   $mask    register holding the bit mask
#   $t       scratch register (overwritten)
sub swapmove {
# Locals are deliberately not called $a/$b so that Perl's sort variables
# are not shadowed; the emitted text is unaffected.
my ($lhs,$rhs,$n,$mask,$tmp)=@_;
$code.=<<___;
	movdqa	$rhs,$tmp
	psrlq	\$$n,$rhs
	pxor  	$lhs,$rhs
	pand	$mask,$rhs
	pxor	$rhs,$lhs
	psllq	\$$n,$rhs
	pxor	$tmp,$rhs
___
}
# swapmove2x($a0, $b0, $a1, $b1, $n, $mask, $t0, $t1)
#
# Appends two independent swapmove sequences to $code, one on the pair
# ($a0,$b0) with scratch $t0 and one on ($a1,$b1) with scratch $t1, with
# their instructions interleaved.  Both use the same shift distance $n and
# the same mask register $mask.  The instructions of the second pair are
# the ones indented by an extra space in the emitted text.
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	 movdqa	$b1,$t1
	 psrlq	\$$n,$b1
	pxor  	$a0,$b0
	 pxor  	$a1,$b1
	pand	$mask,$b0
	 pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	 pxor	$b1,$a1
	 psllq	\$$n,$b1
	pxor	$t0,$b0
	 pxor	$t1,$b1
___
}
725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# bitslice(@x[0..7], $t0, $t1, $t2, $t3)
#
# Appends to $code three passes of swapmove2x over the eight registers,
# with shift distances 1, 2 and 4.  The registers are processed in the
# reverse of the order in which they were passed.
#
#   @_[0..7]    registers to transform
#   @_[8..11]   scratch registers: $t0/$t1 hold masks, $t2/$t3 are the
#               swapmove2x temporaries
#
# Mask usage: the shift-by-1 pass uses the constant at 0x00($const)
# (.LBS0), the shift-by-2 pass the one at 0x10($const) (.LBS1), and the
# shift-by-4 pass the one at 0x20($const) (.LBS2), which is loaded into
# $t0 once the first pass no longer needs .LBS0.
sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# _bsaes_encrypt8: emitted routine that processes the eight blocks held in
# @XMM[0..7].  It reads the key schedule through $key and the round count
# from $rounds, and uses @XMM[8..15] as scratch.
#
# Prologue: point $const at the constant table (.LBS0), load the round-0
# key, XOR it into all eight blocks and shuffle each block with the
# constant at 0x50($const) (.LM0SR); $key is advanced past the round-0 key.
$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
# Round loop.  The first iteration enters at .Lenc_sbox, skipping the
# ShiftRows code at .Lenc_loop.
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
# Leave the loop once the counter goes negative; otherwise run MixColumns.
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
# The jnz below still tests the flags set by the "dec" above: the code
# emitted by MixColumns and the movdqa consist solely of SSE instructions,
# which leave EFLAGS untouched.  While the counter is non-zero the .LSR
# mask is used; when it reaches zero the .LSRM0 mask is loaded instead for
# the remaining pass through .Lenc_loop.
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	($key), @XMM[8]		# last round key
804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[4]
805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[6]
806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[3]
807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[7]
808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[2]
809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[5]
810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[0]
811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[1]
812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	_bsaes_encrypt8,.-_bsaes_encrypt8
814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	_bsaes_decrypt8,\@abi-omnipotent
816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	64
817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom_bsaes_decrypt8:
818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.LBS0(%rip), $const	# constants table
819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	($key), @XMM[9]		# round 0 key
821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($key), $key
822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[0]	# xor with round0 key
824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[1]
825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[0]
826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[2]
827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[1]
828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[3]
829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[2]
830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[4]
831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[3]
832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[5]
833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[4]
834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[6]
835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[5]
836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[7]
837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[6]
838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[7]
839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&bitslice	(@XMM[0..7, 8..11]);
841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$rounds
843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Ldec_sbox
844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Ldec_loop:
846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&ShiftRows	(@XMM[0..7, 8]);
848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=".Ldec_sbox:\n";
849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&InvSbox	(@XMM[0..7, 8..15]);
850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$rounds
852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jl	.Ldec_done
853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	-0x10($const), @XMM[8]	# .LISR
857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Ldec_loop
858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	-0x20($const), @XMM[8]	# .LISRM0
859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Ldec_loop
860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
861392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Ldec_done:
862392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
863392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
864392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
865392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	($key), @XMM[8]		# last round key
866392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[6]
867392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[4]
868392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[2]
869392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[7]
870392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[3]
871392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[5]
872392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[0]
873392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[1]
874392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
875392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	_bsaes_decrypt8,.-_bsaes_decrypt8
876392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
877392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
878392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
# Register assignment for the key-conversion code in this scope:
#   $out    (%rax)  - where the converted key schedule is written
#   $inp    (%rcx)  - where the source round keys are read from
#   $rounds (%r10d) - round counter
#   $const  (%r11)  - base address of the constants table
879392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
880392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# bitslice_key(x0..x7, bs0, bs1, bs2, t2, t3)
#
# Appends to $code a reduced form of the swapmove bit-slicing network.
#   args 0..7   register names; they are used in reversed order as @x
#   args 8..12  ($bs0,$bs1,$bs2,$t2,$t3) forwarded unchanged to the
#               swapmove/swapmove2x helpers defined elsewhere in this
#               file -- presumably three mask registers and two
#               temporaries; confirm against those helpers.
#
# Each stage runs the swap on only one register pair/quad and then emits
# movdqa copies of the just-processed registers into the remaining ones;
# the calls that a full network would make at that point are kept as
# commented-out lines.
# NOTE(review): this presumably relies on all eight inputs holding the same
# value, which would make the skipped swaps redundant -- confirm with the
# callers before reusing this sub.
# NOTE(review): $t0 and $t1 appear only in the commented-out calls and are
# not declared in this sub; the two that sit inside heredocs are still
# interpolated by Perl into the emitted assembly comment.
881392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromsub bitslice_key {
882392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @x=reverse(@_[0..7]);
883392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
884392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
885392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
886392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
887392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
888392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[0], @x[2]
889392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[1], @x[3]
890392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
891392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
892392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
893392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
894392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
895392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
896392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[0], @x[4]
897392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[2], @x[6]
898392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[1], @x[5]
899392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@x[3], @x[7]
900392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
901392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
902392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
903392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
904392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# _bsaes_key_convert: emits the routine that rewrites a sequence of 16-byte
# round keys into the bit-sliced layout.
#
# Register interface, as shown by the code below and by its callers:
#   in:   $inp    (%rcx)  -> source round keys (read with unaligned loads)
#         $out    (%rax)  -> destination (written with aligned stores)
#         $rounds (%r10d) =  number of rounds
#   out:  $out    -> just past the last bit-sliced round key written
#         %xmm6   =  last round key exactly as loaded (not shuffled,
#                    not bit-sliced, not stored)
#         %xmm7   =  the constant at 0x50 in the .Lmasks table (.L63)
#   %xmm0-%xmm5 hold constants from the .Lmasks table (plus an all-ones
#   vector in %xmm5) and %xmm8-%xmm15 are used as scratch.
#
# Layout produced:
#   - the round 0 key is stored unchanged (16 bytes);
#   - .Lkey_loop then executes rounds-1 times. Each pass shuffles the key
#     with .LM0 and, for each of the eight single-bit masks 0x01..0x80,
#     builds one 16-byte vector with pand + pcmpeqb, i.e. 0x80 bytes per
#     round key. The vectors for masks 0x01, 0x02, 0x20 and 0x40 are
#     inverted ("pnot") before being stored;
#   - the last round key is deliberately not stored (see the commented-out
#     movdqa before ret); callers fix it up with %xmm7 and store it
#     themselves.
# The masks in %xmm0-%xmm3 are shifted left by 4 to obtain the high-nibble
# masks and shifted back before the next iteration.
905392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
906392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	_bsaes_key_convert,\@abi-omnipotent
907392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
908392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom_bsaes_key_convert:
909a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	lea	.Lmasks(%rip), $const
910392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($inp), %xmm7		# load round 0 key
911392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($inp), $inp
912a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	0x00($const), %xmm0	# 0x01...
913a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	0x10($const), %xmm1	# 0x02...
914a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	0x20($const), %xmm2	# 0x04...
915a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	0x30($const), %xmm3	# 0x08...
916a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	0x40($const), %xmm4	# .LM0
917a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pcmpeqd	%xmm5, %xmm5		# .LNOT
918a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom
919a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqu	($inp), %xmm6		# load round 1 key
920392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm7, ($out)		# save round 0 key
921392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($out), $out
922392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$rounds
923392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lkey_loop
924392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
925392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_loop:
926a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pshufb	%xmm4, %xmm6		# .LM0
927a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom
928a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm0,	%xmm8
929a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm1,	%xmm9
930a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom
931a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pand	%xmm6,	%xmm8
932a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pand	%xmm6,	%xmm9
933a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm2,	%xmm10
934a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pcmpeqb	%xmm0,	%xmm8
935a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	psllq	\$4,	%xmm0		# 0x10...
936a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm3,	%xmm11
937a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pcmpeqb	%xmm1,	%xmm9
938a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	psllq	\$4,	%xmm1		# 0x20...
939a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom
940a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pand	%xmm6,	%xmm10
941a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pand	%xmm6,	%xmm11
942a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm0,	%xmm12
943a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pcmpeqb	%xmm2,	%xmm10
944a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	psllq	\$4,	%xmm2		# 0x40...
945a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm1,	%xmm13
946a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pcmpeqb	%xmm3,	%xmm11
947a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	psllq	\$4,	%xmm3		# 0x80...
948a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom
949a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm2,	%xmm14
950a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm3,	%xmm15
951a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	 pxor	%xmm5,	%xmm8		# "pnot"
952a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	 pxor	%xmm5,	%xmm9
953a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom
954a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pand	%xmm6,	%xmm12
955a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pand	%xmm6,	%xmm13
956a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
957a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pcmpeqb	%xmm0,	%xmm12
958a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	psrlq	\$4,	%xmm0		# 0x01...
959a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	 movdqa	%xmm9, 0x10($out)
960a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pcmpeqb	%xmm1,	%xmm13
961a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	psrlq	\$4,	%xmm1		# 0x02...
962a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	 lea	0x10($inp), $inp
963a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom
964a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pand	%xmm6,	%xmm14
965a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pand	%xmm6,	%xmm15
966a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	 movdqa	%xmm10, 0x20($out)
967a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pcmpeqb	%xmm2,	%xmm14
968a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	psrlq	\$4,	%xmm2		# 0x04...
969a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	 movdqa	%xmm11, 0x30($out)
970a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pcmpeqb	%xmm3,	%xmm15
971a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	psrlq	\$4,	%xmm3		# 0x08...
972a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	 movdqu	($inp), %xmm6		# load next round key
973a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom
974a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pxor	%xmm5, %xmm13		# "pnot"
975a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	pxor	%xmm5, %xmm14
976a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm12, 0x40($out)
977a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm13, 0x50($out)
978a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm14, 0x60($out)
979a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	%xmm15, 0x70($out)
980392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out),$out
981392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$rounds
982392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lkey_loop
983392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
984a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	movdqa	0x50($const), %xmm7	# .L63
985392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#movdqa	%xmm6, ($out)		# don't save last round key
986392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
987392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	_bsaes_key_convert,.-_bsaes_key_convert
988392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
989392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
990392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Disabled benchmarking interface. The leading `0 &&` makes the condition
# always false, so none of the text below is ever appended to $code.
# If enabled (non-Win64 only) it would define four global functions:
#   bsaes_enc_key_convert - calls _bsaes_key_convert, XORs the returned
#       last round key (%xmm6) into %xmm7 and stores the result at (%rax).
#   bsaes_dec_key_convert - calls _bsaes_key_convert, stores the last round
#       key (%xmm6) unmodified at (%rax) and XORs %xmm7 into the round 0
#       key at ($out) instead.
#   bsaes_encrypt_128 / bsaes_decrypt_128 - loop over the input 0x80 bytes
#       (eight 16-byte blocks) at a time with a hard-coded round count of
#       10, calling _bsaes_encrypt8 / _bsaes_decrypt8. Results are stored
#       from registers in the permuted order 0,1,4,6,3,7,2,5 (encrypt) and
#       0,1,6,4,2,7,3,5 (decrypt). The loop continues while the remaining
#       $len after subtracting 0x80 is above zero, so there is no handling
#       of a partial final group.
# NOTE(review): $inp, $out, $key and $len are not declared inside this
# block; check which outer variables the heredoc would interpolate before
# re-enabling it.
991392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromif (0 && !$win64) {	# following four functions are unsupported interface
992392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom			# used for benchmarking...
993392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
994392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	bsaes_enc_key_convert
995392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bsaes_enc_key_convert,\@function,2
996392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
997392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombsaes_enc_key_convert:
998392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($inp),%r10d		# pass rounds
999392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$inp,%rcx		# pass key
1000392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$out,%rax		# pass key schedule
1001392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_key_convert
1002392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm6,%xmm7		# fix up last round key
1003392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm7,(%rax)		# save last round key
1004392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
1005392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1006392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1007392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	bsaes_encrypt_128
1008392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bsaes_encrypt_128,\@function,4
1009392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1010392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombsaes_encrypt_128:
1011392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lenc128_loop:
1012392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[0]	# load input
1013392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[1]
1014392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[2]
1015392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[3]
1016392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[4]
1017392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[5]
1018392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[6]
1019392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp), @XMM[7]
1020392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key, %rax		# pass the $key
1021392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp), $inp
1022392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$10,%r10d
1023392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1024392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
1025392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1026392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1027392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1028392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
1029392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
1030392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
1031392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1032392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x60($out)
1033392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[5], 0x70($out)
1034392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out), $out
1035392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80,$len
1036392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ja	.Lenc128_loop
1037392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
1038392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1039392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1040392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	bsaes_dec_key_convert
1041392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bsaes_dec_key_convert,\@function,2
1042392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1043392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombsaes_dec_key_convert:
1044392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($inp),%r10d		# pass rounds
1045392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$inp,%rcx		# pass key
1046392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$out,%rax		# pass key schedule
1047392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_key_convert
1048392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	($out),%xmm7		# fix up round 0 key
1049392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm6,(%rax)		# save last round key
1050392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm7,($out)
1051392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
1052392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1053392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1054392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	bsaes_decrypt_128
1055392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bsaes_decrypt_128,\@function,4
1056392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1057392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombsaes_decrypt_128:
1058392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Ldec128_loop:
1059392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[0]	# load input
1060392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[1]
1061392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[2]
1062392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[3]
1063392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[4]
1064392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[5]
1065392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[6]
1066392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp), @XMM[7]
1067392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key, %rax		# pass the $key
1068392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp), $inp
1069392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$10,%r10d
1070392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1071392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
1072392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1073392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1074392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1075392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
1076392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
1077392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x40($out)
1078392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1079392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x60($out)
1080392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[5], 0x70($out)
1081392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out), $out
1082392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80,$len
1083392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ja	.Ldec128_loop
1084392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
1085392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1086392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1087392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
1088392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
1089392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom######################################################################
1090392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
1091392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# OpenSSL interface
1092392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
# $arg1..$arg6: registers in which the public entry points find their six
# arguments; the set is selected by $win64.
# NOTE(review): in the Win64 list the fifth and sixth entries are %r10 and
# %r11d -- presumably each entry point loads them from the stack itself;
# confirm in the individual prologues.
# $inp/$out/$len/$key: %r12-%r15, the registers the entry points copy their
# arguments into before doing any work.
1093392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1094392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1095392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1096392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1097392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromif ($ecb) {
1098392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1099392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	bsaes_ecb_encrypt_blocks
1100392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1101392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1102392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombsaes_ecb_encrypt_blocks:
1103392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax
1104392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_prologue:
1105392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
1106392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
1107392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
1108392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
1109392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
1110392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
1111392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0x48(%rsp),%rsp
1112392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0xa0(%rsp), %rsp
1115392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6, 0x40(%rsp)
1116392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7, 0x50(%rsp)
1117392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8, 0x60(%rsp)
1118392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9, 0x70(%rsp)
1119392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm10, 0x80(%rsp)
1120392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm11, 0x90(%rsp)
1121392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm12, 0xa0(%rsp)
1122392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm13, 0xb0(%rsp)
1123392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm14, 0xc0(%rsp)
1124392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm15, 0xd0(%rsp)
1125392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_body:
1126392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1128392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp,%rbp		# backup %rsp
1129392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($arg4),%eax		# rounds
1130392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg1,$inp		# backup arguments
1131392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg2,$out
1132392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg3,$len
1133392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg4,$key
1134392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$8,$arg3
1135392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_enc_short
1136392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1137392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%eax,%ebx		# backup rounds
1138392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$7,%rax		# 128 bytes per inner round key
1139392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1140392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	%rax,%rsp
1141392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp,%rax		# pass key schedule
1142392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key,%rcx		# pass key
1143392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%ebx,%r10d		# pass rounds
1144392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_key_convert
1145392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm6,%xmm7		# fix up last round key
1146392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm7,(%rax)		# save last round key
1147392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1148392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$8,$len
1149392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_loop:
1150392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[0]	# load input
1151392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[1]
1152392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[2]
1153392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[3]
1154392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[4]
1155392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[5]
1156392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax		# pass key schedule
1157392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[6]
1158392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%ebx,%r10d		# pass rounds
1159392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp), @XMM[7]
1160392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp), $inp
1161392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1162392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
1163392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1164392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1165392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1166392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
1167392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
1168392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
1169392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1170392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x60($out)
1171392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[5], 0x70($out)
1172392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out), $out
1173392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$8,$len
1174392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lecb_enc_loop
1175392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$8,$len
1177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lecb_enc_done
1178392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[0]	# load input
1180392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax		# pass key schedule
1181392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%ebx,%r10d		# pass rounds
1182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$2,$len
1183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_enc_one
1184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[1]
1185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lecb_enc_two
1186392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[2]
1187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$4,$len
1188392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_enc_three
1189392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[3]
1190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lecb_enc_four
1191392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[4]
1192392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$6,$len
1193392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_enc_five
1194392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[5]
1195392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lecb_enc_six
1196392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[6]
1197392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
1198392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1199392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1200392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
1201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
1202392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
1203392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x60($out)
1205392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_enc_done
1206392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_six:
1208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
1209392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1210392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1211392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
1212392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
1213392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
1214392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1215392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_enc_done
1216392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1217392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_five:
1218392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
1219392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1220392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1221392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
1222392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
1223392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
1224392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_enc_done
1225392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1226392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_four:
1227392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
1228392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1229392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1230392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
1231392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
1232392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_enc_done
1233392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1234392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_three:
1235392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
1236392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1237392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1238392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
1239392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_enc_done
1240392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1241392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_two:
1242392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
1243392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1244392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1245392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_enc_done
1246392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1247392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_one:
1248392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
1249392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1250392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_enc_done
1251392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1252392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_short:
1253392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($inp), $arg1
1254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($out), $arg2
1255392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($key), $arg3
1256392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_encrypt
1257392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($inp), $inp
1258392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($out), $out
1259392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$len
1260392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lecb_enc_short
1261392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1262392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_done:
1263392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsp),%rax
1264392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm0, %xmm0
1265392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_bzero:			# wipe key schedule [if any]
1266392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x00(%rax)
1267392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x10(%rax)
1268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rax), %rax
1269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%rax, %rbp
1270392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_enc_bzero
1271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rbp),%rsp		# restore %rsp
1273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x40(%rbp), %xmm6
1276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x50(%rbp), %xmm7
1277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x60(%rbp), %xmm8
1278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x70(%rbp), %xmm9
1279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x80(%rbp), %xmm10
1280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x90(%rbp), %xmm11
1281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xa0(%rbp), %xmm12
1282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xb0(%rbp), %xmm13
1283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xc0(%rbp), %xmm14
1284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xd0(%rbp), %xmm15
1285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0xa0(%rbp), %rsp
1286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x48(%rsp), %r15
1289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x50(%rsp), %r14
1290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x58(%rsp), %r13
1291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x60(%rsp), %r12
1292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x68(%rsp), %rbx
1293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x70(%rsp), %rax
1294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x78(%rsp), %rsp
1295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax, %rbp
1296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_epilogue:
1297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
1298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
#
# bsaes_ecb_decrypt_blocks(inp, out, blocks, key)
#
# ECB decryption of a whole number of 16-byte blocks.
#
#   arg1  input pointer
#   arg2  output pointer
#   arg3  number of 16-byte blocks (a block count, not a byte count)
#   arg4  key structure; the round count is read from byte offset 240
#
# Eight or more blocks: the key is converted into a bit-sliced schedule
# placed on the stack, blocks are processed eight at a time through
# _bsaes_decrypt8, and a final group of one to seven blocks goes through
# the same routine with only the needed outputs stored.
# Fewer than eight blocks: each block is handed to asm_AES_decrypt.
# Zero blocks: nothing is read or written.
#
# On exit everything between the lowered stack pointer and the frame
# base is overwritten with zeros so no key material is left behind.
#
.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp	# scratch area below the saved registers
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)	# save xmm6-xmm15 for the caller
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	test	$arg3,$arg3		# zero blocks, go straight to cleanup
	jz	.Lecb_dec_done
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len		# bias the count so borrow means fewer than 8 left
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	# blocks are stored from registers 0,1,6,4,2,7,3,5 in that order
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len		# undo the bias, leaves 0..7 remaining blocks
	jz	.Lecb_dec_done

	# Tail of one to seven blocks.  movdqu does not touch the flags, so
	# each je below still tests the cmp that precedes the load.
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:			# one to seven blocks, one call per block
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp		# flags from rbp - rax
	ja	.Lecb_dec_bzero		# keep wiping while rax is below rbp

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6	# restore xmm6-xmm15
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15	# reload the registers pushed in the prologue
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax	# saved %rbp, moved into place below
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
1503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.extern	asm_AES_cbc_encrypt
1505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	bsaes_cbc_encrypt
1506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bsaes_cbc_encrypt,\@abi-omnipotent
1507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombsaes_cbc_encrypt:
1509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	48(%rsp),$arg6		# pull direction flag
1512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0,$arg6
1515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jne	asm_AES_cbc_encrypt
1516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$128,$arg3
1517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	asm_AES_cbc_encrypt
1518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax
1520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_prologue:
1521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
1522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
1523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
1524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
1525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
1526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
1527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0x48(%rsp), %rsp
1528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0xa0(%rsp),$arg5	# pull ivp
1531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0xa0(%rsp), %rsp
1532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6, 0x40(%rsp)
1533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7, 0x50(%rsp)
1534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8, 0x60(%rsp)
1535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9, 0x70(%rsp)
1536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm10, 0x80(%rsp)
1537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm11, 0x90(%rsp)
1538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm12, 0xa0(%rsp)
1539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm13, 0xb0(%rsp)
1540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm14, 0xc0(%rsp)
1541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm15, 0xd0(%rsp)
1542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_body:
1543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rbp		# backup %rsp
1546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($arg4), %eax	# rounds
1547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg1, $inp		# backup arguments
1548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg2, $out
1549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg3, $len
1550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg4, $key
1551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg5, %rbx
1552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$4, $len		# bytes to blocks
1553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%eax, %edx		# rounds
1555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$7, %rax		# 128 bytes per inner round key
1556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	%rax, %rsp
1558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax		# pass key schedule
1560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key, %rcx		# pass key
1561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
1562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_key_convert
1563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	(%rsp),%xmm7		# fix up 0 round key
1564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm6,(%rax)		# save last round key
1565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm7,(%rsp)
1566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	(%rbx), @XMM[15]	# load IV
1568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$8,$len
1569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_loop:
1570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[0]	# load input
1571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[1]
1572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[2]
1573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[3]
1574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[4]
1575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[5]
1576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax		# pass key schedule
1577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[6]
1578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx,%r10d		# pass rounds
1579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp), @XMM[7]
1580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
1583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[8]	# re-load input
1586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[9]
1587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[1]
1588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[10]
1589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[6]
1590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[11]
1591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[10], @XMM[4]
1592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[12]
1593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[11], @XMM[2]
1594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[13]
1595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[12], @XMM[7]
1596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[14]
1597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[13], @XMM[3]
1598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp), @XMM[15]	# IV
1599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[14], @XMM[5]
1600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp), $inp
1602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
1604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
1605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x40($out)
1606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x60($out)
1608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[5], 0x70($out)
1609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out), $out
1610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$8,$len
1611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lcbc_dec_loop
1612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$8,$len
1614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lcbc_dec_done
1615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[0]	# load input
1617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax		# pass key schedule
1618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
1619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$2,$len
1620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcbc_dec_one
1621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[1]
1622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lcbc_dec_two
1623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[2]
1624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$4,$len
1625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcbc_dec_three
1626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[3]
1627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lcbc_dec_four
1628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[4]
1629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$6,$len
1630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcbc_dec_five
1631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[5]
1632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lcbc_dec_six
1633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[6]
1634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
1636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[8]	# re-load input
1638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[9]
1639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[1]
1640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[10]
1641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[6]
1642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[11]
1643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[10], @XMM[4]
1644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[12]
1645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[11], @XMM[2]
1646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[13]
1647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[12], @XMM[7]
1648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[15]	# IV
1649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[13], @XMM[3]
1650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
1653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
1654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x40($out)
1655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x60($out)
1657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_done
1658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_six:
1660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
1662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[8]	# re-load input
1664392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[9]
1665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[1]
1666392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[10]
1667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[6]
1668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[11]
1669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[10], @XMM[4]
1670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[12]
1671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[11], @XMM[2]
1672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[15]	# IV
1673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[12], @XMM[7]
1674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1676392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
1677392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
1678392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x40($out)
1679392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1680392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_done
1681392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1682392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_five:
1683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1684392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
1685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[8]	# re-load input
1687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[9]
1688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[1]
1689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[10]
1690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[6]
1691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[11]
1692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[10], @XMM[4]
1693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[15]	# IV
1694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[11], @XMM[2]
1695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
1698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
1699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x40($out)
1700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_done
1701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_four:
1703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
1705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[8]	# re-load input
1707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[9]
1708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[1]
1709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[10]
1710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[6]
1711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[15]	# IV
1712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[10], @XMM[4]
1713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
1716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
1717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_done
1718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_three:
1720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
1722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[8]	# re-load input
1724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[9]
1725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[1]
1726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[15]	# IV
1727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[6]
1728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
1731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_done
1732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_two:
1734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
1736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[8]	# re-load input
1738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[15]	# IV
1739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[1]
1740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_done
1743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_one:
1745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($inp), $arg1
1746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg2	# buffer output
1747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($key), $arg3
1748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_decrypt		# doesn't touch %xmm
1749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[15], ($out)	# write output
1751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], @XMM[15]	# IV
1752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_done:
1754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[15], (%rbx)	# return IV
1755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsp), %rax
1756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm0, %xmm0
1757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_bzero:			# wipe key schedule [if any]
1758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x00(%rax)
1759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x10(%rax)
1760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rax), %rax
1761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%rax, %rbp
1762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ja	.Lcbc_dec_bzero
1763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rbp),%rsp		# restore %rsp
1765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64-only epilogue fragment (appended only when $win64 is true).
# Reloads %xmm6..%xmm15 from the 16-byte slots at 0x40..0xd0(%rbp) and then
# sets %rsp to %rbp+0xa0, stepping over that save area.  The preceding
# fragment has just done "lea (%rbp),%rsp", so %rbp is the frame base here.
# NOTE(review): the matching save sequence for this routine lies outside this
# fragment; presumably it uses the same 0x40..0xd0 slot layout -- confirm.
1766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x40(%rbp), %xmm6
1768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x50(%rbp), %xmm7
1769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x60(%rbp), %xmm8
1770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x70(%rbp), %xmm9
1771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x80(%rbp), %xmm10
1772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x90(%rbp), %xmm11
1773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xa0(%rbp), %xmm12
1774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xb0(%rbp), %xmm13
1775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xc0(%rbp), %xmm14
1776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xd0(%rbp), %xmm15
1777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0xa0(%rbp), %rsp
1778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Unconditional fragment covering two adjacent pieces of generated assembly:
#
# 1. Tail of bsaes_cbc_encrypt: reload %r15, %r14, %r13, %r12 and %rbx from
#    0x48..0x68(%rsp), fetch the saved %rbp value from 0x70(%rsp) via %rax,
#    release the 0x78-byte frame, then return.  .Lcbc_dec_epilogue labels the
#    final "ret".
#
# 2. Header and general-purpose prologue of bsaes_ctr32_encrypt_blocks: copy
#    the entry %rsp into %rax, push %rbp, %rbx, %r12..%r15 (6 * 8 = 0x30
#    bytes) and reserve a further 0x48 bytes, i.e. 0x78 bytes in total.  That
#    puts %r15 at 0x48(%rsp) and %rbp at 0x70(%rsp) -- the same offsets the
#    epilogue in part 1 reads back.
1779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x48(%rsp), %r15
1781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x50(%rsp), %r14
1782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x58(%rsp), %r13
1783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x60(%rsp), %r12
1784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x68(%rsp), %rbx
1785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x70(%rsp), %rax
1786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x78(%rsp), %rsp
1787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax, %rbp
1788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_epilogue:
1789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
1790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	bsaes_ctr32_encrypt_blocks
1793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombsaes_ctr32_encrypt_blocks:
1796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax
1797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr_enc_prologue:
1798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
1799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
1800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
1801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
1802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
1803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
1804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0x48(%rsp), %rsp
1805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64-only prologue fragment for bsaes_ctr32_encrypt_blocks (appended only
# when $win64 is true).
#
# - Loads the fifth argument (ivp) from the caller's stack into $arg5.  The
#   offset 0xa0 equals the 0x78 bytes already reserved by the general-purpose
#   prologue plus 0x28; presumably that 0x28 is the 8-byte return address plus
#   the 32-byte Win64 home area -- confirm against the Win64 calling
#   convention.
# - Reserves another 0xa0 bytes and stores %xmm6..%xmm15 into the 16-byte
#   slots at 0x40..0xd0(%rsp).
# - Defines .Lctr_enc_body.  The label is emitted for Win64 builds only.
#   NOTE(review): presumably consumed by unwind/SEH data elsewhere in the
#   file -- verify.
1806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0xa0(%rsp),$arg5	# pull ivp
1808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0xa0(%rsp), %rsp
1809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6, 0x40(%rsp)
1810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7, 0x50(%rsp)
1811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8, 0x60(%rsp)
1812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9, 0x70(%rsp)
1813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm10, 0x80(%rsp)
1814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm11, 0x90(%rsp)
1815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm12, 0xa0(%rsp)
1816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm13, 0xb0(%rsp)
1817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm14, 0xc0(%rsp)
1818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm15, 0xd0(%rsp)
1819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr_enc_body:
1820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rbp		# backup %rsp
1823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($arg5), %xmm0		# load counter
1824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($arg4), %eax	# rounds
1825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg1, $inp		# backup arguments
1826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg2, $out
1827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg3, $len
1828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg4, $key
1829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$8, $arg3
1831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lctr_enc_short
1832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%eax, %ebx		# rounds
1834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$7, %rax		# 128 bytes per inner round key
1835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	%rax, %rsp
1837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax		# pass key schedule
1839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key, %rcx		# pass key
1840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%ebx, %r10d		# pass rounds
1841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_key_convert
1842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm6,%xmm7		# fix up last round key
1843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm7,(%rax)		# save last round key
1844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	(%rsp), @XMM[9]		# load round0 key
1846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.LADD1(%rip), %r11
1847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufb	@XMM[8], @XMM[0]
1851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lctr_enc_loop
1853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr_enc_loop:
1855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], @XMM[2]
1858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddd	0x00(%r11), @XMM[1]	# .LADD1
1859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], @XMM[3]
1860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddd	0x10(%r11), @XMM[2]	# .LADD2
1861392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], @XMM[4]
1862392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddd	0x20(%r11), @XMM[3]	# .LADD3
1863392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], @XMM[5]
1864392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddd	0x30(%r11), @XMM[4]	# .LADD4
1865392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], @XMM[6]
1866392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddd	0x40(%r11), @XMM[5]	# .LADD5
1867392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], @XMM[7]
1868392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddd	0x50(%r11), @XMM[6]	# .LADD6
1869392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddd	0x60(%r11), @XMM[7]	# .LADD7
1870392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1871392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
1872392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# to flip byte order in 32-bit counter
1873392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	(%rsp), @XMM[9]		# round 0 key
1874392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10(%rsp), %rax	# pass key schedule
1875392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
1876392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[0]	# xor with round0 key
1877392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[1]
1878392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[0]
1879392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[2]
1880392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[1]
1881392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[3]
1882392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[2]
1883392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[4]
1884392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[3]
1885392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[5]
1886392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[4]
1887392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[6]
1888392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[5]
1889392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[7]
1890392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[6]
1891392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.LBS0(%rip), %r11	# constants table
1892392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb	@XMM[8], @XMM[7]
1893392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%ebx,%r10d		# pass rounds
1894392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1895392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8_bitslice
1896392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1897392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$8,$len
1898392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jc	.Lctr_enc_loop_done
1899392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1900392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[8]	# load input
1901392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[9]
1902392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[10]
1903392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[11]
1904392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[12]
1905392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[13]
1906392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[14]
1907392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp), @XMM[15]
1908392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp),$inp
1909392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[0], @XMM[8]
1910392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x20(%rbp), @XMM[0]	# load counter
1911392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[1]
1912392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[8], 0x00($out)	# write output
1913392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[10], @XMM[4]
1914392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1915392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[11], @XMM[6]
1916392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
1917392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[12], @XMM[3]
1918392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
1919392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[13], @XMM[7]
1920392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
1921392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[14], @XMM[2]
1922392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1923392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[15], @XMM[5]
1924392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x60($out)
1925392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.LADD1(%rip), %r11
1926392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[5], 0x70($out)
1927392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out), $out
1928392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddd	0x70(%r11), @XMM[0]	# .LADD8
1929392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lctr_enc_loop
1930392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1931392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lctr_enc_done
1932392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1933392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr_enc_loop_done:
1934392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$8, $len
1935392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x00($inp), @XMM[8]	# load input
1936392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8], @XMM[0]
1937392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
1938392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$2,$len
1939392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lctr_enc_done
1940392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp), @XMM[9]
1941392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[9], @XMM[1]
1942392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
1943392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lctr_enc_done
1944392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp), @XMM[10]
1945392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[10], @XMM[4]
1946392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
1947392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$4,$len
1948392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lctr_enc_done
1949392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp), @XMM[11]
1950392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[11], @XMM[6]
1951392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
1952392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lctr_enc_done
1953392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp), @XMM[12]
1954392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[12], @XMM[3]
1955392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
1956392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$6,$len
1957392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lctr_enc_done
1958392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp), @XMM[13]
1959392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[13], @XMM[7]
1960392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
1961392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lctr_enc_done
1962392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[14]
1963392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[14], @XMM[2]
1964392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x60($out)
1965392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lctr_enc_done
1966392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1967392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1968392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr_enc_short:
1969392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg1
1970392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x30(%rbp), $arg2
1971392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($key), $arg3
1972392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_encrypt
1973392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($inp), @XMM[1]
1974392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($inp), $inp
1975392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x2c(%rbp), %eax	# load 32-bit counter
1976392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	bswap	%eax
1977392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rbp), @XMM[1]
1978392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	inc	%eax			# increment
1979392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], ($out)
1980392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	bswap	%eax
1981392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($out), $out
1982392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
1983392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$len
1984392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lctr_enc_short
1985392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1986392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr_enc_done:
1987392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsp), %rax
1988392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm0, %xmm0
1989392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr_enc_bzero:			# wipe key schedule [if any]
1990392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x00(%rax)
1991392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x10(%rax)
1992392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rax), %rax
1993392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%rax, %rbp
1994392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ja	.Lctr_enc_bzero
1995392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1996392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rbp),%rsp		# restore %rsp
1997392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64-only part of the bsaes_ctr32_encrypt_blocks epilogue; this text is
# appended to $code only when $win64 is true.  It reloads %xmm6-%xmm15 from
# the ten 16-byte slots at 0x40..0xd0 above %rbp and then points %rsp 0xa0
# bytes above %rbp.
# NOTE(review): the matching save sequence lives in the function prologue,
# which is outside this excerpt -- presumably it mirrors the Win64 prologue
# of bsaes_xts_encrypt (same slots, same 0xa0 adjustment); confirm.
1998392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1999392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x40(%rbp), %xmm6
2000392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x50(%rbp), %xmm7
2001392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x60(%rbp), %xmm8
2002392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x70(%rbp), %xmm9
2003392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x80(%rbp), %xmm10
2004392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x90(%rbp), %xmm11
2005392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xa0(%rbp), %xmm12
2006392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xb0(%rbp), %xmm13
2007392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xc0(%rbp), %xmm14
2008392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xd0(%rbp), %xmm15
2009392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0xa0(%rbp), %rsp
2010392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Common tail of bsaes_ctr32_encrypt_blocks, emitted for every target:
# reload %r15, %r14, %r13, %r12 and %rbx from 0x48..0x68(%rsp), fetch the
# value stored at 0x70(%rsp) (moved into %rbp via %rax once %rsp has been
# advanced), release 0x78 bytes of stack and return.  The .size directive
# closes the bsaes_ctr32_encrypt_blocks symbol.
# NOTE(review): these offsets are consistent with a prologue of six pushes
# (%rbp, %rbx, %r12-%r15) followed by `lea -0x48(%rsp), %rsp`, as used by
# bsaes_xts_encrypt; the ctr32 prologue itself is outside this excerpt --
# confirm.
2011392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2012392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x48(%rsp), %r15
2013392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x50(%rsp), %r14
2014392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x58(%rsp), %r13
2015392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x60(%rsp), %r12
2016392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x68(%rsp), %rbx
2017392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x70(%rsp), %rax
2018392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x78(%rsp), %rsp
2019392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax, %rbp
2020392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr_enc_epilogue:
2021392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2022392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2023392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2024392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom######################################################################
2025392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2026392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	const AES_KEY *key1, const AES_KEY *key2,
2027392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	const unsigned char iv[16]);
2028392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
# Scratch registers used by the XTS tweak update, taken from @XMM[13..15]:
#   $twmask - constant loaded from .Lxts_magic;
#   $twres  - shuffled copy of $twtmp, masked with $twmask ("carry and
#             residue") and XORed into the tweak;
#   $twtmp  - result of `pcmpgtd tweak, 0` ("broadcast upper bits").
2029392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($twmask,$twres,$twtmp)=@XMM[13..15];
# Entry point of bsaes_xts_encrypt: keep the incoming %rsp in %rax, push
# %rbp, %rbx and %r12-%r15, then reserve a further 0x48 bytes of stack.
2030392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2031392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	bsaes_xts_encrypt
2032392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bsaes_xts_encrypt,\@abi-omnipotent
2033392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2034392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombsaes_xts_encrypt:
2035392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax
2036392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_prologue:
2037392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
2038392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
2039392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
2040392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
2041392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
2042392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
2043392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0x48(%rsp), %rsp
2044392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64-only continuation of the bsaes_xts_encrypt prologue (appended only
# when $win64 is true): read key2 and ivp from 0xa0(%rsp) and 0xa8(%rsp)
# into $arg5/$arg6, reserve another 0xa0 bytes of stack and save
# %xmm6-%xmm15 in the 16-byte slots at 0x40..0xd0(%rsp).
# Note that the .Lxts_enc_body label is part of this heredoc, so it is only
# emitted for Win64 builds.
2045392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
2046392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0xa0(%rsp),$arg5	# pull key2
2047392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0xa8(%rsp),$arg6	# pull ivp
2048392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0xa0(%rsp), %rsp
2049392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6, 0x40(%rsp)
2050392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7, 0x50(%rsp)
2051392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8, 0x60(%rsp)
2052392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9, 0x70(%rsp)
2053392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm10, 0x80(%rsp)
2054392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm11, 0x90(%rsp)
2055392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm12, 0xa0(%rsp)
2056392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm13, 0xb0(%rsp)
2057392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm14, 0xc0(%rsp)
2058392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm15, 0xd0(%rsp)
2059392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_body:
2060392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Set-up phase of bsaes_xts_encrypt, in emission order:
#  1. Remember %rsp in %rbp and copy the first four arguments into
#     $inp/$out/$len/$key.
#  2. Call asm_AES_encrypt with ($arg6, 0x20(%rbp), $arg5), i.e. the iv
#     pointer as input, the slot at 0x20(%rbp) as output and key2 as key,
#     to produce the initial tweak.
#  3. Read the round count from 240($key), keep the original $len in %rbx
#     and the round count in %edx.
#  4. Reserve rounds*128 minus `128-32` bytes below %rsp for the bit-sliced
#     key schedule and fill it with _bsaes_key_convert (%rax = schedule,
#     %rcx = key, %r10d = rounds); afterwards %xmm7 ^= %xmm6 is stored at
#     (%rax) as the last round key.
#     NOTE(review): the backquoted `128-32` is presumably evaluated by the
#     script's final output pass, which is outside this excerpt -- confirm.
#  5. Round $len down to a multiple of 16, reserve 0x80 more bytes of stack
#     for tweak[8], load the initial tweak into @XMM[7] and prime
#     $twtmp/$twmask for the first tweak update.
#  6. Subtract 0x80 from $len: on borrow (fewer than 0x80 bytes left) go to
#     .Lxts_enc_short, otherwise enter .Lxts_enc_loop.
2061392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2062392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rbp		# backup %rsp
2063392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg1, $inp		# backup arguments
2064392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg2, $out
2065392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg3, $len
2066392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg4, $key
2067392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2068392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($arg6), $arg1
2069392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg2
2070392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($arg5), $arg3
2071392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_encrypt		# generate initial tweak
2072392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2073392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key), %eax		# rounds
2074392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$len, %rbx		# backup $len
2075392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2076392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%eax, %edx		# rounds
2077392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$7, %rax		# 128 bytes per inner round key
2078392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2079392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	%rax, %rsp
2080392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2081392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax		# pass key schedule
2082392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key, %rcx		# pass key
2083392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2084392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_key_convert
2085392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm6, %xmm7		# fix up last round key
2086392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm7, (%rax)		# save last round key
2087392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2088392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$-16, $len
2089392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80, %rsp		# place for tweak[8]
2090392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2091392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2092392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2093392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lxts_magic(%rip), $twmask
2094392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2095392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2096392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80, $len
2097392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jc	.Lxts_enc_short
2098392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_loop
2099392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2100392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2101392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_loop:
2102392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
    # Unrolled head of .Lxts_enc_loop, generated for $i = 0..6.  Each pass
    # emits code that copies the current tweak (@XMM[7]) into @XMM[$i] and
    # into stack slot 0x10*$i(%rsp), then advances @XMM[7] to the next tweak:
    # paddq doubles it, and the $twmask-ed shuffle of $twtmp is XORed back in.
    # The input loads and the input^tweak XORs are software-pipelined one and
    # two passes behind the tweak generation, respectively.
2103392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    for ($i=0;$i<7;$i++) {
2104392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___;
2105392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13, $twtmp, $twres
2106392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2107392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], @XMM[$i]
2108392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2109392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2110392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask, $twres		# isolate carry and residue
2111392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2112392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres, @XMM[7]
2113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
    # From the second pass on: load input block $i-1 into @XMM[8+$i-1].
2114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___ if ($i>=1);
2115392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2116392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
    # From the third pass on: XOR input block $i-2 into its tweak copy in
    # @XMM[$i-2].
2117392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___ if ($i>=2);
2118392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2119392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2120392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    }
# Remainder of one .Lxts_enc_loop iteration (0x80 bytes in, 0x80 bytes out):
#  - load input blocks 6 and 7, finish the pending input^tweak XORs for
#    blocks 5..7, store tweak[7] at 0x70(%rsp) and advance $inp by 0x80;
#  - call _bsaes_encrypt8 with %rax = 0x80(%rsp) (key schedule) and
#    %r10d = rounds;
#  - XOR the saved tweaks back in and store the results.  The code pairs
#    stack/output slots 0..7 with @XMM[0,1,4,6,3,7,2,5], in that order;
#  - reload tweak[7] from 0x70(%rsp), derive the tweak for the next
#    iteration with the same update sequence as the unrolled head, and
#    loop while subtracting 0x80 from $len does not borrow.
# After the loop, .Lxts_enc_short adds 0x80 back to $len and jumps to
# .Lxts_enc_done when nothing is left.
2121392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2122392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[8+6]
2123392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+5], @XMM[5]
2124392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp), @XMM[8+7]
2125392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp), $inp
2126392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], 0x70(%rsp)
2127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+6], @XMM[6]
2128392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2129392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+7], @XMM[7]
2130392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2131392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2132392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
2133392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2134392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2135392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2136392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2137392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[4]
2138392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2139392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[6]
2140392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
2141392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x40(%rsp), @XMM[3]
2142392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
2143392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x50(%rsp), @XMM[7]
2144392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
2145392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x60(%rsp), @XMM[2]
2146392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
2147392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x70(%rsp), @XMM[5]
2148392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x60($out)
2149392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[5], 0x70($out)
2150392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out), $out
2151392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2152392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2153392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2154392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lxts_magic(%rip), $twmask
2155392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp
2156392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13, $twtmp, $twres
2157392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2158392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2159392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask, $twres		# isolate carry and residue
2160392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2161392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres, @XMM[7]
2162392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2163392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80,$len
2164392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lxts_enc_loop
2165392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2166392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_short:
2167392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$0x80, $len
2168392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lxts_enc_done
2169392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
    # Unrolled head of the .Lxts_enc_short path (1..7 whole blocks left),
    # generated for $i = 0..6.  The tweak generation is the same sequence as
    # in .Lxts_enc_loop: copy the current tweak (@XMM[7]) into @XMM[$i] and
    # into stack slot 0x10*$i(%rsp), then advance @XMM[7] to the next tweak.
    # Unlike the main loop, each pass also checks whether the remaining
    # length has been reached and branches out to a per-count tail.
2170392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    for ($i=0;$i<7;$i++) {
2171392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___;
2172392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13, $twtmp, $twres
2173392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2174392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], @XMM[$i]
2175392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask, $twres		# isolate carry and residue
2178392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres, @XMM[7]
2180392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
    # From the second pass on: load input block $i-1, then jump to
    # .Lxts_enc_$i when $len equals 0x10*$i, i.e. exactly $i blocks remain.
2181392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___ if ($i>=1);
2182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$`0x10*$i`,$len
2184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lxts_enc_$i
2185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
    # From the third pass on (reached only when more than $i blocks remain):
    # XOR input block $i-2 into its tweak copy in @XMM[$i-2].
2186392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___ if ($i>=2);
2187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2188392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2189392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    }
2190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2191392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[8+6]
2192392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+5], @XMM[5]
2193392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], 0x70(%rsp)
2194392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x70($inp), $inp
2195392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+6], @XMM[6]
2196392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2197392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2198392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2199392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
2200392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2202392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2203392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[4]
2205392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2206392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[6]
2207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
2208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x40(%rsp), @XMM[3]
2209392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
2210392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x50(%rsp), @XMM[7]
2211392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
2212392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x60(%rsp), @XMM[2]
2213392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
2214392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x60($out)
2215392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x70($out), $out
2216392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2217392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2218392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
2219392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2220392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_6:
2221392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+4], @XMM[4]
2222392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x60($inp), $inp
2223392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+5], @XMM[5]
2224392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2225392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2226392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2227392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
2228392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2229392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2230392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2231392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2232392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[4]
2233392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2234392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[6]
2235392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
2236392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x40(%rsp), @XMM[3]
2237392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
2238392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x50(%rsp), @XMM[7]
2239392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
2240392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
2241392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x60($out), $out
2242392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2243392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2244392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
2245392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2246392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_5:
2247392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+3], @XMM[3]
2248392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x50($inp), $inp
2249392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+4], @XMM[4]
2250392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2251392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2252392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2253392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
2254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2255392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2256392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2257392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2258392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[4]
2259392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2260392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[6]
2261392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
2262392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x40(%rsp), @XMM[3]
2263392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
2264392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x40($out)
2265392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x50($out), $out
2266392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2267392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
2269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2270392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_4:
2271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+2], @XMM[2]
2272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x40($inp), $inp
2273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+3], @XMM[3]
2274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
2278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[4]
2283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[6]
2285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
2286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x30($out)
2287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x40($out), $out
2288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
2291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_3:
2293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+1], @XMM[1]
2294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x30($inp), $inp
2295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+2], @XMM[2]
2296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
2300392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2301392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2302392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2304392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[4]
2305392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2306392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x20($out)
2307392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x30($out), $out
2308392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2309392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2310392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
2311392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2312392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_2:
2313392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+0], @XMM[0]
2314392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20($inp), $inp
2315392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+1], @XMM[1]
2316392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2317392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2318392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_encrypt8
2320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20($out), $out
2326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
2329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
# Exactly one full block remains: it is run through the scalar
# asm_AES_encrypt routine instead of the bit-sliced _bsaes_encrypt8 core
# (that call sequence is kept below, commented out).
# NOTE(review): XMM[0] presumably holds tweak[0] and XMM[8] input block 0,
# as prepared by the unrolled tweak loop earlier in this function - confirm.
2330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_1:
2331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[0], @XMM[8]
2332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($inp), $inp
# 0x20(rbp) is a 16-byte scratch slot used as both source and destination
# of the scalar call (arg1 = in, arg2 = out, arg3 = key).
2333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[8], 0x20(%rbp)
2334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg1
2335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg2
2336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($key), $arg3
2337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_encrypt		# doesn't touch %xmm
# XMM[0] survived the call, so XOR-ing the encrypted scratch block into it
# yields the final output block.
2338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#pxor	@XMM[8], @XMM[0]
2340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#lea	0x80(%rsp), %rax	# pass key schedule
2341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#mov	%edx, %r10d		# pass rounds
2342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#call	_bsaes_encrypt8
2343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($out), $out
2346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Tweak slot 1 on the stack becomes the running tweak for whatever follows.
2347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Common exit of all encrypt paths: perform ciphertext stealing when the
# input length is not a multiple of 16, otherwise fall through to cleanup.
# NOTE(review): ebx presumably still holds the caller's byte length (the
# decrypt routine in this file backs len up into rbx) - confirm against the
# encrypt prologue, which is outside this part.
2349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_done:
# ebx = number of trailing bytes (length mod 16); zero means nothing to steal
2350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$15, %ebx
2351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lxts_enc_ret
# rdx walks the partial output block; rdx-16 walks the last full output block
2352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$out, %rdx
2353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Byte-wise swap, ebx iterations: each trailing input byte replaces a byte
# of the last full output block, and the byte it displaces is emitted as
# the corresponding byte of the final partial output block.
2354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_steal:
2355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movzb	($inp), %eax
2356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movzb	-16(%rdx), %ecx
2357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($inp), $inp
2358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%al, -16(%rdx)
2359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%cl, 0(%rdx)
2360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1(%rdx), %rdx
2361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$1,%ebx
2362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lxts_enc_steal
2363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Re-encrypt the patched block at out-16 under the tweak held in XMM[7]:
# XOR with tweak, scalar AES via the 0x20(rbp) scratch slot, XOR with the
# tweak again, then overwrite the block in place.
2364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	-16($out), @XMM[0]
2365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg1
2366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[7], @XMM[0]
2367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg2
2368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], 0x20(%rbp)
2369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($key), $arg3
2370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_encrypt		# doesn't touch %xmm
# XMM[7] (the tweak) is reused as the accumulator for the final result
2371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[7]
2372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], -16($out)
2373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Cleanup: zero everything between the current stack pointer and the frame
# base saved in rbp, then release that area.
# NOTE(review): by analogy with the decrypt setup in this file, the area
# presumably holds the tweak slots and the bit-sliced key schedule - confirm
# against the encrypt setup, which is outside this part.
2374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_ret:
2375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsp), %rax
2376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm0, %xmm0
# 32 bytes are cleared per pass; the loop continues while rbp > rax (unsigned)
2377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_bzero:			# wipe key schedule [if any]
2378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x00(%rax)
2379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x10(%rax)
2380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rax), %rax
2381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%rax, %rbp
2382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ja	.Lxts_enc_bzero
2383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rbp),%rsp		# restore %rsp
2385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64 only: reload xmm6-xmm15 from their save slots at 0x40..0xd0 above
# the frame base and drop the extra 0xa0 bytes of frame.  The offsets match
# the save sequence visible in the bsaes_xts_decrypt prologue of this file;
# the encrypt prologue itself is outside this part.
2386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
2387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x40(%rbp), %xmm6
2388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x50(%rbp), %xmm7
2389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x60(%rbp), %xmm8
2390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x70(%rbp), %xmm9
2391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x80(%rbp), %xmm10
2392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x90(%rbp), %xmm11
2393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xa0(%rbp), %xmm12
2394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xb0(%rbp), %xmm13
2395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xc0(%rbp), %xmm14
2396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xd0(%rbp), %xmm15
# rsp now points at the same frame layout the non-Win64 build has here
2397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0xa0(%rbp), %rsp
2398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Encrypt epilogue: reload the callee-saved general registers from the
# frame, unwind the 0x78-byte frame and return.  The offsets correspond to
# six 8-byte pushes stacked above a 0x48-byte local area (the layout the
# decrypt prologue in this file builds).
2399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x48(%rsp), %r15
2401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x50(%rsp), %r14
2402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x58(%rsp), %r13
2403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x60(%rsp), %r12
2404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x68(%rsp), %rbx
# the saved rbp travels through rax because rsp is adjusted before rbp is set
2405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x70(%rsp), %rax
2406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x78(%rsp), %rsp
2407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax, %rbp
2408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_epilogue:
2409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
#
# bsaes_xts_decrypt(inp, out, len, key, key2, ivp)
#
# XTS decryption entry point.  As consumed by the body that follows:
#   arg1..arg4  input pointer, output pointer, byte length, data key
#   arg5        second key, used only to encrypt the IV into the first tweak
#   arg6        pointer to the 16-byte IV
# Full blocks go through _bsaes_decrypt8 eight at a time; a single leftover
# block uses the scalar asm_AES_decrypt routine.
#
2412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	bsaes_xts_decrypt
2413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	bsaes_xts_decrypt,\@abi-omnipotent
2414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrombsaes_xts_decrypt:
# Entry stack pointer is captured in rax; the visible body never reads it
# back before overwriting rax.
# NOTE(review): presumably for the Win64 unwind handler keyed on the
# prologue label - confirm.
2416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax
2417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_prologue:
# Six callee-saved registers (0x30 bytes) plus 0x48 bytes of locals give a
# 0x78-byte frame; the epilogue offsets rely on this exact push order.
2418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
2419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
2420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
2421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
2422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
2423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
2424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0x48(%rsp), %rsp
2425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64 only: fetch the 5th and 6th arguments from the caller's stack and
# preserve xmm6-xmm15, which that ABI treats as callee-saved.
2426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
# rsp is 0x78 below its entry value here, so 0xa0/0xa8 address entry
# rsp+0x28 and +0x30, the stack homes of arguments five and six.
2427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0xa0(%rsp),$arg5	# pull key2
2428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0xa8(%rsp),$arg6	# pull ivp
# extend the frame by 0xa0 bytes; the xmm save slots occupy 0x40..0xdf
2429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0xa0(%rsp), %rsp
2430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6, 0x40(%rsp)
2431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7, 0x50(%rsp)
2432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8, 0x60(%rsp)
2433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9, 0x70(%rsp)
2434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm10, 0x80(%rsp)
2435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm11, 0x90(%rsp)
2436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm12, 0xa0(%rsp)
2437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm13, 0xb0(%rsp)
2438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm14, 0xc0(%rsp)
2439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm15, 0xd0(%rsp)
# this label is emitted only in Win64 builds
2440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_body:
2441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Decrypt setup: derive the initial tweak, build the bit-sliced key
# schedule on the stack, round the length down to whole blocks and reserve
# eight tweak slots, then enter either the 8-block loop or the short path.
2442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rbp		# backup %rsp
2444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg1, $inp		# backup arguments
2445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg2, $out
2446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg3, $len
2447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$arg4, $key
2448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# initial tweak = AES-encrypt(IV) under the second key; the result lands in
# the 16-byte scratch slot at 0x20(rbp).  Encryption is used here even
# though the routine decrypts data.
2449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($arg6), $arg1
2450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg2
2451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($arg5), $arg3
2452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_encrypt		# generate initial tweak
2453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key), %eax		# rounds
2455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$len, %rbx		# backup $len
2456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# edx keeps the round count for every later core call;
# rax becomes rounds*128 - 96, the stack space taken for the schedule
2457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%eax, %edx		# rounds
2458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$7, %rax		# 128 bytes per inner round key
2459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	%rax, %rsp
2461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsp, %rax		# pass key schedule
2463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key, %rcx		# pass key
2464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_key_convert
# NOTE(review): relies on _bsaes_key_convert returning values in xmm6, xmm7
# and rax (defined outside this part) - confirm its contract there.
2466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	(%rsp), %xmm7		# fix up round 0 key
2467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm6, (%rax)		# save last round key
2468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm7, (%rsp)
2469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# len = (len rounded down to 16) minus 16 more when a partial tail exists:
# one full block is held back from the bulk paths in that case.  rbx keeps
# the original length for the tail test at .Lxts_dec_done.
2470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	%eax, %eax		# if ($len%16) len-=16;
2471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$-16, $len
2472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	test	\$15, %ebx
2473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	setnz	%al
2474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$4, %rax
2475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	%rax, $len
2476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# after this the tweak slots sit at 0x00..0x7f(rsp) and the key schedule
# starts at 0x80(rsp)
2477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80, %rsp		# place for tweak[8]
2478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# prime the carry mask: each dword becomes all-ones where the matching
# tweak dword has its top bit set (signed compare of zero against it)
2480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lxts_magic(%rip), $twmask
2482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# fewer than 8 whole blocks (borrow on the subtract) goes to the short path
2484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80, $len
2485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jc	.Lxts_dec_short
2486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_loop
2487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_loop:
2490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Generation-time unrolling of the main-loop tweak schedule (7 passes).
# Pass i copies the running tweak from XMM[7] into XMM[i] and into stack
# slot i, then advances XMM[7]: paddq doubles both 64-bit halves, and the
# pshufd/pand/pxor sequence folds the bits shifted out of each half back in
# through the .Lxts_magic mask.
# NOTE(review): .Lxts_magic is defined outside this part; presumably it
# carries the XTS GF(2^128) reduction constant - confirm.
# Input loads run one pass behind the tweak computation and the
# input^tweak XORs two passes behind, so the work is interleaved.
2491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    for ($i=0;$i<7;$i++) {
2492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___;
2493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13, $twtmp, $twres
2494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], @XMM[$i]
2496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask, $twres		# isolate carry and residue
2499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres, @XMM[7]
2501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# from the second pass on: load input block i-1 into the upper register bank
2502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___ if ($i>=1);
2503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# from the third pass on: combine input block i-2 with its tweak
2505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___ if ($i>=2);
2506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    }
# Remainder of the 8-block main loop: finish loading and XOR-ing blocks
# 5..7, decrypt all eight, XOR with the saved tweaks, store, compute the
# tweak for the next iteration and loop while at least 8 blocks remain.
2509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[8+6]
2511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+5], @XMM[5]
2512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp), @XMM[8+7]
2513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp), $inp
# XMM[7] still holds the eighth tweak; save it before it is consumed below
2514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], 0x70(%rsp)
2515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+6], @XMM[6]
2516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+7], @XMM[7]
2518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
2521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# The results are consumed in register order 0,1,6,4,2,7,3,5: that is the
# order in which they are paired with tweak slots 0..7 and output offsets.
2522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[6]
2526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[4]
2528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
2529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x40(%rsp), @XMM[2]
2530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
2531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x50(%rsp), @XMM[7]
2532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x40($out)
2533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x60(%rsp), @XMM[3]
2534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
2535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x70(%rsp), @XMM[5]
2536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x60($out)
2537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[5], 0x70($out)
2538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out), $out
2539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Reload the eighth tweak and advance it once; the mask registers are
# rebuilt here, after the core call.
2540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lxts_magic(%rip), $twmask
2543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp
2544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13, $twtmp, $twres
2545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask, $twres		# isolate carry and residue
2548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres, @XMM[7]
2550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80,$len
2552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lxts_dec_loop
2553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Short path: undo the speculative subtract so len is the number of bytes
# in the remaining whole blocks (0..0x70); zero means no full block is left.
2554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_short:
2555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$0x80, $len
2556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lxts_dec_done
2557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Short-path copy of the unrolled tweak schedule.  It performs the same
# per-pass tweak save/advance and pipelined input load/XOR as the main
# loop, with one addition: after loading input block i-1, pass i branches
# to .Lxts_dec_<i> when exactly i blocks (0x10*i bytes) remain.  At that
# branch, blocks 0..i-3 have already been XOR-ed with their tweaks; the
# target label finishes the remaining XORs.
2558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    for ($i=0;$i<7;$i++) {
2559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___;
2560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13, $twtmp, $twres
2561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], @XMM[$i]
2563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask, $twres		# isolate carry and residue
2566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres, @XMM[7]
2568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# load input block i-1, then leave the schedule if it was the last one
2569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___ if ($i>=1);
2570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$`0x10*$i`,$len
2572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lxts_dec_$i
2573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# only reached when more blocks follow: XOR block i-2 with its tweak
2574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___ if ($i>=2);
2575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    }
# Fall-through of the short path: none of the 1..6 block exits was taken,
# so seven blocks remain.  The eighth lane of the core call carries the
# un-XOR-ed tweak in XMM[7]; its result is never stored.
2578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp), @XMM[8+6]
2580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+5], @XMM[5]
# keep the eighth tweak: it becomes the running tweak after this batch
2581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], 0x70(%rsp)
2582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x70($inp), $inp
2583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+6], @XMM[6]
2584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
2588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# results are consumed in register order 0,1,6,4,2,7,3 for blocks 0..6
2589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[6]
2593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[4]
2595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
2596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x40(%rsp), @XMM[2]
2597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
2598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x50(%rsp), @XMM[7]
2599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x40($out)
2600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x60(%rsp), @XMM[3]
2601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
2602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[3], 0x60($out)
2603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x70($out), $out
2604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
2607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
# Six blocks remain.  Blocks 0..3 were XOR-ed with their tweaks before the
# branch here; finish blocks 4 and 5, decrypt, and emit six results.  The
# two unused lanes of the core call are computed but never stored.
2608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_6:
2609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+4], @XMM[4]
2610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x60($inp), $inp
2611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+5], @XMM[5]
2612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
2616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# results are consumed in register order 0,1,6,4,2,7 for blocks 0..5
2617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[6]
2621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[4]
2623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
2624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x40(%rsp), @XMM[2]
2625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
2626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x50(%rsp), @XMM[7]
2627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x40($out)
2628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], 0x50($out)
2629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x60($out), $out
2630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# slot 6 was written by the schedule pass that branched here
2631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
2633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
# Five blocks remain.  Blocks 0..2 were XOR-ed with their tweaks before the
# branch here; finish blocks 3 and 4, decrypt, and emit five results.
2634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_5:
2635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+3], @XMM[3]
2636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x50($inp), $inp
2637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+4], @XMM[4]
2638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
2642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# results are consumed in register order 0,1,6,4,2 for blocks 0..4
2643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[6]
2647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[4]
2649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
2650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x40(%rsp), @XMM[2]
2651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
2652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[2], 0x40($out)
2653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x50($out), $out
2654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# slot 5 was written by the schedule pass that branched here
2655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
2657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
# Four blocks remain.  Blocks 0..1 were XOR-ed with their tweaks before the
# branch here; finish blocks 2 and 3, decrypt, and emit four results.
2658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_4:
2659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+2], @XMM[2]
2660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x40($inp), $inp
2661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+3], @XMM[3]
2662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2664392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
2666392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# results are consumed in register order 0,1,6,4 for blocks 0..3
2667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[6]
2671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x30(%rsp), @XMM[4]
2673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
2674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[4], 0x30($out)
2675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x40($out), $out
2676392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# slot 4 was written by the schedule pass that branched here
2677392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2678392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
2679392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
# Three blocks remain.  Block 0 was XOR-ed with its tweak before the branch
# here; finish blocks 1 and 2, decrypt, and emit three results.
2680392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_3:
2681392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+1], @XMM[1]
2682392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x30($inp), $inp
2683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+2], @XMM[2]
2684392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
2688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# results are consumed in register order 0,1,6 for blocks 0..2
2689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rsp), @XMM[6]
2693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], 0x20($out)
2695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x30($out), $out
2696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# slot 3 was written by the schedule pass that branched here
2697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
2699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
# Two blocks remain.  Neither has been XOR-ed with its tweak yet (the
# branch here precedes the first pipelined XOR), so do both, decrypt, and
# emit two results.
2700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_2:
2701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+0], @XMM[0]
2702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20($inp), $inp
2703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[8+1], @XMM[1]
2704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80(%rsp), %rax	# pass key schedule
2705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%edx, %r10d		# pass rounds
2706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_bsaes_decrypt8
2708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x10(%rsp), @XMM[1]
2711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[1], 0x10($out)
2713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20($out), $out
2714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# slot 2 was written by the schedule pass that branched here
2715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
2717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
# One block remains: XMM[0] holds tweak 0 and XMM[8] holds input block 0
# (both set by the short-path schedule before the branch here).  The block
# goes through the scalar asm_AES_decrypt routine instead of the bit-sliced
# core; that call sequence is kept below, commented out.
2718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_1:
2719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[0], @XMM[8]
2720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($inp), $inp
# 0x20(rbp) is a 16-byte scratch slot used as both source and destination
# of the scalar call (arg1 = in, arg2 = out, arg3 = key)
2721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[8], 0x20(%rbp)
2722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg1
2723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg2
2724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($key), $arg3
2725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_decrypt		# doesn't touch %xmm
# XMM[0] (tweak 0) survived the call; XOR-ing the decrypted scratch block
# into it yields the output block
2726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#pxor	@XMM[8], @XMM[0]
2728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#lea	0x80(%rsp), %rax	# pass key schedule
2729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#mov	%edx, %r10d		# pass rounds
2730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#call	_bsaes_decrypt8
2731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[0], 0x00($out)	# write output
2733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($out), $out
2734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# slot 1 was written by the schedule pass that branched here
2735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_done:
2738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$15, %ebx
2739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lxts_dec_ret
2740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp, $twtmp
2742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lxts_magic(%rip), $twmask
2743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@XMM[7], $twtmp
2744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13, $twtmp, $twres
2745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[7], @XMM[6]
2746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask, $twres		# isolate carry and residue
2748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($inp), @XMM[0]
2749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres, @XMM[7]
2750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg1
2752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[7], @XMM[0]
2753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg2
2754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], 0x20(%rbp)
2755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($key), $arg3
2756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_decrypt		# doesn't touch %xmm
2757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[7]
2758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$out, %rdx
2759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[7], ($out)
2760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_steal:
2762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movzb	16($inp), %eax
2763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movzb	(%rdx), %ecx
2764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($inp), $inp
2765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%al, (%rdx)
2766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%cl, 16(%rdx)
2767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1(%rdx), %rdx
2768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$1,%ebx
2769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lxts_dec_steal
2770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($out), @XMM[0]
2772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg1
2773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@XMM[6], @XMM[0]
2774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rbp), $arg2
2775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@XMM[0], 0x20(%rbp)
2776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	($key), $arg3
2777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	asm_AES_decrypt		# doesn't touch %xmm
2778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	0x20(%rbp), @XMM[6]
2779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	@XMM[6], ($out)
2780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_ret:
2782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsp), %rax
2783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm0, %xmm0
2784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_bzero:			# wipe key schedule [if any]
2785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x00(%rax)
2786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm0, 0x10(%rax)
2787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rax), %rax
2788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%rax, %rbp
2789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ja	.Lxts_dec_bzero
2790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rbp),%rsp		# restore %rsp
2792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64 only: reload %xmm6-%xmm15 from the save area at 0x40..0xd0(%rbp), then
# release that area by pointing %rsp at 0xa0(%rbp).
2793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
2794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x40(%rbp), %xmm6
2795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x50(%rbp), %xmm7
2796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x60(%rbp), %xmm8
2797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x70(%rbp), %xmm9
2798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x80(%rbp), %xmm10
2799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x90(%rbp), %xmm11
2800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xa0(%rbp), %xmm12
2801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xb0(%rbp), %xmm13
2802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xc0(%rbp), %xmm14
2803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xd0(%rbp), %xmm15
2804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0xa0(%rbp), %rsp
2805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Common epilogue of bsaes_xts_decrypt: reload %r15, %r14, %r13, %r12, %rbx and
# the saved %rbp (staged through %rax) from 0x48..0x70(%rsp), drop the 0x78-byte
# frame and return.  .Lxts_dec_epilogue labels the final ret; the Win64 .pdata
# and .xdata records emitted later in this file reference that label.
2806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x48(%rsp), %r15
2808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x50(%rsp), %r14
2809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x58(%rsp), %r13
2810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x60(%rsp), %r12
2811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x68(%rsp), %rbx
2812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x70(%rsp), %rax
2813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x78(%rsp), %rsp
2814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax, %rbp
2815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_epilogue:
2816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
# Constant pool, emitted as the 64-byte-aligned object "_bsaes_const".  It holds
# the [Inv]ShiftRows, bit-slice, byte-swap and counter-increment constants
# labelled below, .Lxts_magic (loaded RIP-relative as the tweak mask by the XTS
# tail code), the .Lmasks/.LM0/.L63 tables and the attribution string.
# NOTE(review): the individual quad values are taken as given here; their
# derivation is not visible in this part of the file.
2820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	_bsaes_const,\@object
2822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	64
2823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom_bsaes_const:
2824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LM0ISR:	# InvShiftRows constants
2825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LISRM0:
2827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LISR:
2829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LBS0:		# bit-slice constants
2831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x5555555555555555, 0x5555555555555555
2832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LBS1:
2833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x3333333333333333, 0x3333333333333333
2834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LBS2:
2835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSR:		# shiftrows constants
2837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSRM0:
2839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LM0SR:
2841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSWPUP:	# byte-swap upper dword
2843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSWPUPM0SR:
2845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
2846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LADD1:		# counter increment constants
2847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0000000000000000, 0x0000000100000000
2848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LADD2:
2849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0000000000000000, 0x0000000200000000
2850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LADD3:
2851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0000000000000000, 0x0000000300000000
2852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LADD4:
2853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0000000000000000, 0x0000000400000000
2854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LADD5:
2855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0000000000000000, 0x0000000500000000
2856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LADD6:
2857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0000000000000000, 0x0000000600000000
2858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LADD7:
2859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0000000000000000, 0x0000000700000000
2860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LADD8:
2861392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.quad	0x0000000000000000, 0x0000000800000000
2862392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_magic:
2863392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0x87,0,1,0
2864a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom.Lmasks:
2865a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	.quad	0x0101010101010101, 0x0101010101010101
2866a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	.quad	0x0202020202020202, 0x0202020202020202
2867a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	.quad	0x0404040404040404, 0x0404040404040404
2868a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	.quad	0x0808080808080808, 0x0808080808080808
2869a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom.LM0:
2870a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2871a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom.L63:
2872a1a5710c055e139ea00e785f9eb55b3af3e4dab1Brian Carlstrom	.quad	0x6363636363636363, 0x6363636363636363
2873392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2874392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	64
2875392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	_bsaes_const,.-_bsaes_const
2876392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2877392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Win64 structured-exception-handling support, generated only when $win64 is
# set.  Emits se_handler (one unwind handler shared by every entry point listed
# below), then the .pdata and .xdata records that attach it to each function.
2878392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2879392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2880392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromif ($win64) {
# The handler's four arguments, in prototype order, and the registers that
# carry them.
2881392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$rec="%rcx";
2882392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$frame="%rdx";
2883392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$context="%r8";
2884392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$disp="%r9";
2885392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# se_handler control flow:
#  - context->Rip below HandlerData[0] (the function's body label): only
#    context->Rsp is rewritten, with the value read from context->Rax.
#  - context->Rip at or past HandlerData[1] (the epilogue label): context->Rsp
#    is written back unchanged.
#  - otherwise the frame is located through context->Rbp: the ten saved xmm
#    registers at 0x40 off that frame are copied to context->Xmm6 onwards,
#    rbp/rbx/r12-r15 are reloaded from 0x48..0x70 past the 0xa0-byte xmm area
#    and stored into the context, and the adjusted pointer becomes
#    context->Rsp.  These offsets match the function epilogues' own restore
#    sequence.
# In every case the CONTEXT is then copied to disp->ContextRecord,
# RtlVirtualUnwind is called, and 1 (ExceptionContinueSearch) is returned.
# The same heredoc ends by opening the .pdata section.
2886392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2887392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.extern	__imp_RtlVirtualUnwind
2888392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	se_handler,\@abi-omnipotent
2889392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2890392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromse_handler:
2891392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rsi
2892392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rdi
2893392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
2894392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
2895392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
2896392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
2897392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
2898392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
2899392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pushfq
2900392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$64,%rsp
2901392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2902392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	120($context),%rax	# pull context->Rax
2903392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	248($context),%rbx	# pull context->Rip
2904392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2905392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($disp),%rsi		# disp->ImageBase
2906392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	56($disp),%r11		# disp->HandlerData
2907392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2908392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0(%r11),%r10d		# HandlerData[0]
2909392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsi,%r10),%r10	# prologue label
2910392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip<prologue label
2911392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lin_prologue
2912392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2913392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	152($context),%rax	# pull context->Rsp
2914392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2915392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	4(%r11),%r10d		# HandlerData[1]
2916392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsi,%r10),%r10	# epilogue label
2917392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip>=epilogue label
2918392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jae	.Lin_prologue
2919392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2920392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	160($context),%rax	# pull context->Rbp
2921392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2922392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x40(%rax),%rsi		# %xmm save area
2923392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	512($context),%rdi	# &context.Xmm6
2924392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2925392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0xa548f3fc		# cld; rep movsq
2926392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0xa0(%rax),%rax		# adjust stack pointer
2927392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2928392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x70(%rax),%rbp
2929392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x68(%rax),%rbx
2930392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x60(%rax),%r12
2931392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x58(%rax),%r13
2932392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x50(%rax),%r14
2933392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0x48(%rax),%r15
2934392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x78(%rax),%rax		# adjust stack pointer
2935392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rbx,144($context)	# restore context->Rbx
2936392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rbp,160($context)	# restore context->Rbp
2937392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r12,216($context)	# restore context->R12
2938392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r13,224($context)	# restore context->R13
2939392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r14,232($context)	# restore context->R14
2940392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r15,240($context)	# restore context->R15
2941392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2942392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lin_prologue:
2943392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,152($context)	# restore context->Rsp
2944392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2945392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	40($disp),%rdi		# disp->ContextRecord
2946392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$context,%rsi		# context
2947392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
2948392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0xa548f3fc		# cld; rep movsq
2949392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2950392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$disp,%rsi
2951392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2952392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2953392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2954392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2955392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	40(%rsi),%r10		# disp->ContextRecord
2956392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	56(%rsi),%r11		# &disp->HandlerData
2957392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2958392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r10,32(%rsp)		# arg5
2959392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r11,40(%rsp)		# arg6
2960392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r12,48(%rsp)		# arg7
2961392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rcx,56(%rsp)		# arg8, (NULL)
2962392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	*__imp_RtlVirtualUnwind(%rip)
2963392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2964392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$1,%eax		# ExceptionContinueSearch
2965392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$64,%rsp
2966392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	popfq
2967392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%r15
2968392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%r14
2969392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%r13
2970392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%r12
2971392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%rbp
2972392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%rbx
2973392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%rdi
2974392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%rsi
2975392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2976392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	se_handler,.-se_handler
2977392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2978392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.section	.pdata
2979392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	4
2980392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# .pdata entries: one triple of RVAs per function (prologue label, epilogue
# label, info record).  The ECB pair is emitted only when $ecb is set.
2981392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($ecb);
2982392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lecb_enc_prologue
2983392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lecb_enc_epilogue
2984392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lecb_enc_info
2985392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2986392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lecb_dec_prologue
2987392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lecb_dec_epilogue
2988392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lecb_dec_info
2989392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Unconditional .pdata entries for CBC decrypt, CTR encrypt and XTS
# encrypt/decrypt; this heredoc also opens the .xdata section.
2990392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2991392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lcbc_dec_prologue
2992392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lcbc_dec_epilogue
2993392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lcbc_dec_info
2994392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2995392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lctr_enc_prologue
2996392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lctr_enc_epilogue
2997392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lctr_enc_info
2998392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2999392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_enc_prologue
3000392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_enc_epilogue
3001392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_enc_info
3002392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
3003392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_dec_prologue
3004392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_dec_epilogue
3005392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_dec_info
3006392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
3007392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.section	.xdata
3008392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	8
3009392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# .xdata info records: each names se_handler and supplies the two HandlerData
# RVAs (body label, epilogue label) that se_handler compares context->Rip
# against.  As with .pdata, the ECB records depend on $ecb.
3010392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($ecb);
3011392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_info:
3012392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3013392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	se_handler
3014392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3015392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_info:
3016392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3017392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	se_handler
3018392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3019392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Unconditional .xdata info records for CBC, CTR and XTS.
3020392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
3021392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_info:
3022392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3023392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	se_handler
3024392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3025392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr_enc_info:
3026392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3027392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	se_handler
3028392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3029392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_info:
3030392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3031392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	se_handler
3032392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3033392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_info:
3034392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3035392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	se_handler
3036392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3037392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
3038392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
3039392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Final pass: every `...` span left in the generated assembly is replaced by
# the result of evaluating its contents as Perl (for example `1232/8` becomes
# 154), after which the finished text is written to STDOUT.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Buffered write errors only surface at close, so a failed close must be fatal;
# otherwise a truncated or empty assembly file is produced while this script
# still exits 0.
# NOTE(review): STDOUT is presumably reopened earlier in the file as a pipe to
# the perlasm translator, in which case this also catches a failing child --
# confirm against the top of the script.
close STDOUT or die "error closing STDOUT: $!";
3045