1d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#!/usr/bin/env perl
2d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3d9e397b599b13d642138480a28c14db7a136bf0Adam Langley###################################################################
4d9e397b599b13d642138480a28c14db7a136bf0Adam Langley### AES-128 [originally in CTR mode]				###
5d9e397b599b13d642138480a28c14db7a136bf0Adam Langley### bitsliced implementation for Intel Core 2 processors	###
6d9e397b599b13d642138480a28c14db7a136bf0Adam Langley### requires support of SSE extensions up to SSSE3		###
7d9e397b599b13d642138480a28c14db7a136bf0Adam Langley### Author: Emilia Käsper and Peter Schwabe			###
8d9e397b599b13d642138480a28c14db7a136bf0Adam Langley### Date: 2009-03-19						###
9d9e397b599b13d642138480a28c14db7a136bf0Adam Langley### Public domain						###
10d9e397b599b13d642138480a28c14db7a136bf0Adam Langley###								###
11d9e397b599b13d642138480a28c14db7a136bf0Adam Langley### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
12d9e397b599b13d642138480a28c14db7a136bf0Adam Langley### further information.					###
13d9e397b599b13d642138480a28c14db7a136bf0Adam Langley###################################################################
14d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
15d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# September 2011.
16d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
# Started as a transliteration to "perlasm", the original code has
# since undergone the following changes:
19d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
20d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# - code was made position-independent;
21d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# - rounds were folded into a loop resulting in >5x size reduction
22d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allowed feeding its output back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
26d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# - some instruction reordering and interleaving;
27d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# - this module doesn't implement key setup subroutine, instead it
28d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#   relies on conversion of "conventional" key schedule as returned
29d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#   by AES_set_encrypt_key (see discussion below);
30d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# - first and last round keys are treated differently, which allowed
31d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#   to skip one shiftrows(), reduce bit-sliced key schedule and
32d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#   speed-up conversion by 22%;
33d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# - support for 192- and 256-bit keys was added;
34d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
35d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Resulting performance in CPU cycles spent to encrypt one byte out
36d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# of 4096-byte buffer with 128-bit key is:
37d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
38d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#		Emilia's	this(*)		difference
39d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
40d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Core 2    	9.30		8.69		+7%
41d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Nehalem(**) 	7.63		6.88		+11%
42d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Atom	    	17.1		16.4		+4%
43d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Silvermont	-		12.9
44a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan# Goldmont	-		8.85
45d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
46d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# (*)	Comparison is not completely fair, because "this" is ECB,
47d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#	i.e. no extra processing such as counter values calculation
48d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#	and xor-ing input as in Emilia's CTR implementation is
49d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#	performed. However, the CTR calculations stand for not more
50d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#	than 1% of total time, so comparison is *rather* fair.
51d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
52d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# (**)	Results were collected on Westmere, which is considered to
53d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#	be equivalent to Nehalem for this code.
54d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
55d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# As for key schedule conversion subroutine. Interface to OpenSSL
56d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# relies on per-invocation on-the-fly conversion. This naturally
57d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# has impact on performance, especially for short inputs. Conversion
58d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# time in CPU cycles and its ratio to CPU cycles spent in 8x block
59d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# function is:
60d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
61d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# 		conversion	conversion/8x block
62d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Core 2	240		0.22
63d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Nehalem	180		0.20
64d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Atom		430		0.20
65d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
66d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# The ratio values mean that 128-byte blocks will be processed
67d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
68d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# etc. Then keep in mind that input sizes not divisible by 128 are
69d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# *effectively* slower, especially shortest ones, e.g. consecutive
70d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# 144-byte blocks are processed 44% slower than one would expect,
71d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
72d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# it's still faster than ["hyper-threading-safe" code path in]
73d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# aes-x86_64.pl on all lengths above 64 bytes...
74d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
75d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# October 2011.
76d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
77d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Add decryption procedure. Performance in CPU cycles spent to decrypt
78d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# one byte out of 4096-byte buffer with 128-bit key is:
79d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
80d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Core 2	9.98
81d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Nehalem	7.80
82d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Atom		17.9
83d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Silvermont	14.0
84a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan# Goldmont	10.2
85d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
86d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# November 2011.
87d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
88d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
89d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# suboptimal, but XTS is meant to be used with larger blocks...
90d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
91d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#						<appro@openssl.org>
92d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Command line: [flavour] output.  A lone argument that contains a dot is
# taken to be the output file name, which leaves the flavour undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected by a nasm/masm/mingw64 flavour or by an ".asm" output.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's path (with trailing separator), or the
# empty string when the script was invoked without one.  Taking the capture
# only on a successful match avoids picking up a stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then under
# ../../../perlasm/ relative to it.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# A failed spawn must be fatal: otherwise the generated code would be
# discarded silently and the build would carry on with no output.
open(OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"")
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Only four registers are listed for five names, so $ivp starts out
# undefined here.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
110d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
111d9e397b599b13d642138480a28c14db7a136bf0Adam Langley{
112d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($key,$rounds,$const)=("%rax","%r10d","%r11");
113d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
sub Sbox {
    # Emits the bit-sliced S-box as three stages: input basis change,
    # Inv_GF256, output basis change.
    #
    # Arguments: 16 register names -- the 8 bit-slice registers followed
    # by 8 scratch registers (t0-t3, then s0-s3).
    #
    # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
    # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
    my @regs    = @_;
    my @bits    = @regs[0 .. 7];
    my @scratch = @regs[8 .. 15];

    InBasisChange(@bits);
    # Each stage leaves its result in permuted registers, hence the
    # reordered slices handed to the next stage.
    Inv_GF256(@bits[6, 5, 0, 3, 7, 1, 4, 2], @scratch);
    OutBasisChange(@bits[7, 1, 4, 2, 6, 5, 0, 3]);
}
124d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
sub InBasisChange {
    # Appends the input basis-change XOR network to $code.
    #
    # Arguments: 8 register names holding the bit slices.
    #
    # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
    # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
    my @r = @_[0 .. 7];

    $code .= <<"END_ASM";
	pxor	$r[6], $r[5]
	pxor	$r[1], $r[2]
	pxor	$r[0], $r[3]
	pxor	$r[2], $r[6]
	pxor 	$r[0], $r[5]

	pxor	$r[3], $r[6]
	pxor	$r[7], $r[3]
	pxor	$r[5], $r[7]
	pxor	$r[4], $r[3]
	pxor	$r[5], $r[4]
	pxor	$r[1], $r[3]

	pxor	$r[7], $r[2]
	pxor	$r[5], $r[1]
END_ASM
}
147d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
sub OutBasisChange {
    # Appends the output basis-change XOR network to $code.
    #
    # Arguments: 8 register names holding the bit slices.
    #
    # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
    # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
    my @r = @_[0 .. 7];

    $code .= <<"END_ASM";
	pxor	$r[6], $r[0]
	pxor	$r[4], $r[1]
	pxor	$r[0], $r[2]
	pxor	$r[6], $r[4]
	pxor	$r[1], $r[6]

	pxor	$r[5], $r[1]
	pxor	$r[3], $r[5]
	pxor	$r[7], $r[3]
	pxor	$r[5], $r[7]
	pxor	$r[5], $r[2]

	pxor	$r[7], $r[4]
END_ASM
}
168d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
sub InvSbox {
    # Emits the bit-sliced inverse S-box as three stages: inverse input
    # basis change, Inv_GF256, inverse output basis change.
    #
    # Arguments: 16 register names -- the 8 bit-slice registers followed
    # by 8 scratch registers (t0-t3, then s0-s3).
    #
    # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
    # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
    my @regs    = @_;
    my @bits    = @regs[0 .. 7];
    my @scratch = @regs[8 .. 15];

    InvInBasisChange(@bits);
    # Each stage leaves its result in permuted registers, hence the
    # reordered slices handed to the next stage.
    Inv_GF256(@bits[5, 1, 2, 6, 3, 7, 0, 4], @scratch);
    InvOutBasisChange(@bits[3, 7, 0, 4, 5, 1, 2, 6]);
}
179d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
sub InvInBasisChange {		# OutBasisChange in reverse
    # Appends the inverse input basis change to $code.  The 8 register
    # names are permuted on entry; the body then emits the same pxor
    # pairs as OutBasisChange.
    my @r = @_[5, 1, 2, 6, 3, 7, 0, 4];

    $code .= <<"END_ASM";
	pxor	$r[7], $r[4]

	pxor	$r[5], $r[7]
	pxor	$r[5], $r[2]
	pxor	$r[7], $r[3]
	pxor	$r[3], $r[5]
	pxor	$r[5], $r[1]

	pxor	$r[1], $r[6]
	pxor	$r[0], $r[2]
	pxor	$r[6], $r[4]
	pxor	$r[6], $r[0]
	pxor	$r[4], $r[1]
END_ASM
}
198d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
sub InvOutBasisChange {		# InBasisChange in reverse
    # Appends the inverse output basis change to $code.  The 8 register
    # names are permuted on entry; the body then emits the same pxor
    # pairs as InBasisChange.
    my @r = @_[2, 5, 7, 3, 6, 1, 0, 4];

    $code .= <<"END_ASM";
	pxor	$r[5], $r[1]
	pxor	$r[7], $r[2]

	pxor	$r[1], $r[3]
	pxor	$r[5], $r[4]
	pxor	$r[5], $r[7]
	pxor	$r[4], $r[3]
	 pxor 	$r[0], $r[5]
	pxor	$r[7], $r[3]
	 pxor	$r[2], $r[6]
	 pxor	$r[1], $r[2]
	pxor	$r[3], $r[6]

	pxor	$r[0], $r[3]
	pxor	$r[6], $r[5]
END_ASM
}
219d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
sub Mul_GF4 {
    #;*************************************************************
    #;* Mul_GF4: Input u0-u1,v0-v1 Output u0-u1 Temp tmp (8)      *
    #;*************************************************************
    # Arguments: five register names.  The result overwrites the first
    # two; the third and fourth are only read; the fifth is scratch.
    my ($u0, $u1, $v0, $v1, $tmp) = @_;

    $code .= <<"END_ASM";
	movdqa	$v0, $tmp
	pxor 	$v1, $tmp
	pand	$u0, $tmp
	pxor	$u1, $u0
	pand	$v0, $u1
	pand	$v1, $u0
	pxor	$u1, $u0
	pxor	$tmp, $u1
END_ASM
}
236d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
sub Mul_GF4_N {				# not used, see next subroutine
    # multiply and scale by N
    #
    # Arguments: five register names.  The result overwrites the first
    # two; the third and fourth are only read; the fifth is scratch.
    # Differs from Mul_GF4 only in the final two pxor instructions.
    my ($u0, $u1, $v0, $v1, $tmp) = @_;

    $code .= <<"END_ASM";
	movdqa	$v0, $tmp
	pxor	$v1, $tmp
	pand	$u0, $tmp
	pxor	$u1, $u0
	pand	$v0, $u1
	pand	$v1, $u0
	pxor	$u0, $u1
	pxor	$tmp, $u0
END_ASM
}
251d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
sub Mul_GF4_N_GF4 {
    # interleaved Mul_GF4_N and Mul_GF4
    #
    # Arguments: ten register names.  The first five feed the Mul_GF4_N
    # stream (n*), the last five feed the Mul_GF4 stream (g*).  The two
    # instruction streams are emitted alternately, line by line.
    my ($nx0, $nx1, $ny0, $ny1, $nt,
        $gx0, $gx1, $gy0, $gy1, $gt) = @_;

    $code .= <<"END_ASM";
	movdqa	$ny0, $nt
	 movdqa	$gy0, $gt
	pxor	$ny1, $nt
	 pxor 	$gy1, $gt
	pand	$nx0, $nt
	 pand	$gx0, $gt
	pxor	$nx1, $nx0
	 pxor	$gx1, $gx0
	pand	$ny0, $nx1
	 pand	$gy0, $gx1
	pand	$ny1, $nx0
	 pand	$gy1, $gx0
	pxor	$nx0, $nx1
	 pxor	$gx1, $gx0
	pxor	$nt, $nx0
	 pxor	$gt, $gx1
END_ASM
}
sub Mul_GF16_2 {
    # Emits code that processes x0-x3 and x4-x7 against the same y0-y3,
    # built from Mul_GF4 and Mul_GF4_N_GF4.
    #
    # Arguments: 16 register names -- x0-x7 (overwritten with the result),
    # y0-y3 (modified by the emitted code) and t0-t3 (scratch).
    my @x = @_[0 .. 7];
    my @y = @_[8 .. 11];
    my @t = @_[12 .. 15];

    # First half: x0-x3.
    $code .= <<"END_ASM";
	movdqa	$x[0], $t[0]
	movdqa	$x[1], $t[1]
END_ASM
    Mul_GF4(@x[0, 1], @y[0, 1], $t[2]);
    $code .= <<"END_ASM";
	pxor	$x[2], $t[0]
	pxor	$x[3], $t[1]
	pxor	$y[2], $y[0]
	pxor	$y[3], $y[1]
END_ASM
    Mul_GF4_N_GF4(@t[0, 1], @y[0, 1], $t[3],
                  @x[2, 3], @y[2, 3], $t[2]);

    # Fold in the shared term, then set up the second half: x4-x7.
    $code .= <<"END_ASM";
	pxor	$t[0], $x[0]
	pxor	$t[0], $x[2]
	pxor	$t[1], $x[1]
	pxor	$t[1], $x[3]

	movdqa	$x[4], $t[0]
	movdqa	$x[5], $t[1]
	pxor	$x[6], $t[0]
	pxor	$x[7], $t[1]
END_ASM
    Mul_GF4_N_GF4(@t[0, 1], @y[0, 1], $t[3],
                  @x[6, 7], @y[2, 3], $t[2]);
    $code .= <<"END_ASM";
	pxor	$y[2], $y[0]
	pxor	$y[3], $y[1]
END_ASM
    Mul_GF4(@x[4, 5], @y[0, 1], $t[3]);
    $code .= <<"END_ASM";
	pxor	$t[0], $x[4]
	pxor	$t[0], $x[6]
	pxor	$t[1], $x[5]
	pxor	$t[1], $x[7]
END_ASM
}
sub Inv_GF256 {
    #;********************************************************************
    #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
    #;********************************************************************
    # Arguments: 16 register names -- x0-x7, then scratch t0-t3 and s0-s3.
    # The instruction order inside the heredoc is deliberate (loads are
    # interleaved with arithmetic) and must not be rearranged.
    my @x = @_[0 .. 7];
    my @t = @_[8 .. 11];
    my @s = @_[12 .. 15];

    # direct optimizations from hardware
    $code .= <<"END_ASM";
	movdqa	$x[4], $t[3]
	movdqa	$x[5], $t[2]
	movdqa	$x[1], $t[1]
	movdqa	$x[7], $s[1]
	movdqa	$x[0], $s[0]

	pxor	$x[6], $t[3]
	pxor	$x[7], $t[2]
	pxor	$x[3], $t[1]
	 movdqa	$t[3], $s[2]
	pxor	$x[6], $s[1]
	 movdqa	$t[2], $t[0]
	pxor	$x[2], $s[0]
	 movdqa	$t[3], $s[3]

	por	$t[1], $t[2]
	por	$s[0], $t[3]
	pxor	$t[0], $s[3]
	pand	$s[0], $s[2]
	pxor	$t[1], $s[0]
	pand	$t[1], $t[0]
	pand	$s[0], $s[3]
	movdqa	$x[3], $s[0]
	pxor	$x[2], $s[0]
	pand	$s[0], $s[1]
	pxor	$s[1], $t[3]
	pxor	$s[1], $t[2]
	movdqa	$x[4], $s[1]
	movdqa	$x[1], $s[0]
	pxor	$x[5], $s[1]
	pxor	$x[0], $s[0]
	movdqa	$s[1], $t[1]
	pand	$s[0], $s[1]
	por	$s[0], $t[1]
	pxor	$s[1], $t[0]
	pxor	$s[3], $t[3]
	pxor	$s[2], $t[2]
	pxor	$s[3], $t[1]
	movdqa	$x[7], $s[0]
	pxor	$s[2], $t[0]
	movdqa	$x[6], $s[1]
	pxor	$s[2], $t[1]
	movdqa	$x[5], $s[2]
	pand	$x[3], $s[0]
	movdqa	$x[4], $s[3]
	pand	$x[2], $s[1]
	pand	$x[1], $s[2]
	por	$x[0], $s[3]
	pxor	$s[0], $t[3]
	pxor	$s[1], $t[2]
	pxor	$s[2], $t[1]
	pxor	$s[3], $t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	$t[3], $s[0]
	pand	$t[1], $t[3]
	pxor	$t[2], $s[0]

	movdqa	$t[0], $s[2]
	movdqa	$s[0], $s[3]
	pxor	$t[3], $s[2]
	pand	$s[2], $s[3]

	movdqa	$t[1], $s[1]
	pxor	$t[2], $s[3]
	pxor	$t[0], $s[1]

	pxor	$t[2], $t[3]

	pand	$t[3], $s[1]

	movdqa	$s[2], $t[2]
	pxor	$t[0], $s[1]

	pxor	$s[1], $t[2]
	pxor	$s[1], $t[1]

	pand	$t[0], $t[2]

	pxor	$t[2], $s[2]
	pxor	$t[2], $t[1]

	pand	$s[3], $s[2]

	pxor	$s[0], $s[2]
END_ASM
    # output in s3, s2, s1, t1

    # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

    # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
    # The intermediate result (s3, s2, s1, t1) is passed in the y0-y3
    # positions; the remaining four registers become the callee's scratch.
    Mul_GF16_2(@x, @s[3, 2, 1], $t[1], $s[0], @t[0, 2, 3]);

    ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
424d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
425d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# AES linear components
426d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# ShiftRows(@x[0..7], $mask)
#
# Appends to $code the instructions that
#   1. XOR the eight 16-byte memory operands at 0x00..0x70($key) into the
#      eight registers named in @x,
#   2. apply `pshufb $mask` to each of those registers, and
#   3. advance $key by 0x80.
# So despite its name the emitted code also mixes in key material
# (presumably the bit-sliced round key -- the key layout is not visible in
# this part of the file; TODO confirm against the key-conversion routine).
#
#   @x    - names of the eight state registers
#   $mask - name of the register holding the pshufb control mask; the
#           callers visible here load it from the constants table
#           (.LSR or .LSRM0)
#
# Callers use this sub only for its effect on $code.
427d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub ShiftRows {
428d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @x=@_[0..7];
# A bare pop inside a sub works on @_, so this takes the final argument.
429d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $mask=pop;
430d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
431d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00($key),@x[0]
432d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10($key),@x[1]
433d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20($key),@x[2]
434d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30($key),@x[3]
435d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	$mask,@x[0]
436d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	$mask,@x[1]
437d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40($key),@x[4]
438d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x50($key),@x[5]
439d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	$mask,@x[2]
440d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	$mask,@x[3]
441d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x60($key),@x[6]
442d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x70($key),@x[7]
443d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	$mask,@x[4]
444d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	$mask,@x[5]
445d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	$mask,@x[6]
446d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	$mask,@x[7]
447d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($key),$key
448d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
449d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
450d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# MixColumns(@x[0..7], @t[0..7] [, $inv])
#
# Appends the MixColumns step for the bit-sliced state to $code.  The
# emitted code is built from dword rotations of each register (pshufd with
# 0x93 and 0x4E, annotated below as "<<< 32" and "<<< 64") combined with
# XORs.
#
#   @x   - names of the eight state registers; the result is left in these
#          registers (in the permuted order described by the comment on
#          the first line of the body)
#   @t   - names of eight scratch registers; all are overwritten
#   $inv - optional flag.  When true a different tail is emitted in which
#          the values that the normal tail leaves in @x[2]/@x[3] and in
#          @x[4]/@x[6] end up swapped.  InvMixColumns passes 1 here.
#
# Callers use this sub only for its effect on $code.
451d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub MixColumns {
452d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# modified to emit output in order suitable for feeding back to aesenc[last]
453d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @x=@_[0..7];
454d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @t=@_[8..15];
# NOTE(review): @_[16] is a one-element slice; in scalar assignment it
# yields the same value as $_[16], which would be the conventional spelling.
455d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $inv=@_[16];	# optional
456d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
457d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
458d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[1], @t[1]
459d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
460d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[2], @t[2]
461d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[1], @x[1]
462d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[3], @t[3]
463d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[2], @x[2]
464d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[4], @t[4]
465d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[3], @x[3]
466d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[5], @t[5]
467d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[4], @x[4]
468d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[6], @t[6]
469d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[5], @x[5]
470d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[7], @t[7]
471d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[6], @x[6]
472d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[7], @x[7]
473d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
474d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[0], @t[1]
475d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[7], @t[0]
476d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[7], @t[1]
477d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
478d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[1], @t[2]
479d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufd	\$0x4E, @x[1], @x[1]
480d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[4], @t[5]
481d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[0], @x[0]
482d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[5], @t[6]
483d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[1], @x[1]
484d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[3], @t[4]
485d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufd	\$0x4E, @x[4], @t[0]
486d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[6], @t[7]
487d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufd	\$0x4E, @x[5], @t[1]
488d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[2], @t[3]
489d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufd	\$0x4E, @x[3], @x[4]
490d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[7], @t[3]
491d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufd	\$0x4E, @x[7], @x[5]
492d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[7], @t[4]
493d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufd	\$0x4E, @x[6], @x[3]
494d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[4], @t[0]
495d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufd	\$0x4E, @x[2], @x[6]
496d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @t[1]
497d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Tail for the forward direction (no $inv flag).
498d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if (!$inv);
499d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[3], @x[4]
500d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @x[5]
501d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @x[3]
502d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	@t[0], @x[2]
503d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @x[6]
504d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	@t[1], @x[7]
505d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Tail used when $inv is set: same values as above, but the results for
# positions 2/3 and 4/6 are written to each other's registers.
506d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($inv);
507d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[4], @t[3]
508d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @x[5]
509d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[3], @t[6]
510d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	@t[0], @x[3]
511d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @x[6]
512d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	@t[6], @x[2]
513d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	@t[1], @x[7]
514d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	@x[6], @x[4]
515d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	@t[3], @x[6]
516d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
517d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
518d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# InvMixColumns_orig(@x[0..7], @t[0..7])
#
# Appends a direct implementation of the inverse MixColumns step to $code,
# organised as four sections labelled in the emitted assembly as the
# multiplications by 0x0e, 0x0b, 0x0d and 0x09.
#
#   @x - names of the eight state registers (overwritten)
#   @t - names of eight further registers; they hold rotated copies of the
#        input while the sections run and the final result at the end
#
# The last eight emitted instructions copy @t[0..7] into @XMM[0..7].  @XMM
# is a file-level array, not a parameter, so the result always lands in
# those registers regardless of what was passed as @x.
#
# NOTE(review): no call to this sub appears in this part of the file; the
# "_orig" suffix suggests InvMixColumns replaced it -- confirm before
# relying on it.
519d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub InvMixColumns_orig {
520d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @x=@_[0..7];
521d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @t=@_[8..15];
522d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
523d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
524d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	# multiplication by 0x0e
525d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[7], @t[7]
526d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@x[2], @t[2]
527d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[5], @x[7]		# 7 5
528d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[5], @x[2]		# 2 5
529d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[0], @t[0]
530d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@x[5], @t[5]
531d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[0], @x[5]		# 5 0		[1]
532d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[1], @x[0]		# 0 1
533d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[1], @t[1]
534d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[2], @x[1]		# 1 25
535d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[6], @x[0]		# 01 6		[2]
536d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[3], @x[1]		# 125 3		[4]
537d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[3], @t[3]
538d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[0], @x[2]		# 25 016	[3]
539d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[7], @x[3]		# 3 75
540d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[6], @x[7]		# 75 6		[0]
541d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @x[6], @t[6]
542d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@x[4], @t[4]
543d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[4], @x[6]		# 6 4
544d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[3], @x[4]		# 4 375		[6]
545d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[7], @x[3]		# 375 756=36
546d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @x[6]		# 64 5		[7]
547d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @x[3]		# 36 2
548d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[4], @x[3]		# 362 4		[5]
549d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[5], @t[5]
550d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Rename the registers so that $y[i] is the one tagged "[i]" in the trailing
# comments of the 0x0e section above.
551d9e397b599b13d642138480a28c14db7a136bf0Adam Langley					my @y = @x[7,5,0,2,1,3,4,6];
552d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
553d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	# multiplication by 0x0b
554d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[0], @y[1]
555d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[0], @y[0]
556d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[1], @y[1]
557d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[2], @t[2]
558d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @y[0]
559d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @y[1]
560d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @y[0]
561d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[4], @t[4]
562d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @t[7]		# clobber t[7]
563d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[0], @y[1]
564d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
565d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[0], @y[3]
566d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[0], @t[0]
567d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[1], @y[2]
568d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[1], @y[4]
569d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @y[2]
570d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[1], @t[1]
571d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @y[3]
572d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @y[5]
573d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @y[2]
574d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[2], @t[2]
575d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[3], @y[3]
576d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[3], @y[6]
577d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[3], @y[4]
578d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[3], @t[3]
579d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[4], @y[7]
580d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[4], @y[5]
581d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @y[7]
582d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @y[3]
583d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[4], @y[4]
584d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @t[7]		# clobber t[7] even more
585d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
586d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @y[5]
587d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[4], @t[4]
588d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @y[6]
589d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @y[4]
590d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
591d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @t[7]
592d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[5], @t[5]
593d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @t[7]		# restore t[7]
594d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
595d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	# multiplication by 0x0d
596d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[7], @y[4]
597d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[4], @y[7]
598d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[6], @t[6]
599d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[0], @y[2]
600d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @y[7]
601d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @y[2]
602d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[7], @t[7]
603d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
604d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[1], @y[3]
605d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[1], @y[1]
606d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[0], @y[0]
607d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[0], @y[3]
608d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @y[1]
609d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @y[0]
610d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @y[1]
611d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[0], @t[0]
612d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @y[0]
613d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[1], @y[3]
614d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[1], @y[4]
615d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[1], @t[1]
616d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
617d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @y[7]
618d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @y[4]
619d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @y[5]
620d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[2], @t[2]
621d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @y[2]
622d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[3], @t[6]		# clobber t[6]
623d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[7], @y[4]
624d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @y[3]
625d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
626d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @y[6]
627d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @y[5]
628d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[4], @y[6]
629d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[4], @t[4]
630d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @y[5]
631d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @y[6]
632d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[3], @t[6]		# restore t[6]
633d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
634d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[5], @t[5]
635d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[6], @t[6]
636d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[7], @t[7]
637d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x93, @t[3], @t[3]
638d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
639d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	# multiplication by 0x09
640d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[1], @y[4]
641d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[1], @t[1]		# t[1]=y[1]
642d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @t[0]		# clobber t[0]
643d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[5], @t[1]
644d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[0], @y[3]
645d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[0], @t[0]		# t[0]=y[0]
646d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @t[1]
647d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @t[6]		# clobber t[6]
648d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[1], @y[4]
649d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[4], @y[7]
650d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[4], @t[4]		# t[4]=y[4]
651d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[3], @y[6]
652d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[3], @t[3]		# t[3]=y[3]
653d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[2], @y[5]
654d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[2], @t[2]		# t[2]=y[2]
655d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[7], @t[3]
656d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[5], @t[5]		# t[5]=y[5]
657d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @t[2]
658d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@t[6], @t[5]
659d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[6], @t[6]		# t[6]=y[6]
660d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@y[7], @t[7]		# t[7]=y[7]
661d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
662d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@t[0],@XMM[0]
663d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@t[1],@XMM[1]
664d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@t[2],@XMM[2]
665d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@t[3],@XMM[3]
666d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@t[4],@XMM[4]
667d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@t[5],@XMM[5]
668d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@t[6],@XMM[6]
669d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@t[7],@XMM[7]
670d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
671d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
672d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# InvMixColumns(@x[0..7], @t[0..7])
#
# Appends the inverse MixColumns step to $code in two stages, following the
# matrix factorisation shown in the comment inside the body:
#   1. a pre-multiplication stage (labelled "0x05-0x00-0x04-0x00" in the
#      emitted assembly) built from pshufd 0x4E rotations and XORs, then
#   2. the forward MixColumns code, requested with its $inv flag set.
#
#   @x - names of the eight state registers; the result is left in them
#   @t - names of eight scratch registers; all are overwritten
#
# Callers use this sub only for its effect on $code.
673d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub InvMixColumns {
674d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @x=@_[0..7];
675d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @t=@_[8..15];
676d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
677d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# Thanks to Jussi Kivilinna for providing pointer to
678d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
679d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
680d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
681d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
682d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
683d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
684d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
685d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	# multiplication by 0x05-0x00-0x04-0x00
686d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x4E, @x[0], @t[0]
687d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x4E, @x[6], @t[6]
688d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[0], @t[0]
689d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x4E, @x[7], @t[7]
690d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[6], @t[6]
691d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x4E, @x[1], @t[1]
692d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[7], @t[7]
693d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x4E, @x[2], @t[2]
694d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[1], @t[1]
695d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x4E, @x[3], @t[3]
696d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[2], @t[2]
697d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[6], @x[0]
698d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[6], @x[1]
699d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x4E, @x[4], @t[4]
700d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[3], @t[3]
701d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[0], @x[2]
702d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[1], @x[3]
703d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x4E, @x[5], @t[5]
704d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[4], @t[4]
705d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[7], @x[1]
706d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[2], @x[4]
707d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@x[5], @t[5]
708d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
709d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[7], @x[2]
710d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[6], @x[3]
711d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[6], @x[4]
712d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[3], @x[5]
713d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[4], @x[6]
714d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[7], @x[4]
715d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[7], @x[5]
716d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	@t[5], @x[7]
717d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Second factor: reuse the forward MixColumns generator; the trailing 1 is
# its $inv flag and selects the alternative output register assignment.
718d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
719d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
720d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# aesenc(@b[0..7], @t[0..7])
#
# Appends one full middle round to $code: loads the pshufb mask found at
# 0x30($const) (.LSR) into the first scratch register, then emits the
# ShiftRows, Sbox and MixColumns steps in that order.  MixColumns is handed
# the state registers in the order 0,1,4,6,3,7,2,5.
#
#   @b - names of the eight state registers
#   @t - names of eight scratch registers
#
# Marked "not used" by the original author.
721d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub aesenc {				# not used
722d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @b=@_[0..7];
723d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @t=@_[8..15];
724d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
725d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x30($const),@t[0]	# .LSR
726d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
727d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&ShiftRows	(@b,@t[0]);
728d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&Sbox		(@b,@t);
729d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
730d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
731d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# aesenclast(@b[0..7], @t[0..7])
#
# Appends the final round to $code: loads the pshufb mask found at
# 0x40($const) (.LSRM0) into the first scratch register, emits the
# ShiftRows and Sbox steps, and then XORs the eight 16-byte operands at
# 0x00..0x70($key) into the state registers taken in the order
# 0,1,4,6,3,7,2,5.  There is no MixColumns step.
#
#   @b - names of the eight state registers
#   @t - names of eight scratch registers
#
# Marked "not used" by the original author.
#
# NOTE(review): the heredoc statement that emits the pxor sequence has no
# terminating semicolon.  Perl accepts that because it is the last statement
# in the block, but it differs from every other heredoc in this file.
732d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub aesenclast {			# not used
733d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @b=@_[0..7];
734d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @t=@_[8..15];
735d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
736d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x40($const),@t[0]	# .LSRM0
737d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
738d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&ShiftRows	(@b,@t[0]);
739d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&Sbox		(@b,@t);
740d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___
741d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00($key),@b[0]
742d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10($key),@b[1]
743d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20($key),@b[4]
744d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30($key),@b[6]
745d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40($key),@b[3]
746d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x50($key),@b[7]
747d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x60($key),@b[2]
748d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x70($key),@b[5]
749d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
750d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
751d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# swapmove($a, $b, $n, $mask, $t)
#
# Appends a seven-instruction "swap-move" step to $code.  Writing
#     d = (($b >> $n) ^ $a) & $mask
# the emitted code leaves ($a ^ d) in $a and ($b ^ (d << $n)) in $b, i.e.
# the bits of $a selected by $mask are exchanged with the bits of $b
# selected by ($mask << $n).  The shifts are psrlq/psllq, so they act on
# each 64-bit half of the register separately.
#
#   $a, $b - names of the two registers to operate on (both modified)
#   $n     - shift distance, emitted as an immediate
#   $mask  - name of the register holding the bit mask (only read)
#   $t     - name of a scratch register; receives the original value of $b
#
# The lexical $a/$b declared here shadow Perl's sort variables inside this
# sub only.
752d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub swapmove {
753d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($a,$b,$n,$mask,$t)=@_;
754d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
755d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	$b,$t
756d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psrlq	\$$n,$b
757d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor  	$a,$b
758d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	$mask,$b
759d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$b,$a
760d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psllq	\$$n,$b
761d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$t,$b
762d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
763d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
# swapmove2x($a0, $b0, $a1, $b1, $n, $mask, $t0, $t1)
#
# Appends two independent swap-move steps to $code with their instructions
# interleaved: one on the pair ($a0,$b0) using scratch $t0 and one on the
# pair ($a1,$b1) using scratch $t1 (the second stream is the one indented
# by an extra space).  Both use the same shift distance $n and the same
# $mask register.  For each pair, with
#     d = ((b >> $n) ^ a) & $mask
# the emitted code leaves (a ^ d) in a and (b ^ (d << $n)) in b; shifts are
# per 64-bit half (psrlq/psllq).
#
#   $a0,$b0,$a1,$b1 - names of the registers to operate on (all modified)
#   $n              - shift distance, emitted as an immediate
#   $mask           - name of the register holding the bit mask (only read)
#   $t0,$t1         - names of two scratch registers (overwritten)
764d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub swapmove2x {
765d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
766d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
767d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	$b0,$t0
768d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psrlq	\$$n,$b0
769d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	$b1,$t1
770d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 psrlq	\$$n,$b1
771d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor  	$a0,$b0
772d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor  	$a1,$b1
773d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	$mask,$b0
774d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pand	$mask,$b1
775d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$b0,$a0
776d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psllq	\$$n,$b0
777d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	$b1,$a1
778d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 psllq	\$$n,$b1
779d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$t0,$b0
780d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	$t1,$b1
781d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
782d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
783d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# bitslice(@x[0..7], $t0, $t1, $t2, $t3)
#
# Appends to $code the three-stage swap-move network that transposes bits
# between the eight registers: stage 1 swaps at distance 1 between
# neighbouring registers, stage 2 at distance 2 between registers two
# apart, stage 3 at distance 4 between registers four apart.  The stage
# masks are read from the constants table at 0x00, 0x10 and 0x20 off
# $const (.LBS0, .LBS1, .LBS2).
#
#   @_[0..7]  - names of the eight data registers.  They are processed in
#               reversed order (note the reverse() below), so "register 0"
#               in the stage descriptions is the LAST one passed in.
#   @_[8..11] - names of four scratch registers: $t0/$t1 hold masks,
#               $t2/$t3 are the temporaries handed to swapmove2x
784d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub bitslice {
785d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @x=reverse(@_[0..7]);
786d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($t0,$t1,$t2,$t3)=@_[8..11];
787d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
788d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x00($const),$t0	# .LBS0
789d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x10($const),$t1	# .LBS1
790d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Stage 1: distance 1, mask .LBS0 (in $t0), pairs (0,1) (2,3) (4,5) (6,7).
791d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
792d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
# $t0 is free again after stage 1, so it is reloaded with the stage-3 mask
# while stage 2 runs off $t1.
793d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
794d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x20($const),$t0	# .LBS2
795d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Stage 2: distance 2, mask .LBS1 (in $t1), pairs (0,2) (1,3) (4,6) (5,7).
796d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
797d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
798d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Stage 3: distance 4, mask .LBS2 (in $t0), pairs (0,4) (1,5) (2,6) (3,7).
799d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
800d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
801d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
802d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Start of the emitted assembly: section header, the two external AES
# symbols, and the internal routine _bsaes_encrypt8.
#
# As emitted here, _bsaes_encrypt8 works on the eight registers
# @XMM[0..7], reads key material through $key, uses $const as the pointer
# to the constants table that starts at .LBS0, and counts rounds down in
# $rounds.  (The register assignments for $key/$const/$rounds are made
# outside this part of the file.)
#
# Entry sequence: XOR the 16-byte round-0 key at ($key) into all eight
# blocks, permute each block with the .LM0SR mask, and step $key past that
# key.  The label _bsaes_encrypt8_bitslice marks a second entry point that
# skips this sequence.
803d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
804d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.text
805d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
806d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.extern	asm_AES_encrypt
807d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.extern	asm_AES_decrypt
808d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
809d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	_bsaes_encrypt8,\@abi-omnipotent
810d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	64
811d9e397b599b13d642138480a28c14db7a136bf0Adam Langley_bsaes_encrypt8:
812d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	.LBS0(%rip), $const	# constants table
813d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
814d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	($key), @XMM[9]		# round 0 key
815d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x10($key), $key
816d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x50($const), @XMM[8]	# .LM0SR
817d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[0]	# xor with round0 key
818d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[1]
819d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[2]
820d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[3]
821d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[0]
822d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[1]
823d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[4]
824d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[5]
825d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[2]
826d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[3]
827d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[6]
828d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[7]
829d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[4]
830d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[5]
831d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[6]
832d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[7]
833d9e397b599b13d642138480a28c14db7a136bf0Adam Langley_bsaes_encrypt8_bitslice:
834d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Convert the eight blocks to bit-sliced form; @XMM[8..11] are scratch.
835d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&bitslice	(@XMM[0..7, 8..11]);
# The first pass jumps straight to the Sbox: the key XOR and byte shuffle
# that ShiftRows would emit were already done by the entry sequence.
836d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
837d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	dec	$rounds
838d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lenc_sbox
839d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
840d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lenc_loop:
841d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Loop body.  @XMM[8] holds the pshufb mask selected at the bottom of the
# previous iteration.
842d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&ShiftRows	(@XMM[0..7, 8]);
843d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=".Lenc_sbox:\n";
844d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&Sbox		(@XMM[0..7, 8..15]);
# Leave the loop once the counter goes negative; otherwise the flags from
# this dec are also what the jnz after MixColumns tests.
845d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
846d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	dec	$rounds
847d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jl	.Lenc_done
848d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# MixColumns emits only pshufd/pxor/movdqa, which leave EFLAGS untouched,
# so the zero flag from the dec above is still valid afterwards.  The
# registers are passed in the order 0,1,4,6,3,7,2,5.
849d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
# Pick the mask for the next ShiftRows: .LSR while the counter is non-zero,
# .LSRM0 for the pass made after it has reached zero (the last one).
850d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
851d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x30($const), @XMM[8]	# .LSR
852d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Lenc_loop
853d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x40($const), @XMM[8]	# .LSRM0
854d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lenc_loop
855d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
856d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lenc_done:
857d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Apply the bitslice network again, this time to the registers in the
# order given below (presumably this converts the state back out of
# bit-sliced form -- TODO confirm).
858d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
859d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
860d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
861d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	($key), @XMM[8]		# last round key
862d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[4]
863d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[6]
864d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[3]
865d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[7]
866d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[2]
867d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[5]
868d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[0]
869d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[1]
870d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
871d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	_bsaes_encrypt8,.-_bsaes_encrypt8
872d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# _bsaes_decrypt8: internal routine that decrypts eight 16-byte blocks at once.
# The blocks arrive in the eight state registers XMM[0] .. XMM[7]. The key
# register points at the key schedule and the rounds register holds the round
# count; both are modified on the way through.
# On return the decrypted blocks sit in XMM[0], [1], [6], [4], [2], [7], [3],
# [5], in that order, which is how the callers in this file store them.
873d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	_bsaes_decrypt8,\@abi-omnipotent
874d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	64
875d9e397b599b13d642138480a28c14db7a136bf0Adam Langley_bsaes_decrypt8:
876d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	.LBS0(%rip), $const	# constants table
877d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
878d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	($key), @XMM[9]		# round 0 key
879d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x10($key), $key
880d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
# Add the round 0 key to every block and shuffle the bytes of every block with
# the mask loaded above. The two instruction streams are interleaved; the
# pshufb lines carry an extra leading space to set them apart.
881d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[0]	# xor with round0 key
882d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[1]
883d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[2]
884d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[3]
885d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[0]
886d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[1]
887d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[4]
888d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[5]
889d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[2]
890d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[3]
891d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[6]
892d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[7]
893d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[4]
894d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[5]
895d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[6]
896d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[7]
897d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
	# Emit the bitslice transform (defined elsewhere in this file) over the
	# eight state registers; XMM[8..11] are the extra registers it is given.
898d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&bitslice	(@XMM[0..7, 8..11]);
899d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
# Count the first round, then enter the loop at the S-box step so that the
# first pass skips the ShiftRows code placed at the top of the loop.
900d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	dec	$rounds
901d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Ldec_sbox
902d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
903d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Ldec_loop:
904d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
	# XMM[8] is passed as the last argument; it holds the mask loaded at the
	# bottom of the previous pass (.LISR, or .LISRM0 before the final pass).
905d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&ShiftRows	(@XMM[0..7, 8]);
	# Entry point used by the first pass (see the jmp above).
906d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=".Ldec_sbox:\n";
907d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&InvSbox	(@XMM[0..7, 8..15]);
908d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
# Count the round just processed; a negative result means the final round is
# done, so leave the loop before the InvMixColumns code.
909d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	dec	$rounds
910d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jl	.Ldec_done
911d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
	# The state registers are listed in the permuted order 0,1,6,4,2,7,3,5 --
	# presumably the order in which InvSbox leaves its outputs; confirm
	# against InvSbox, which is defined outside this block.
912d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
913d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
# The jnz below tests the flags left by the dec above: movdqa does not write
# flags, and this relies on the InvMixColumns code not writing them either.
# When the counter has reached zero, the .LISRM0 mask is substituted for .LISR
# for the one remaining pass through the loop.
914d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	-0x10($const), @XMM[8]	# .LISR
915d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Ldec_loop
916d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	-0x20($const), @XMM[8]	# .LISRM0
917d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Ldec_loop
918d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
919d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Ldec_done:
920d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
	# Second bitslice pass, over the permuted register order (presumably
	# converting back to the byte-wise layout -- confirm against bitslice).
921d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
922d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
# Add the last round key to all eight blocks. The key register is read as it
# stands here; apart from the single 16-byte step at the top, any advancing of
# it happens in helper-generated code that is not visible in this block.
923d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	($key), @XMM[8]		# last round key
924d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[6]
925d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[4]
926d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[2]
927d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[7]
928d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[3]
929d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[5]
930d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[0]
931d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[1]
932d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
933d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	_bsaes_decrypt8,.-_bsaes_decrypt8
934d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
935d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
936d9e397b599b13d642138480a28c14db7a136bf0Adam Langley{
# Register bindings for the key-conversion code in this scope:
#   $out    - %rax,  where the converted key schedule is written
#   $inp    - %rcx,  the round keys being read
#   $rounds - %r10d, round counter
#   $const  - %r11,  pointer to the constant table
937d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
938d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# bitslice_key(x0..x7, bs0, bs1, bs2, t2, t3)
#
# Appends to $code a reduced form of the bitslice transform. The first eight
# arguments are register names and are used in reversed order (@x); $bs0,
# $bs1 and $bs2 are the registers passed to the swapmove helpers as masks for
# the shift-by-1, shift-by-2 and shift-by-4 steps, and $t2/$t3 are passed as
# temporaries. swapmove and swapmove2x are defined elsewhere in this file.
#
# Compared with a full transform, some swapmove calls are replaced by plain
# register copies of results already computed (the replaced calls are kept
# as commented-out lines). NOTE(review): this is presumably valid only
# when the eight inputs start out identical -- confirm with the caller.
939d9e397b599b13d642138480a28c14db7a136bf0Adam Langleysub bitslice_key {
940d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy @x=reverse(@_[0..7]);
941d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
942d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Shift-by-1 step on the first register pair only.
943d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
	# Instead of a second swapmove on x[2],x[3], copy x[0] into x[2] and
	# x[1] into x[3]. The first heredoc line is emitted as an assembler
	# comment recording the call that was replaced.
944d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
945d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
946d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@x[0], @x[2]
947d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@x[1], @x[3]
948d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
949d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
950d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Shift-by-2 step on the lower four registers.
951d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
	# The upper four registers are filled by copying the lower four
	# (x[0]->x[4], x[2]->x[6], x[1]->x[5], x[3]->x[7]) rather than by
	# repeating the shift-by-2 step on them.
952d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
953d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
954d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@x[0], @x[4]
955d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@x[2], @x[6]
956d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@x[1], @x[5]
957d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@x[3], @x[7]
958d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
	# Shift-by-4 step, emitted in full across all eight registers.
959d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
960d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
961d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
962d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# _bsaes_key_convert: internal routine that rewrites an expanded AES key
# schedule into the layout read by the bit-sliced round code.
#   in:  $inp (%rcx)      round keys, read with unaligned loads
#        $out (%rax)      destination, written with aligned (movdqa) stores
#        $rounds (%r10d)  number of rounds
#   out: %rax   points just past the last converted key, which is where the
#               callers in this file store the last round key
#        %xmm6  last round key, loaded but neither converted nor stored
#        %xmm7  the .L63 constant
# Layout written: 16 bytes of unmodified round 0 key, then 128 bytes (eight
# 16-byte planes, one per key bit) for each of the rounds-1 inner round keys.
# Modifies %rcx, %r10d, %r11 and %xmm0-%xmm15.
963d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
964d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	_bsaes_key_convert,\@abi-omnipotent
965d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
966d9e397b599b13d642138480a28c14db7a136bf0Adam Langley_bsaes_key_convert:
967d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	.Lmasks(%rip), $const
968d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	($inp), %xmm7		# load round 0 key
969d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x10($inp), $inp
970d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x00($const), %xmm0	# 0x01...
971d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x10($const), %xmm1	# 0x02...
972d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x20($const), %xmm2	# 0x04...
973d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x30($const), %xmm3	# 0x08...
974d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x40($const), %xmm4	# .LM0
# Comparing a register with itself sets every bit, giving the all-ones value
# used below as the xor operand of the "pnot" steps.
975d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpeqd	%xmm5, %xmm5		# .LNOT
976d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
977d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	($inp), %xmm6		# load round 1 key
978d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm7, ($out)		# save round 0 key
979d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x10($out), $out
# The loop converts the inner round keys only, hence one iteration fewer than
# the round count.
980d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	dec	$rounds
981d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lkey_loop
982d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
983d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lkey_loop:
# Each iteration converts the round key held in xmm6, after permuting its
# bytes with the .LM0 mask.
984d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	%xmm4, %xmm6		# .LM0
985d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# For each of the eight bit positions: copy the single-bit mask, AND it with
# the key, then compare the result with the mask. That leaves 0xff in every
# byte whose bit is set and 0x00 elsewhere. Bits 0-3 use xmm0-xmm3 as loaded;
# the same registers are shifted left by 4 to serve bits 4-7 and shifted back
# before the next iteration. Results go to xmm8 (bit 0) through xmm15 (bit 7).
986d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0,	%xmm8
987d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm1,	%xmm9
988d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
989d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	%xmm6,	%xmm8
990d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	%xmm6,	%xmm9
991d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm2,	%xmm10
992d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpeqb	%xmm0,	%xmm8
993d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psllq	\$4,	%xmm0		# 0x10...
994d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm3,	%xmm11
995d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpeqb	%xmm1,	%xmm9
996d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psllq	\$4,	%xmm1		# 0x20...
997d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
998d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	%xmm6,	%xmm10
999d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	%xmm6,	%xmm11
1000d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0,	%xmm12
1001d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpeqb	%xmm2,	%xmm10
1002d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psllq	\$4,	%xmm2		# 0x40...
1003d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm1,	%xmm13
1004d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpeqb	%xmm3,	%xmm11
1005d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psllq	\$4,	%xmm3		# 0x80...
1006d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1007d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm2,	%xmm14
1008d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm3,	%xmm15
# The planes for bits 0 and 1 are inverted here, those for bits 5 and 6
# further down. Bits 0, 1, 5 and 6 are exactly the bits set in 0x63 (compare
# the .L63 constant handed back in xmm7); presumably this folds that constant
# into the stored keys -- confirm against the round code.
1009d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	%xmm5,	%xmm8		# "pnot"
1010d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pxor	%xmm5,	%xmm9
1011d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1012d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	%xmm6,	%xmm12
1013d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	%xmm6,	%xmm13
1014d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
1015d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpeqb	%xmm0,	%xmm12
1016d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psrlq	\$4,	%xmm0		# 0x01...
1017d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	%xmm9, 0x10($out)
1018d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpeqb	%xmm1,	%xmm13
1019d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psrlq	\$4,	%xmm1		# 0x02...
1020d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 lea	0x10($inp), $inp
1021d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1022d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	%xmm6,	%xmm14
1023d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	%xmm6,	%xmm15
1024d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	%xmm10, 0x20($out)
1025d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpeqb	%xmm2,	%xmm14
1026d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psrlq	\$4,	%xmm2		# 0x04...
1027d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqa	%xmm11, 0x30($out)
1028d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpeqb	%xmm3,	%xmm15
1029d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	psrlq	\$4,	%xmm3		# 0x08...
# xmm6 is no longer needed for the current key once the last compare above has
# been issued, so it is reloaded for the next iteration.
1030d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 movdqu	($inp), %xmm6		# load next round key
1031d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1032d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm5, %xmm13		# "pnot"
1033d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm5, %xmm14
1034d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm12, 0x40($out)
1035d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm13, 0x50($out)
1036d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm14, 0x60($out)
1037d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm15, 0x70($out)
1038d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($out),$out
1039d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	dec	$rounds
1040d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Lkey_loop
1041d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# xmm6 now holds the last round key. It is handed back to the caller together
# with the .L63 constant in xmm7; the caller decides how to combine and store
# them.
1042d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x50($const), %xmm7	# .L63
1043d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#movdqa	%xmm6, ($out)		# don't save last round key
1044d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
1045d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	_bsaes_key_convert,.-_bsaes_key_convert
1046d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1047d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
1048d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Benchmark-only entry points. The leading 0 in the condition disables this
# block unconditionally, so none of the code below is emitted.
# $inp, $out, $len and $key here are the bindings of the enclosing file scope
# (declared outside this block); @XMM is the state register list used by
# _bsaes_encrypt8/_bsaes_decrypt8.
1049d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyif (0 && !$win64) {	# following four functions are unsupported interface
1050d9e397b599b13d642138480a28c14db7a136bf0Adam Langley			# used for benchmarking...
1051d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
# bsaes_enc_key_convert: reads the round count from offset 240 of the key
# argument, converts the schedule with _bsaes_key_convert, then stores the
# last round key XORed with the constant that routine returns in xmm7.
1052d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	bsaes_enc_key_convert
1053d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bsaes_enc_key_convert,\@function,2
1054d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1055d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybsaes_enc_key_convert:
1056d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	240($inp),%r10d		# pass rounds
1057d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$inp,%rcx		# pass key
1058d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$out,%rax		# pass key schedule
1059d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_key_convert
1060d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm6,%xmm7		# fix up last round key
1061d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm7,(%rax)		# save last round key
1062d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
1063d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1064d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# bsaes_encrypt_128: encrypts the input in batches of eight blocks (0x80
# bytes) with the round count fixed at 10. The length is counted in bytes. At
# least one batch is always processed and the loop only stops once the
# remaining length reaches zero or borrows, so the length is expected to be a
# non-zero multiple of 0x80. Outputs are stored from the registers in the
# order 0,1,4,6,3,7,2,5.
1065d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	bsaes_encrypt_128
1066d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bsaes_encrypt_128,\@function,4
1067d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1068d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybsaes_encrypt_128:
1069d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lenc128_loop:
1070d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[0]	# load input
1071d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[1]
1072d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[2]
1073d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[3]
1074d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[4]
1075d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[5]
1076d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[6]
1077d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x70($inp), @XMM[7]
1078d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$key, %rax		# pass the $key
1079d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($inp), $inp
1080d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	\$10,%r10d
1081d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1082d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
1083d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1084d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1085d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1086d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
1087d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
1088d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
1089d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
1090d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x60($out)
1091d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[5], 0x70($out)
1092d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($out), $out
1093d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$0x80,$len
1094d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ja	.Lenc128_loop
1095d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
1096d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1097d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# bsaes_dec_key_convert: same conversion as the enc variant, but the fix-up
# differs. The constant returned in xmm7 is XORed into the round 0 key at the
# start of the schedule, and the last round key is stored unmodified.
1098d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	bsaes_dec_key_convert
1099d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bsaes_dec_key_convert,\@function,2
1100d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1101d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybsaes_dec_key_convert:
1102d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	240($inp),%r10d		# pass rounds
1103d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$inp,%rcx		# pass key
1104d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$out,%rax		# pass key schedule
1105d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_key_convert
1106d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	($out),%xmm7		# fix up round 0 key
1107d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm6,(%rax)		# save last round key
1108d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm7,($out)
1109d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
1110d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1111d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# bsaes_decrypt_128: decryption counterpart of bsaes_encrypt_128, with the
# same batching and length handling. Outputs are stored from the registers in
# the order 0,1,6,4,2,7,3,5.
1112d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	bsaes_decrypt_128
1113d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bsaes_decrypt_128,\@function,4
1114d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1115d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybsaes_decrypt_128:
1116d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Ldec128_loop:
1117d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[0]	# load input
1118d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[1]
1119d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[2]
1120d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[3]
1121d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[4]
1122d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[5]
1123d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[6]
1124d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x70($inp), @XMM[7]
1125d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$key, %rax		# pass the $key
1126d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($inp), $inp
1127d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	\$10,%r10d
1128d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1129d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
1130d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1131d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1132d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1133d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
1134d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
1135d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x40($out)
1136d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
1137d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x60($out)
1138d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[5], 0x70($out)
1139d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($out), $out
1140d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$0x80,$len
1141d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ja	.Ldec128_loop
1142d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
1143d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1144d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1145d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
1146d9e397b599b13d642138480a28c14db7a136bf0Adam Langley{
1147d9e397b599b13d642138480a28c14db7a136bf0Adam Langley######################################################################
1148d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
1149d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# OpenSSL interface
1150d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
# Incoming argument registers of the public entry points: the first list is
# used when $win64 is set, the second otherwise (%rdi, %rsi, %rdx, %rcx, %r8,
# %r9 are the System V AMD64 integer argument registers).
# NOTE(review): Win64 passes only the first four integer arguments in
# registers, so $arg5/$arg6 (%r10/%r11d) are presumably loaded from the stack
# by the entry points that use them -- not visible here, confirm there.
1151d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1152d9e397b599b13d642138480a28c14db7a136bf0Adam Langley						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
# Working registers. The ECB entry point below pushes %r12-%r15 in its
# prologue and then copies $arg1..$arg4 into them, so the values survive the
# calls to the internal routines.
1153d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1154d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1155d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyif ($ecb) {
1156d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
1157d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	bsaes_ecb_encrypt_blocks
1158d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1159d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1160d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybsaes_ecb_encrypt_blocks:
1161d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax
1162d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_prologue:
1163d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbp
1164d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbx
1165d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r12
1166d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r13
1167d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r14
1168d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r15
1169d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0x48(%rsp),%rsp
1170d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1171d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
1172d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0xa0(%rsp), %rsp
1173d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm6, 0x40(%rsp)
1174d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm7, 0x50(%rsp)
1175d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm8, 0x60(%rsp)
1176d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm9, 0x70(%rsp)
1177d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm10, 0x80(%rsp)
1178d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm11, 0x90(%rsp)
1179d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm12, 0xa0(%rsp)
1180d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm13, 0xb0(%rsp)
1181d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm14, 0xc0(%rsp)
1182d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm15, 0xd0(%rsp)
1183d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_body:
1184d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1185d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
1186d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp,%rbp		# backup %rsp
1187d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	240($arg4),%eax		# rounds
1188d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg1,$inp		# backup arguments
1189d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg2,$out
1190d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg3,$len
1191d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg4,$key
1192d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$8,$arg3
1193d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lecb_enc_short
1194d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1195d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%eax,%ebx		# backup rounds
1196d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	shl	\$7,%rax		# 128 bytes per inner round key
1197d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1198d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	%rax,%rsp
1199d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp,%rax		# pass key schedule
1200d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$key,%rcx		# pass key
1201d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%ebx,%r10d		# pass rounds
1202d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_key_convert
1203d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm6,%xmm7		# fix up last round key
1204d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm7,(%rax)		# save last round key
1205d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1206d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$8,$len
1207d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_loop:
1208d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[0]	# load input
1209d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[1]
1210d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[2]
1211d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[3]
1212d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[4]
1213d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[5]
1214d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax		# pass key schedule
1215d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[6]
1216d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%ebx,%r10d		# pass rounds
1217d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x70($inp), @XMM[7]
1218d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($inp), $inp
1219d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1220d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
1221d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1222d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1223d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1224d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
1225d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
1226d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
1227d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
1228d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x60($out)
1229d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[5], 0x70($out)
1230d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($out), $out
1231d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$8,$len
1232d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnc	.Lecb_enc_loop
1233d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1234d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$8,$len
1235d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jz	.Lecb_enc_done
1236d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1237d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[0]	# load input
1238d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax		# pass key schedule
1239d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%ebx,%r10d		# pass rounds
1240d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$2,$len
1241d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lecb_enc_one
1242d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[1]
1243d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lecb_enc_two
1244d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[2]
1245d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$4,$len
1246d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lecb_enc_three
1247d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[3]
1248d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lecb_enc_four
1249d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[4]
1250d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$6,$len
1251d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lecb_enc_five
1252d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[5]
1253d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lecb_enc_six
1254d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[6]
1255d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
1256d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1257d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1258d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
1259d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
1260d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
1261d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
1262d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x60($out)
1263d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lecb_enc_done
1264d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1265d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_six:
1266d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
1267d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1268d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1269d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
1270d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
1271d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
1272d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
1273d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lecb_enc_done
1274d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1275d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_five:
1276d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
1277d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1278d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1279d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
1280d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
1281d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
1282d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lecb_enc_done
1283d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1284d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_four:
1285d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
1286d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1287d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1288d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
1289d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
1290d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lecb_enc_done
1291d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1292d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_three:
1293d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
1294d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1295d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1296d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
1297d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lecb_enc_done
1298d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1299d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_two:
1300d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
1301d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1302d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1303d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lecb_enc_done
1304d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1305d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_one:
1306d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
1307d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1308d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lecb_enc_done
1309d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1310d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_short:
1311d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($inp), $arg1
1312d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($out), $arg2
1313d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($key), $arg3
1314d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_encrypt
1315d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	16($inp), $inp
1316d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	16($out), $out
1317d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	dec	$len
1318d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Lecb_enc_short
1319d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1320d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_done:
1321d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	(%rsp),%rax
1322d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm0, %xmm0
1323d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_bzero:			# wipe key schedule [if any]
1324d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x00(%rax)
1325d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x10(%rax)
1326d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rax), %rax
1327d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	%rax, %rbp
1328d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lecb_enc_bzero
1329d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1330a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0x78(%rbp),%rax
1331d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1332d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
1333d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x40(%rbp), %xmm6
1334d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x50(%rbp), %xmm7
1335d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x60(%rbp), %xmm8
1336d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x70(%rbp), %xmm9
1337d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x80(%rbp), %xmm10
1338d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x90(%rbp), %xmm11
1339d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xa0(%rbp), %xmm12
1340d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xb0(%rbp), %xmm13
1341d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xc0(%rbp), %xmm14
1342d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xd0(%rbp), %xmm15
1343a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0xa0(%rax), %rax
1344a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan.Lecb_enc_tail:
1345d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1346d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
1347a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-48(%rax), %r15
1348a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-40(%rax), %r14
1349a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-32(%rax), %r13
1350a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-24(%rax), %r12
1351a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-16(%rax), %rbx
1352a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-8(%rax), %rbp
1353a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	(%rax), %rsp		# restore %rsp
1354d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_epilogue:
1355d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
1356d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1357d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# bsaes_ecb_decrypt_blocks -- ECB decryption built on the 8-way bit-sliced core.
#
# Arguments, in perlasm arg1..arg4 order as consumed below:
#   arg1  input pointer  (advanced 16 bytes per block)
#   arg2  output pointer (advanced 16 bytes per block)
#   arg3  block count    (compared against 8, decremented once per block)
#   arg4  key pointer    (the round count is read from offset 240)
#
# Flow:
#   * fewer than 8 blocks: each block is handed to asm_AES_decrypt and no
#     bit-sliced key schedule is built;
#   * otherwise the key is converted by _bsaes_key_convert into a schedule
#     carved out of the stack below %rbp, full groups of 8 blocks go through
#     _bsaes_decrypt8, and a 1..7 block remainder goes through one more
#     _bsaes_decrypt8 call that stores only the blocks that were loaded.
#   * on exit the stack area between %rsp and %rbp is zeroed and the
#     callee-saved registers pushed by the prologue are restored.
#
# NOTE(review): a block count of 0 takes the short path, which calls
# asm_AES_decrypt before the counter is tested; the following dec/jnz then
# sees a non-zero value. Confirm callers never pass 0.
.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp	# 0x48 bytes of scratch below the six pushes
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp	# room for xmm6-xmm15
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short		# fewer than 8 blocks: one block at a time

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp		# schedule occupies the stack from %rsp up to %rbp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len		# pre-decrement so the loop can branch on the borrow
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	# results are stored from XMM slots 0,1,6,4,2,7,3,5, in that order
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop		# no borrow: another full group of 8 remains

	add	\$8,$len		# undo the last subtraction: 0..7 blocks left
	jz	.Lecb_dec_done

	# remainder of 1..7 blocks: load only what exists, then branch to the
	# stub that stores exactly that many results
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two		# movdqu leaves the flags from cmp intact
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]	# seven blocks remain
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:			# one block per iteration via asm_AES_decrypt
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp		# AT&T order: flags reflect %rbp - %rax
	# Keep wiping while %rax is still below %rbp. The branch used to be jb,
	# which is taken only when %rbp is below %rax: with a key schedule on
	# the stack it stopped after the first 32 bytes, and on the short path
	# (where %rsp equals %rbp) it never stopped.
	ja	.Lecb_dec_bzero

	lea	0x78(%rbp),%rax		# 0x48 scratch + 6 pushes above %rbp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax	# skip the xmm save area as well
.Lecb_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15		# reload registers in reverse push order
	mov	-40(%rax), %r14
	mov	-32(%rax), %r13
	mov	-24(%rax), %r12
	mov	-16(%rax), %rbx
	mov	-8(%rax), %rbp
	lea	(%rax), %rsp		# restore %rsp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1559d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1560d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
1561d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
1562d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.extern	asm_AES_cbc_encrypt
1563d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	bsaes_cbc_encrypt
1564d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bsaes_cbc_encrypt,\@abi-omnipotent
1565d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1566d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybsaes_cbc_encrypt:
1567d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1568d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
1569d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	48(%rsp),$arg6		# pull direction flag
1570d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1571d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
1572d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$0,$arg6
1573d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jne	asm_AES_cbc_encrypt
1574d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$128,$arg3
1575d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	asm_AES_cbc_encrypt
1576d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1577d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax
1578d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_prologue:
1579d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbp
1580d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbx
1581d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r12
1582d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r13
1583d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r14
1584d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r15
1585d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0x48(%rsp), %rsp
1586d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1587d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
1588d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0xa0(%rsp),$arg5	# pull ivp
1589d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0xa0(%rsp), %rsp
1590d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm6, 0x40(%rsp)
1591d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm7, 0x50(%rsp)
1592d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm8, 0x60(%rsp)
1593d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm9, 0x70(%rsp)
1594d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm10, 0x80(%rsp)
1595d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm11, 0x90(%rsp)
1596d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm12, 0xa0(%rsp)
1597d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm13, 0xb0(%rsp)
1598d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm14, 0xc0(%rsp)
1599d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm15, 0xd0(%rsp)
1600d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_body:
1601d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1602d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
1603d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rbp		# backup %rsp
1604d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	240($arg4), %eax	# rounds
1605d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg1, $inp		# backup arguments
1606d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg2, $out
1607d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg3, $len
1608d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg4, $key
1609d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg5, %rbx
1610d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	shr	\$4, $len		# bytes to blocks
1611d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1612d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%eax, %edx		# rounds
1613d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	shl	\$7, %rax		# 128 bytes per inner round key
1614d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1615d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	%rax, %rsp
1616d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1617d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax		# pass key schedule
1618d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$key, %rcx		# pass key
1619d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
1620d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_key_convert
1621d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	(%rsp),%xmm7		# fix up 0 round key
1622d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm6,(%rax)		# save last round key
1623d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm7,(%rsp)
1624d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1625d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	(%rbx), @XMM[15]	# load IV
1626d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$8,$len
1627d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_loop:
1628d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[0]	# load input
1629d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[1]
1630d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[2]
1631d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[3]
1632d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[4]
1633d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[5]
1634d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax		# pass key schedule
1635d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[6]
1636d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx,%r10d		# pass rounds
1637d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x70($inp), @XMM[7]
1638d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1639d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1640d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
1641d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1642d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1643d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[8]	# re-load input
1644d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[9]
1645d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[1]
1646d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[10]
1647d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[6]
1648d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[11]
1649d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[10], @XMM[4]
1650d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[12]
1651d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[11], @XMM[2]
1652d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[13]
1653d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[12], @XMM[7]
1654d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[14]
1655d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[13], @XMM[3]
1656d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x70($inp), @XMM[15]	# IV
1657d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[14], @XMM[5]
1658d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1659d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($inp), $inp
1660d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1661d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
1662d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
1663d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x40($out)
1664d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
1665d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x60($out)
1666d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[5], 0x70($out)
1667d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($out), $out
1668d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$8,$len
1669d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnc	.Lcbc_dec_loop
1670d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1671d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$8,$len
1672d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jz	.Lcbc_dec_done
1673d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1674d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[0]	# load input
1675d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax		# pass key schedule
1676d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
1677d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$2,$len
1678d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lcbc_dec_one
1679d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[1]
1680d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lcbc_dec_two
1681d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[2]
1682d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$4,$len
1683d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lcbc_dec_three
1684d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[3]
1685d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lcbc_dec_four
1686d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[4]
1687d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$6,$len
1688d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lcbc_dec_five
1689d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[5]
1690d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lcbc_dec_six
1691d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[6]
1692d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1693d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
1694d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1695d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[8]	# re-load input
1696d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[9]
1697d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[1]
1698d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[10]
1699d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[6]
1700d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[11]
1701d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[10], @XMM[4]
1702d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[12]
1703d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[11], @XMM[2]
1704d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[13]
1705d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[12], @XMM[7]
1706d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[15]	# IV
1707d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[13], @XMM[3]
1708d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1709d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1710d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
1711d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
1712d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x40($out)
1713d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
1714d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x60($out)
1715d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lcbc_dec_done
1716d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1717d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_six:
1718d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1719d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
1720d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1721d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[8]	# re-load input
1722d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[9]
1723d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[1]
1724d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[10]
1725d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[6]
1726d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[11]
1727d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[10], @XMM[4]
1728d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[12]
1729d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[11], @XMM[2]
1730d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[15]	# IV
1731d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[12], @XMM[7]
1732d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1733d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1734d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
1735d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
1736d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x40($out)
1737d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
1738d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lcbc_dec_done
1739d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1740d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_five:
1741d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1742d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
1743d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1744d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[8]	# re-load input
1745d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[9]
1746d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[1]
1747d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[10]
1748d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[6]
1749d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[11]
1750d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[10], @XMM[4]
1751d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[15]	# IV
1752d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[11], @XMM[2]
1753d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1754d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1755d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
1756d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
1757d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x40($out)
1758d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lcbc_dec_done
1759d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1760d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_four:
1761d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1762d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
1763d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1764d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[8]	# re-load input
1765d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[9]
1766d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[1]
1767d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[10]
1768d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[6]
1769d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[15]	# IV
1770d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[10], @XMM[4]
1771d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1772d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1773d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
1774d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
1775d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lcbc_dec_done
1776d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1777d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_three:
1778d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1779d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
1780d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1781d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[8]	# re-load input
1782d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[9]
1783d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[1]
1784d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[15]	# IV
1785d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[6]
1786d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1787d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1788d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
1789d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lcbc_dec_done
1790d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1791d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_two:
1792d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1793d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
1794d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1795d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[8]	# re-load input
1796d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[15]	# IV
1797d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[1]
1798d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1799d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1800d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lcbc_dec_done
1801d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1802d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_one:
1803d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($inp), $arg1
1804d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg2	# buffer output
1805d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($key), $arg3
1806d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_decrypt		# doesn't touch %xmm
1807d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1808d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[15], ($out)	# write output
1809d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], @XMM[15]	# IV
1810d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1811d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_done:
1812d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[15], (%rbx)	# return IV
1813d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	(%rsp), %rax
1814d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm0, %xmm0
1815d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_bzero:			# wipe key schedule [if any]
1816d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x00(%rax)
1817d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x10(%rax)
1818d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rax), %rax
1819d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	%rax, %rbp
1820d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ja	.Lcbc_dec_bzero
1821d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1822a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0x78(%rbp),%rax
1823d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1824d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
1825d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x40(%rbp), %xmm6
1826d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x50(%rbp), %xmm7
1827d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x60(%rbp), %xmm8
1828d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x70(%rbp), %xmm9
1829d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x80(%rbp), %xmm10
1830d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x90(%rbp), %xmm11
1831d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xa0(%rbp), %xmm12
1832d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xb0(%rbp), %xmm13
1833d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xc0(%rbp), %xmm14
1834d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xd0(%rbp), %xmm15
1835a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0xa0(%rax), %rax
1836a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan.Lcbc_dec_tail:
1837d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1838d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
1839a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-48(%rax), %r15
1840a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-40(%rax), %r14
1841a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-32(%rax), %r13
1842a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-24(%rax), %r12
1843a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-16(%rax), %rbx
1844a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-8(%rax), %rbp
1845a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	(%rax), %rsp		# restore %rsp
1846d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_epilogue:
1847d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
1848d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1849d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# bsaes_ctr32_encrypt_blocks: CTR-mode entry point with a 32-bit counter.
# Arguments, as consumed below:
#   arg1 - input pointer             arg2 - output pointer
#   arg3 - number of 16-byte blocks  arg4 - key (round count read at offset 240)
#   arg5 - pointer to the 16-byte counter block
# Inputs of 8 or more blocks go through the bit-sliced 8-block path; shorter
# inputs are handled one block at a time through asm_AES_encrypt.
# Stack layout relative to %rbp: 0x20 holds the counter copy, 0x30 is scratch
# for single-block keystream, and on Win64 0x40-0xd0 hold saved xmm6-xmm15.
# NOTE(review): a block count of zero is not filtered out; it would fall into
# the short path, whose dec/jnz loop only stops at zero. Callers presumably
# never pass zero - confirm.
1850d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	bsaes_ctr32_encrypt_blocks
1851d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1852d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1853d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybsaes_ctr32_encrypt_blocks:
1854d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax
1855d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lctr_enc_prologue:
1856d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbp			# save callee-saved registers (6 x 8 bytes)
1857d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbx
1858d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r12
1859d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r13
1860d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r14
1861d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r15
1862d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0x48(%rsp), %rsp	# reserve 0x48 bytes of scratch
1863d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1864d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
1865d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0xa0(%rsp),$arg5	# pull ivp (fifth argument, read from the stack)
1866d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0xa0(%rsp), %rsp	# 0xa0 more bytes: room for ten xmm registers
1867d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm6, 0x40(%rsp)	# save xmm6-xmm15, restored before return
1868d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm7, 0x50(%rsp)
1869d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm8, 0x60(%rsp)
1870d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm9, 0x70(%rsp)
1871d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm10, 0x80(%rsp)
1872d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm11, 0x90(%rsp)
1873d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm12, 0xa0(%rsp)
1874d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm13, 0xb0(%rsp)
1875d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm14, 0xc0(%rsp)
1876d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm15, 0xd0(%rsp)
1877d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lctr_enc_body:
1878d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
1879d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
1880d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rbp		# backup %rsp
1881d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	($arg5), %xmm0		# load counter
1882d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	240($arg4), %eax	# rounds
1883d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg1, $inp		# backup arguments
1884d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg2, $out
1885d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg3, $len
1886d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg4, $key
1887d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1888d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$8, $arg3
1889d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lctr_enc_short		# fewer than 8 blocks: one block at a time
1890d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1891d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%eax, %ebx		# rounds
1892d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	shl	\$7, %rax		# 128 bytes per inner round key
1893d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1894d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	%rax, %rsp		# allocate the key schedule below %rbp
1895d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1896d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax		# pass key schedule
1897d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$key, %rcx		# pass key
1898d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%ebx, %r10d		# pass rounds
1899d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_key_convert
1900d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm6,%xmm7		# fix up last round key
1901d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm7,(%rax)		# save last round key
1902d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1903d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	(%rsp), @XMM[9]		# load round0 key
1904d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	.LADD1(%rip), %r11	# base for the .LADDn and swap-mask constants
1905d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1906d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1907d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1908d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufb	@XMM[8], @XMM[0]
1909d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1910d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lctr_enc_loop
1911d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1912d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lctr_enc_loop:
1913d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1914d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1915d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], @XMM[2]
1916d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddd	0x00(%r11), @XMM[1]	# .LADD1
1917d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], @XMM[3]
1918d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddd	0x10(%r11), @XMM[2]	# .LADD2
1919d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], @XMM[4]
1920d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddd	0x20(%r11), @XMM[3]	# .LADD3
1921d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], @XMM[5]
1922d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddd	0x30(%r11), @XMM[4]	# .LADD4
1923d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], @XMM[6]
1924d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddd	0x40(%r11), @XMM[5]	# .LADD5
1925d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], @XMM[7]
1926d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddd	0x50(%r11), @XMM[6]	# .LADD6
1927d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddd	0x60(%r11), @XMM[7]	# .LADD7
1928d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1929d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
1930d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	# to flip byte order in 32-bit counter
1931d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	(%rsp), @XMM[9]		# round 0 key
1932d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x10(%rsp), %rax	# pass key schedule
1933d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
1934d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[0]	# xor with round0 key
1935d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[1]
1936d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[2]
1937d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[3]
1938d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[0]
1939d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[1]
1940d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[4]
1941d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[5]
1942d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[2]
1943d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[3]
1944d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[6]
1945d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[7]
1946d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[4]
1947d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[5]
1948d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[6]
1949d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	 pshufb	@XMM[8], @XMM[7]
1950d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	.LBS0(%rip), %r11	# constants table
1951d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%ebx,%r10d		# pass rounds
1952d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1953d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8_bitslice
1954d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1955d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$8,$len		# CF: under 8 blocks left; ZF is used by the jnz at loop end
1956d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jc	.Lctr_enc_loop_done
1957d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1958d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[8]	# load input
1959d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[9]
1960d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[10]
1961d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[11]
1962d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[12]
1963d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[13]
1964d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[14]
1965d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x70($inp), @XMM[15]
1966d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($inp),$inp
1967d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[0], @XMM[8]	# keystream registers are consumed in order 0,1,4,6,3,7,2,5
1968d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x20(%rbp), @XMM[0]	# load counter
1969d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[1]
1970d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[8], 0x00($out)	# write output
1971d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[10], @XMM[4]
1972d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
1973d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[11], @XMM[6]
1974d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
1975d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[12], @XMM[3]
1976d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
1977d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[13], @XMM[7]
1978d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
1979d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[14], @XMM[2]
1980d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
1981d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[15], @XMM[5]
1982d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x60($out)
1983d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	.LADD1(%rip), %r11	# %r11 was repointed at .LBS0 for the call; restore it
1984d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[5], 0x70($out)
1985d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($out), $out
1986d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddd	0x70(%r11), @XMM[0]	# .LADD8
1987d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Lctr_enc_loop		# blocks remain (ZF still from the sub above)
1988d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
1989d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lctr_enc_done
1990d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
1991d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lctr_enc_loop_done:
1992d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$8, $len		# undo the sub: 1 to 7 blocks remain
1993d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x00($inp), @XMM[8]	# load input
1994d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8], @XMM[0]
1995d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
1996d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$2,$len		# each cmp serves two exits: jb now, je after one more block
1997d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lctr_enc_done
1998d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x10($inp), @XMM[9]
1999d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[9], @XMM[1]
2000d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2001d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lctr_enc_done
2002d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x20($inp), @XMM[10]
2003d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[10], @XMM[4]
2004d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
2005d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$4,$len
2006d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lctr_enc_done
2007d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x30($inp), @XMM[11]
2008d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[11], @XMM[6]
2009d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
2010d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lctr_enc_done
2011d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x40($inp), @XMM[12]
2012d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[12], @XMM[3]
2013d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
2014d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$6,$len
2015d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lctr_enc_done
2016d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x50($inp), @XMM[13]
2017d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[13], @XMM[7]
2018d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
2019d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lctr_enc_done
2020d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[14]
2021d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[14], @XMM[2]
2022d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x60($out)
2023d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lctr_enc_done
2024d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2025d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2026d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lctr_enc_short:
2027d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg1	# in: counter block
2028d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x30(%rbp), $arg2	# out: keystream scratch
2029d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($key), $arg3		# key
2030d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_encrypt
2031d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	($inp), @XMM[1]		# load one input block
2032d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	16($inp), $inp
2033d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0x2c(%rbp), %eax	# load 32-bit counter
2034d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bswap	%eax			# counter word is stored big-endian
2035d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rbp), @XMM[1]	# input ^= encrypted counter
2036d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	inc	%eax			# increment
2037d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], ($out)		# write output
2038d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	bswap	%eax
2039d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	16($out), $out
2040d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%eax, 0x2c(%rsp)	# save 32-bit counter (no key schedule here, so %rsp should equal %rbp)
2041d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	dec	$len
2042d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Lctr_enc_short
2043d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2044d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lctr_enc_done:
2045d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	(%rsp), %rax
2046d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm0, %xmm0
2047d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lctr_enc_bzero:			# wipe key schedule [if any]
2048d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x00(%rax)	# 32 bytes per pass; at least one pass always runs
2049d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x10(%rax)
2050d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rax), %rax
2051d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	%rax, %rbp
2052d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ja	.Lctr_enc_bzero		# continue while %rax is still below %rbp
2053d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2054a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0x78(%rbp),%rax		# 0x48 scratch + 0x30 of pushed registers
2055d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2056d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
2057d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x40(%rbp), %xmm6	# restore xmm6-xmm15 saved in the prologue
2058d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x50(%rbp), %xmm7
2059d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x60(%rbp), %xmm8
2060d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x70(%rbp), %xmm9
2061d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x80(%rbp), %xmm10
2062d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x90(%rbp), %xmm11
2063d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xa0(%rbp), %xmm12
2064d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xb0(%rbp), %xmm13
2065d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xc0(%rbp), %xmm14
2066d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xd0(%rbp), %xmm15
2067a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0xa0(%rax), %rax	# step over the extra 0xa0 bytes reserved on Win64
2068a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan.Lctr_enc_tail:
2069d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2070d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
2071a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-48(%rax), %r15		# reload registers in reverse push order
2072a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-40(%rax), %r14
2073a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-32(%rax), %r13
2074a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-24(%rax), %r12
2075a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-16(%rax), %rbx
2076a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-8(%rax), %rbp
2077a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	(%rax), %rsp		# restore %rsp
2078d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lctr_enc_epilogue:
2079d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
2080d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2081d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2082d9e397b599b13d642138480a28c14db7a136bf0Adam Langley######################################################################
2083d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2084d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#	const AES_KEY *key1, const AES_KEY *key2,
2085d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#	const unsigned char iv[16]);
2086d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
# Register aliases for the XTS tweak arithmetic below, taken from entries
# 13..15 of @XMM: $twmask receives the .Lxts_magic constant, $twres the
# masked carry/residue, and $twtmp the per-dword sign broadcast.
2087d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($twmask,$twres,$twtmp)=@XMM[13..15];
# Strip a trailing "d" from the register name held in $arg6.
# NOTE(review): $arg6 is defined outside this chunk; presumably this turns
# a 32-bit register name into its 64-bit form because the sixth argument
# is a pointer (iv) here -- confirm.
2088d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$arg6=~s/d$//;
2089d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2090d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
2091d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	bsaes_xts_encrypt
2092d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bsaes_xts_encrypt,\@abi-omnipotent
2093d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2094d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybsaes_xts_encrypt:
	# Copy the entry rsp to rax, push rbp, rbx and r12-r15 (reloaded in
	# the epilogue) and reserve 0x48 bytes of local scratch.
2095d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax
2096d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_prologue:
2097d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbp
2098d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbx
2099d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r12
2100d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r13
2101d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r14
2102d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r15
2103d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0x48(%rsp), %rsp
2104d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2105d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
	# Win64 only: arguments 5 and 6 are fetched from the stack, the
	# frame grows by another 0xa0 bytes and xmm6-xmm15 are spilled to
	# offsets 0x40..0xd0 of it (reloaded in the epilogue).
2106d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0xa0(%rsp),$arg5	# pull key2
2107d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0xa8(%rsp),$arg6	# pull ivp
2108d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0xa0(%rsp), %rsp
2109d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm6, 0x40(%rsp)
2110d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm7, 0x50(%rsp)
2111d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm8, 0x60(%rsp)
2112d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm9, 0x70(%rsp)
2113d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm10, 0x80(%rsp)
2114d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm11, 0x90(%rsp)
2115d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm12, 0xa0(%rsp)
2116d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm13, 0xb0(%rsp)
2117d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm14, 0xc0(%rsp)
2118d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm15, 0xd0(%rsp)
2119d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_body:
2120d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2121d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
	# rbp anchors the fixed part of the frame; rsp is lowered further
	# below for the key schedule and the tweak array.
2122d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rbp		# backup %rsp
2123d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg1, $inp		# backup arguments
2124d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg2, $out
2125d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg3, $len
2126d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg4, $key
2127d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Encrypt the block at argument 6 (iv) under argument 5 (key2) and
	# leave the result in the scratch slot at 0x20(%rbp).
2128d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($arg6), $arg1
2129d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg2
2130d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($arg5), $arg3
2131d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_encrypt		# generate initial tweak
2132d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Read the round count from offset 240 of the key and keep the
	# unrounded length in rbx for the partial-block tail.
2133d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	240($key), %eax		# rounds
2134d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$len, %rbx		# backup $len
2135d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Reserve rounds*128-96 bytes below rsp for the bit-sliced key
	# schedule.
2136d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%eax, %edx		# rounds
2137d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	shl	\$7, %rax		# 128 bytes per inner round key
2138d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2139d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	%rax, %rsp
2140d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Convert the key into the reserved area; the helper takes its
	# inputs in rax, rcx and r10d.
2141d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax		# pass key schedule
2142d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$key, %rcx		# pass key
2143d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2144d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_key_convert
2145d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm6, %xmm7		# fix up last round key
2146d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm7, (%rax)		# save last round key
2147d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Round the working length down to whole 16-byte blocks, make room
	# for eight saved tweaks and reload the initial tweak.
2148d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	and	\$-16, $len
2149d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$0x80, %rsp		# place for tweak[8]
2150d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2151d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Prime the tweak pipeline: clear the scratch register, load the
	# .Lxts_magic constant and compute 0 > tweak for each dword.
2152d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2153d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	.Lxts_magic(%rip), $twmask
2154d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2155d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Fewer than 0x80 bytes (eight blocks) available: take the short
	# path, otherwise enter the eight-block loop.
2156d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$0x80, $len
2157d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jc	.Lxts_enc_short
2158d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_enc_loop
2159d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2160d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2161d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_loop:
2162d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Main loop body, eight blocks per pass.  Each of the seven generated
# stanzas copies the running tweak in @XMM[7] to @XMM[$i], spills it to
# slot $i of the tweak array at (%rsp) and then advances @XMM[7]: paddq
# doubles both 64-bit halves, while the sign mask from the previous
# pcmpgtd is shuffled (pshufd 0x13), masked with $twmask and XORed back in
# as carry/residue.
# NOTE(review): .Lxts_magic is defined outside this chunk; presumably this
# step is the XTS multiply-by-x in GF(2^128) -- confirm.
# Input loads run one stanza behind the tweak generation and the
# input^tweak XORs two stanzas behind.
2163d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    for ($i=0;$i<7;$i++) {
2164d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___;
2165d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x13, $twtmp, $twres
2166d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2167d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], @XMM[$i]
2168d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2169d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2170d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	$twmask, $twres		# isolate carry and residue
2171d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2172d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twres, @XMM[7]
2173d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2174d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___ if ($i>=1);
2175d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2176d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2177d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___ if ($i>=2);
2178d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2179d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2180d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    }
2181d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
	# Load inputs 6 and 7 and finish the pending XORs.  The eighth
	# tweak is spilled to 0x70(%rsp) before its register is XORed
	# with input 7.
2182d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[8+6]
2183d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+5], @XMM[5]
2184d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x70($inp), @XMM[8+7]
2185d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($inp), $inp
2186d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], 0x70(%rsp)
2187d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+6], @XMM[6]
2188d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2189d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+7], @XMM[7]
2190d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2191d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2192d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
2193d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Results for blocks 0..7 are taken from registers 0,1,4,6,3,7,2,5
	# in that order; each is XORed with its saved tweak and stored.
2194d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2195d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2196d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2197d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[4]
2198d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2199d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[6]
2200d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
2201d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40(%rsp), @XMM[3]
2202d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
2203d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x50(%rsp), @XMM[7]
2204d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
2205d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x60(%rsp), @XMM[2]
2206d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
2207d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x70(%rsp), @XMM[5]
2208d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x60($out)
2209d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[5], 0x70($out)
2210d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($out), $out
2211d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Reload the eighth tweak, rebuild the mask and sign-broadcast
	# registers and advance one step to get the first tweak of the
	# next pass.
2212d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2213d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2214d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	.Lxts_magic(%rip), $twmask
2215d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp
2216d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x13, $twtmp, $twres
2217d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2218d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2219d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	$twmask, $twres		# isolate carry and residue
2220d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2221d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twres, @XMM[7]
2222d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Repeat while at least eight more whole blocks remain.
2223d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$0x80,$len
2224d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnc	.Lxts_enc_loop
2225d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2226d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_short:
	# Undo the speculative subtraction of 0x80; if no whole block is
	# left, go straight to the partial-block handling.
2227d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$0x80, $len
2228d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jz	.Lxts_enc_done
2229d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Tail dispatch for one to seven remaining whole blocks.  Tweaks are
# generated and spilled exactly as in the main loop, but after loading
# input block $i-1 the remaining length is compared with 0x10*$i and
# control leaves for .Lxts_enc_$i on equality.  At that point inputs
# 0..$i-1 are loaded, tweak slot $i is already written, and the XORs for
# the last two blocks are still pending.  Falling out of the generated code
# means seven blocks remain.
2230d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    for ($i=0;$i<7;$i++) {
2231d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___;
2232d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x13, $twtmp, $twres
2233d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2234d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], @XMM[$i]
2235d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2236d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2237d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	$twmask, $twres		# isolate carry and residue
2238d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2239d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twres, @XMM[7]
2240d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2241d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___ if ($i>=1);
2242d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2243d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$`0x10*$i`,$len
2244d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lxts_enc_$i
2245d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2246d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___ if ($i>=2);
2247d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2248d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2249d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    }
2250d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
	# Seven-block case: load input 6, finish the XORs for blocks 5 and
	# 6 and spill the eighth tweak to 0x70(%rsp) for use after the call.
2251d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[8+6]
2252d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+5], @XMM[5]
2253d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], 0x70(%rsp)
2254d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x70($inp), $inp
2255d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+6], @XMM[6]
2256d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2257d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2258d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2259d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
2260d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Only the first seven results are XORed with their tweaks and
	# written; the eighth is not stored.
2261d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2262d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2263d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2264d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[4]
2265d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2266d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[6]
2267d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
2268d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40(%rsp), @XMM[3]
2269d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
2270d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x50(%rsp), @XMM[7]
2271d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
2272d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x60(%rsp), @XMM[2]
2273d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
2274d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x60($out)
2275d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x70($out), $out
2276d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2277d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2278d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_enc_done
2279d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2280d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_6:
	# Six whole blocks left: XOR the two inputs not yet combined with
	# their tweaks (blocks 4 and 5), run the 8-way routine and store
	# the first six results.
2281d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+4], @XMM[4]
2282d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x60($inp), $inp
2283d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+5], @XMM[5]
2284d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2285d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2286d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2287d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
2288d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2289d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2290d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2291d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2292d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[4]
2293d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2294d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[6]
2295d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
2296d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40(%rsp), @XMM[3]
2297d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
2298d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x50(%rsp), @XMM[7]
2299d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
2300d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
2301d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x60($out), $out
2302d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# The seventh tweak was spilled to slot 6 before branching here.
2303d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2304d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_enc_done
2305d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2306d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_5:
	# Five whole blocks left: XOR the still-pending inputs 3 and 4
	# with their tweaks, run the 8-way routine and store the first
	# five results.
2307d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+3], @XMM[3]
2308d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x50($inp), $inp
2309d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+4], @XMM[4]
2310d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2311d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2312d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2313d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
2314d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2315d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2316d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2317d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2318d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[4]
2319d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2320d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[6]
2321d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
2322d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40(%rsp), @XMM[3]
2323d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
2324d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x40($out)
2325d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x50($out), $out
2326d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# The sixth tweak was spilled to slot 5 before branching here.
2327d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2328d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_enc_done
2329d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2330d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_4:
	# Four whole blocks left: XOR the still-pending inputs 2 and 3
	# with their tweaks, run the 8-way routine and store the first
	# four results.
2331d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+2], @XMM[2]
2332d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x40($inp), $inp
2333d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+3], @XMM[3]
2334d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2335d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2336d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2337d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
2338d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2339d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2340d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2341d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2342d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[4]
2343d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2344d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[6]
2345d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
2346d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x30($out)
2347d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x40($out), $out
2348d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# The fifth tweak was spilled to slot 4 before branching here.
2349d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2350d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_enc_done
2351d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2352d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_3:
	# Three whole blocks left: XOR the still-pending inputs 1 and 2
	# with their tweaks, run the 8-way routine and store the first
	# three results.
2353d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+1], @XMM[1]
2354d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x30($inp), $inp
2355d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+2], @XMM[2]
2356d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2357d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2358d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2359d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
2360d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2361d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2362d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2363d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2364d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[4]
2365d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2366d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x20($out)
2367d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x30($out), $out
2368d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# The fourth tweak was spilled to slot 3 before branching here.
2369d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2370d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_enc_done
2371d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2372d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_2:
	# Two whole blocks left: neither input has been XORed with its
	# tweak yet, so do both here, run the 8-way routine and store the
	# first two results.
2373d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+0], @XMM[0]
2374d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20($inp), $inp
2375d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+1], @XMM[1]
2376d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2377d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2378d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2379d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_encrypt8
2380d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2381d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2382d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2383d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2384d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2385d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20($out), $out
2386d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# The third tweak was spilled to slot 2 before branching here.
2387d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2388d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_enc_done
2389d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2390d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_1:
	# One whole block left: bypass the 8-way routine.  The input is
	# XORed with the first tweak, written to the scratch slot at
	# 0x20(%rbp) and encrypted in place by asm_AES_encrypt; the result
	# is then XORed with the same tweak, which is still in its
	# register because the call leaves xmm registers alone.
2391d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[0], @XMM[8]
2392d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x10($inp), $inp
2393d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[8], 0x20(%rbp)
2394d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg1
2395d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg2
2396d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($key), $arg3
2397d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_encrypt		# doesn't touch %xmm
2398d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	# The five commented-out lines below are the disabled 8-way
	# variant of the same single-block step.
2399d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#pxor	@XMM[8], @XMM[0]
2400d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#lea	0x80(%rsp), %rax	# pass key schedule
2401d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#mov	%edx, %r10d		# pass rounds
2402d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#call	_bsaes_encrypt8
2403d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2404d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2405d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x10($out), $out
2406d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# The second tweak was spilled to slot 1 before branching here;
	# execution then falls through into the partial-block handling.
2407d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2408d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2409d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_done:
	# ebx is reduced to the low four bits of the original length, i.e.
	# the size of a trailing partial block; zero means nothing is left.
2410d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	and	\$15, %ebx
2411d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jz	.Lxts_enc_ret
2412d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$out, %rdx
2413d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2414d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_steal:
	# Per leftover byte: the next input byte overwrites a byte of the
	# last full output block (16 bytes before rdx), and the byte it
	# displaces is written at rdx, forming the short final block.
2415d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movzb	($inp), %eax
2416d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movzb	-16(%rdx), %ecx
2417d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	1($inp), $inp
2418d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%al, -16(%rdx)
2419d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%cl, 0(%rdx)
2420d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	1(%rdx), %rdx
2421d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$1,%ebx
2422d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Lxts_enc_steal
2423d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# Re-encrypt the patched last full block: XOR with the pending
	# tweak, encrypt in place through the 0x20(%rbp) scratch slot,
	# XOR with the tweak again and store it back.
2424d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	-16($out), @XMM[0]
2425d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg1
2426d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[7], @XMM[0]
2427d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg2
2428d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], 0x20(%rbp)
2429d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($key), $arg3
2430d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_encrypt		# doesn't touch %xmm
2431d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[7]
2432d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], -16($out)
2433d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2434d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_ret:
	# Zero the stack from rsp up to rbp, 32 bytes per step; this range
	# holds the tweak array and the bit-sliced key schedule.
2435d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	(%rsp), %rax
2436d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm0, %xmm0
2437d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_bzero:			# wipe key schedule [if any]
2438d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x00(%rax)
2439d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x10(%rax)
2440d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rax), %rax
2441d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	%rax, %rbp
2442d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ja	.Lxts_enc_bzero
2443d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# rax = rbp + 0x78, i.e. past the 0x48 bytes of locals and the six
	# 8-byte pushes; the Win64 path below adds its extra 0xa0.
2444a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0x78(%rbp),%rax
2445d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2446d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
	# Win64 only: reload xmm6-xmm15 from the slots written in the
	# prologue and step rax over the additional 0xa0 bytes of frame.
2447d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x40(%rbp), %xmm6
2448d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x50(%rbp), %xmm7
2449d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x60(%rbp), %xmm8
2450d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x70(%rbp), %xmm9
2451d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x80(%rbp), %xmm10
2452d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x90(%rbp), %xmm11
2453d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xa0(%rbp), %xmm12
2454d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xb0(%rbp), %xmm13
2455d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xc0(%rbp), %xmm14
2456d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xd0(%rbp), %xmm15
2457a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0xa0(%rax), %rax
2458a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan.Lxts_enc_tail:
2459d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2460d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
	# Reload the six pushed registers from just below rax (last pushed
	# at the lowest address) and make rax the new stack pointer.
2461a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-48(%rax), %r15
2462a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-40(%rax), %r14
2463a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-32(%rax), %r13
2464a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-24(%rax), %r12
2465a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-16(%rax), %rbx
2466a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-8(%rax), %rbp
2467a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	(%rax), %rsp		# restore %rsp
2468d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_epilogue:
2469d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
2470d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2471d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# bsaes_xts_decrypt: entry point of the XTS decryption routine.
# Six arguments are consumed below: arg1..arg4 are copied into the working
# registers for input pointer, output pointer, length and key; arg5 and arg6
# are handed to asm_AES_encrypt to produce the initial tweak (the Win64 path
# labels them key2 and ivp).
2472d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	bsaes_xts_decrypt
2473d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	bsaes_xts_decrypt,\@abi-omnipotent
2474d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2475d9e397b599b13d642138480a28c14db7a136bf0Adam Langleybsaes_xts_decrypt:
2476d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax
2477d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_prologue:
	# six 8-byte pushes (0x30 bytes) plus 0x48 bytes of locals: the frame
	# base ends up 0x78 bytes below the incoming stack pointer
2478d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbp
2479d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbx
2480d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r12
2481d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r13
2482d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r14
2483d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r15
2484d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0x48(%rsp), %rsp
2485d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Win64 only: fetch the fifth and sixth arguments from the stack, reserve a
# further 0xa0 bytes and spill %xmm6-%xmm15 at frame offsets 0x40..0xd0 (the
# epilogue reloads them from the same offsets relative to %rbp).
2486d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
2487d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0xa0(%rsp),$arg5	# pull key2
2488d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0xa8(%rsp),$arg6	# pull ivp
2489d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	-0xa0(%rsp), %rsp
2490d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm6, 0x40(%rsp)
2491d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm7, 0x50(%rsp)
2492d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm8, 0x60(%rsp)
2493d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm9, 0x70(%rsp)
2494d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm10, 0x80(%rsp)
2495d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm11, 0x90(%rsp)
2496d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm12, 0xa0(%rsp)
2497d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm13, 0xb0(%rsp)
2498d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm14, 0xc0(%rsp)
2499d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	%xmm15, 0xd0(%rsp)
2500d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_body:
2501d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Common setup: fix the frame pointer, derive the initial tweak, build the
# bit-sliced key schedule on the stack and normalise the length counter.
2502d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
2503d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rbp		# backup %rsp
2504d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg1, $inp		# backup arguments
2505d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg2, $out
2506d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg3, $len
2507d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$arg4, $key
2508d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# asm_AES_encrypt receives arg6, the scratch slot 0x20(%rbp) and arg5
	# as its three arguments; the slot is read back below as the initial
	# tweak (argument roles are presumably in/out/key -- confirm against
	# asm_AES_encrypt, which is defined elsewhere)
2509d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($arg6), $arg1
2510d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg2
2511d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($arg5), $arg3
2512d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_encrypt		# generate initial tweak
2513d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2514d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	240($key), %eax		# rounds
2515d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$len, %rbx		# backup $len
2516d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# stack space for the converted schedule: rounds*128 - 96 bytes
2517d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%eax, %edx		# rounds
2518d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	shl	\$7, %rax		# 128 bytes per inner round key
2519d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2520d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	%rax, %rsp
2521d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2522d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsp, %rax		# pass key schedule
2523d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$key, %rcx		# pass key
2524d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2525d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_key_convert
	# after the call %rax is used as the slot for the last round key
	# (taken from %xmm6) and %xmm7 is folded into round key 0 in place
2526d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	(%rsp), %xmm7		# fix up round 0 key
2527d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm6, (%rax)		# save last round key
2528d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm7, (%rsp)
2529d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# round the length down to whole blocks; when the original length
	# (kept in %rbx) has a partial trailing block, one more full block is
	# held back for the stealing code after .Lxts_dec_done
2530d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	%eax, %eax		# if ($len%16) len-=16;
2531d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	and	\$-16, $len
2532d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	test	\$15, %ebx
2533d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	setnz	%al
2534d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	shl	\$4, %rax
2535d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	%rax, $len
2536d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2537d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$0x80, %rsp		# place for tweak[8]
2538d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2539d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2540d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2541d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	.Lxts_magic(%rip), $twmask
2542d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2543d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# the length counter is kept biased by -0x80 from here on; a borrow
	# means fewer than 8 whole blocks remain, so take the short path
2544d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$0x80, $len
2545d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jc	.Lxts_dec_short
2546d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_dec_loop
2547d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Main loop: each pass handles 8 blocks (0x80 bytes) through _bsaes_decrypt8.
2548d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2549d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_loop:
2550d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2551d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    for ($i=0;$i<7;$i++) {
    # Step $i: copy the running tweak (@XMM[7]) into @XMM[$i] and into
    # tweak[$i] on the stack, then advance the running tweak. paddq shifts
    # each 64-bit lane left by one; the shuffled, masked sign bits in $twres
    # supply the carry and residue that are XORed back in.
    # NOTE(review): presumably the XTS multiply-by-x step; .Lxts_magic is
    # defined outside this excerpt -- confirm.
2552d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___;
2553d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x13, $twtmp, $twres
2554d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2555d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], @XMM[$i]
2556d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2557d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2558d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	$twmask, $twres		# isolate carry and residue
2559d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2560d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twres, @XMM[7]
2561d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
    # Input handling is interleaved and trails the tweak computation: input
    # block $i-1 is loaded into @XMM[8+$i-1] in this step ...
2562d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___ if ($i>=1);
2563d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2564d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
    # ... and block $i-2, loaded one step earlier, is XORed into its tweak.
2565d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___ if ($i>=2);
2566d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2567d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2568d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    }
# Finish what the generator left pending (blocks 5, 6 and 7), run the 8-way
# decryption, write the results and derive the tweak for the next pass.
2569d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
2570d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[8+6]
2571d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+5], @XMM[5]
2572d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x70($inp), @XMM[8+7]
2573d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($inp), $inp
	# the running tweak after seven steps is tweak[7]; save it before it
	# is overwritten by the XOR with input block 7
2574d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], 0x70(%rsp)
2575d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+6], @XMM[6]
	# the key schedule sits just above the 0x80-byte tweak area
2576d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2577d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+7], @XMM[7]
2578d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2579d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2580d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
2581d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# results are consumed in register order 0,1,6,4,2,7,3,5: that is how
	# tweak[0..7] and output offsets 0x00..0x70 are paired below
2582d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2583d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2584d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2585d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[6]
2586d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2587d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[4]
2588d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
2589d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40(%rsp), @XMM[2]
2590d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
2591d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x50(%rsp), @XMM[7]
2592d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x40($out)
2593d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x60(%rsp), @XMM[3]
2594d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
2595d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x70(%rsp), @XMM[5]
2596d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x60($out)
2597d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[5], 0x70($out)
2598d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80($out), $out
2599d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# reload tweak[7] and advance it once more, using the same sequence
	# as the generator, to obtain the tweak for the next block
2600d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2601d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2602d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	.Lxts_magic(%rip), $twmask
2603d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp
2604d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x13, $twtmp, $twres
2605d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2606d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2607d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	$twmask, $twres		# isolate carry and residue
2608d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2609d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twres, @XMM[7]
2610d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# keep looping while at least 8 more whole blocks remain (no borrow)
2611d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$0x80,$len
2612d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnc	.Lxts_dec_loop
2613d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Short path: fewer than 8 whole blocks remain. The length counter arrives
# biased by -0x80, so the bias is removed first; zero means nothing is left.
2614d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_short:
2615d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$0x80, $len
2616d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jz	.Lxts_dec_done
2617d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2618d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    for ($i=0;$i<7;$i++) {
    # Same per-step tweak generation as the main loop: save the running tweak
    # as tweak[$i] / @XMM[$i], then advance it.
2619d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___;
2620d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x13, $twtmp, $twres
2621d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2622d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], @XMM[$i]
2623d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2624d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2625d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	$twmask, $twres		# isolate carry and residue
2626d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2627d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twres, @XMM[7]
2628d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
    # Exit test: once input block $i-1 has been loaded, a remaining length of
    # exactly 0x10*$i bytes dispatches to the .Lxts_dec_$i tail. The jump
    # happens before the XOR below, so each tail finishes the pending XORs.
2629d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___ if ($i>=1);
2630d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2631d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	\$`0x10*$i`,$len
2632d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	je	.Lxts_dec_$i
2633d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2634d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    $code.=<<___ if ($i>=2);
2635d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2636d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2637d9e397b599b13d642138480a28c14db7a136bf0Adam Langley    }
# Fall-through case: none of the exit tests matched, so seven blocks are
# processed here.
2638d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
2639d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	0x60($inp), @XMM[8+6]
2640d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+5], @XMM[5]
	# the running tweak is stored as tweak[7]; it is reloaded after the
	# call as the tweak for whatever follows these seven blocks
2641d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], 0x70(%rsp)
2642d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x70($inp), $inp
2643d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+6], @XMM[6]
2644d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2645d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2646d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2647d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
2648d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# only seven results are written (registers 0,1,6,4,2,7,3); the
	# eighth register carried no input block and is not stored
2649d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2650d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2651d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2652d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[6]
2653d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2654d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[4]
2655d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
2656d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40(%rsp), @XMM[2]
2657d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
2658d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x50(%rsp), @XMM[7]
2659d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x40($out)
2660d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x60(%rsp), @XMM[3]
2661d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
2662d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[3], 0x60($out)
2663d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x70($out), $out
2664d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2665d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2666d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_dec_done
# Tails for exactly 6, 5, 4, 3 and 2 remaining blocks, entered from the exit
# tests of the short path. Each one XORs the last two loaded input blocks
# into their tweaks (the generator had not reached them yet), advances the
# input pointer, runs _bsaes_decrypt8, writes only as many results as there
# are blocks, and leaves tweak[N] in the running-tweak register as the next
# unused tweak before joining .Lxts_dec_done.
2667d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2668d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_6:
	# six blocks: pending XORs are for blocks 4 and 5
2669d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+4], @XMM[4]
2670d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x60($inp), $inp
2671d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+5], @XMM[5]
2672d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2673d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2674d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2675d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
2676d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2677d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2678d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2679d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2680d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[6]
2681d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2682d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[4]
2683d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
2684d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40(%rsp), @XMM[2]
2685d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
2686d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x50(%rsp), @XMM[7]
2687d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x40($out)
2688d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], 0x50($out)
2689d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x60($out), $out
2690d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2691d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2692d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_dec_done
2693d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2694d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_5:
	# five blocks: pending XORs are for blocks 3 and 4
2695d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+3], @XMM[3]
2696d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x50($inp), $inp
2697d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+4], @XMM[4]
2698d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2699d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2700d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2701d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
2702d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2703d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2704d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2705d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2706d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[6]
2707d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2708d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[4]
2709d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
2710d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x40(%rsp), @XMM[2]
2711d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
2712d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[2], 0x40($out)
2713d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x50($out), $out
2714d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2715d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2716d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_dec_done
2717d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2718d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_4:
	# four blocks: pending XORs are for blocks 2 and 3
2719d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+2], @XMM[2]
2720d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x40($inp), $inp
2721d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+3], @XMM[3]
2722d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2723d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2724d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2725d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
2726d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2727d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2728d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2729d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2730d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[6]
2731d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2732d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x30(%rsp), @XMM[4]
2733d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
2734d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[4], 0x30($out)
2735d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x40($out), $out
2736d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2737d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2738d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_dec_done
2739d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2740d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_3:
	# three blocks: pending XORs are for blocks 1 and 2
2741d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+1], @XMM[1]
2742d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x30($inp), $inp
2743d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+2], @XMM[2]
2744d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2745d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2746d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2747d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
2748d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2749d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2750d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2751d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2752d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rsp), @XMM[6]
2753d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2754d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], 0x20($out)
2755d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x30($out), $out
2756d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2757d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2758d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_dec_done
2759d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2760d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_2:
	# two blocks: pending XORs are for blocks 0 and 1
2761d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+0], @XMM[0]
2762d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20($inp), $inp
2763d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[8+1], @XMM[1]
2764d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x80(%rsp), %rax	# pass key schedule
2765d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%edx, %r10d		# pass rounds
2766d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2767d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	_bsaes_decrypt8
2768d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2769d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2770d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x10(%rsp), @XMM[1]
2771d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2772d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[1], 0x10($out)
2773d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20($out), $out
2774d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2775d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2776d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lxts_dec_done
# Tail for exactly one remaining block. Instead of the 8-way routine it goes
# through asm_AES_decrypt, using the scratch slot at 0x20(%rbp) as both the
# first and second argument and the original key as the third. The
# commented-out lines keep the _bsaes_decrypt8 alternative for reference.
2777d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2778d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_1:
	# input block 0 ^ tweak[0] goes to the scratch slot; tweak[0] itself
	# stays in its register across the call
2779d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[0], @XMM[8]
2780d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x10($inp), $inp
2781d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[8], 0x20(%rbp)
2782d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg1
2783d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg2
2784d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($key), $arg3
2785d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_decrypt		# doesn't touch %xmm
2786d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2787d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#pxor	@XMM[8], @XMM[0]
2788d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#lea	0x80(%rsp), %rax	# pass key schedule
2789d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#mov	%edx, %r10d		# pass rounds
2790d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#call	_bsaes_decrypt8
2791d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2792d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[0], 0x00($out)	# write output
2793d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x10($out), $out
2794d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# tweak[1] was already saved by the generator before the jump here;
	# execution then falls through into .Lxts_dec_done
2795d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2796d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# All whole blocks handled so far are written. %ebx is reduced to the
# original length modulo 16: zero means there is no partial block and the
# routine goes straight to cleanup. Otherwise the held-back full block and
# the trailing partial block are processed together below.
2797d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_done:
2798d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	and	\$15, %ebx
2799d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jz	.Lxts_dec_ret
2800d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# keep the current tweak in a spare register and advance the running
	# tweak by one step; the held-back block is decrypted with the
	# advanced tweak first, the earlier tweak is used after the steal
2801d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twtmp, $twtmp
2802d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	.Lxts_magic(%rip), $twmask
2803d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pcmpgtd	@XMM[7], $twtmp
2804d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pshufd	\$0x13, $twtmp, $twres
2805d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[7], @XMM[6]
2806d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2807d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pand	$twmask, $twres		# isolate carry and residue
2808d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	($inp), @XMM[0]
2809d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	$twres, @XMM[7]
2810d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# decrypt (input block ^ advanced tweak) in the scratch slot, XOR the
	# result with the same tweak and store it at the output position
2811d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg1
2812d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[7], @XMM[0]
2813d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg2
2814d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], 0x20(%rbp)
2815d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($key), $arg3
2816d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_decrypt		# doesn't touch %xmm
2817d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[7]
2818d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$out, %rdx
2819d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[7], ($out)
2820d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# byte-wise steal, repeated %ebx times: input byte 16+i replaces
	# output byte i, and the displaced output byte moves to position 16+i
2821d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_steal:
2822d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movzb	16($inp), %eax
2823d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movzb	(%rdx), %ecx
2824d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	1($inp), $inp
2825d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%al, (%rdx)
2826d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%cl, 16(%rdx)
2827d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	1(%rdx), %rdx
2828d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$1,%ebx
2829d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Lxts_dec_steal
2830d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# the reassembled 16 bytes at the output position are now decrypted
	# in place using the earlier tweak that was set aside above
2831d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	($out), @XMM[0]
2832d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg1
2833d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	@XMM[6], @XMM[0]
2834d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rbp), $arg2
2835d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	@XMM[0], 0x20(%rbp)
2836d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($key), $arg3
2837d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	asm_AES_decrypt		# doesn't touch %xmm
2838d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	0x20(%rbp), @XMM[6]
2839d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqu	@XMM[6], ($out)
2840d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Cleanup and return: zero the stack between %rsp and %rbp, restore the
# saved registers and the stack pointer, then return.
2841d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_ret:
2842d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	(%rsp), %rax
2843d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pxor	%xmm0, %xmm0
	# 32 bytes are cleared per pass while %rbp is still above the cursor;
	# the range covers the tweak[8] area and the converted key schedule
	# that setup allocated below %rbp
2844d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_bzero:			# wipe key schedule [if any]
2845d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x00(%rax)
2846d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movdqa	%xmm0, 0x10(%rax)
2847d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x20(%rax), %rax
2848d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	%rax, %rbp
2849d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ja	.Lxts_dec_bzero
2850d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
	# 0x78 = 0x48 bytes of locals + six 8-byte pushes: without the Win64
	# spill area this is the stack pointer value on entry
2851a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0x78(%rbp),%rax
2852d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Win64 only: reload %xmm6-%xmm15 from the offsets used in the prologue and
# step over the extra 0xa0 bytes that were reserved for them.
2853d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($win64);
2854d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x40(%rbp), %xmm6
2855d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x50(%rbp), %xmm7
2856d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x60(%rbp), %xmm8
2857d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x70(%rbp), %xmm9
2858d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x80(%rbp), %xmm10
2859d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0x90(%rbp), %xmm11
2860d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xa0(%rbp), %xmm12
2861d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xb0(%rbp), %xmm13
2862d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xc0(%rbp), %xmm14
2863d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movaps	0xd0(%rbp), %xmm15
2864a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0xa0(%rax), %rax
2865a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan.Lxts_dec_tail:
2866d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
# Reload the six registers pushed in the prologue, addressed downwards from
# %rax, then make %rax the stack pointer again.
2867d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
2868a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-48(%rax), %r15
2869a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-40(%rax), %r14
2870a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-32(%rax), %r13
2871a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-24(%rax), %r12
2872a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-16(%rax), %rbx
2873a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-8(%rax), %rbp
2874a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	(%rax), %rsp		# restore %rsp
2875d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_epilogue:
2876d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
2877d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2878d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2879d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
# Constant pool appended unconditionally to the generated code under the
# `_bsaes_const` symbol (typed as an object, aligned to 64 bytes).  Every
# local label below marks one 16-byte entry (two .quad values or four
# .long values), except .Lmasks, which covers four consecutive 16-byte rows:
#
#   .LM0ISR, .LISRM0, .LISR  - byte-index tables grouped under the
#                              "InvShiftRows constants" heading
#   .LBS0, .LBS1, .LBS2      - bit-slice masks 0x55.., 0x33.., 0x0f..
#   .LSR, .LSRM0, .LM0SR     - byte-index tables grouped under the
#                              "shiftrows constants" heading
#   .LSWPUP                  - "byte-swap upper dword" index table
#   .LSWPUPM0SR              - byte-index table (name suggests .LSWPUP
#                              combined with .LM0SR - confirm at use site)
#   .LADD1 .. .LADD8         - counter increments 1..8, stored in the upper
#                              32 bits of the second quadword
#   .Lxts_magic              - dwords 0x87,0,1,0 (presumably the XTS tweak
#                              reduction constant - confirm at use site)
#   .Lmasks                  - rows of repeated 0x01, 0x02, 0x04, 0x08 bytes
#   .LM0                     - byte-index table
#   .L63                     - sixteen 0x63 bytes (presumably the AES S-box
#                              affine constant - confirm at use site)
#
# The pool ends with an identification string and is padded to the next
# 64-byte boundary before its .size is computed.
2880d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
2881d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	_bsaes_const,\@object
2882d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	64
2883d9e397b599b13d642138480a28c14db7a136bf0Adam Langley_bsaes_const:
2884d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LM0ISR:	# InvShiftRows constants
2885d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2886d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LISRM0:
2887d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2888d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LISR:
2889d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2890d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LBS0:		# bit-slice constants
2891d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x5555555555555555, 0x5555555555555555
2892d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LBS1:
2893d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x3333333333333333, 0x3333333333333333
2894d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LBS2:
2895d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2896d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LSR:		# shiftrows constants
2897d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2898d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LSRM0:
2899d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2900d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LM0SR:
2901d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2902d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LSWPUP:	# byte-swap upper dword
2903d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2904d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LSWPUPM0SR:
2905d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
2906d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LADD1:		# counter increment constants
2907d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0000000000000000, 0x0000000100000000
2908d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LADD2:
2909d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0000000000000000, 0x0000000200000000
2910d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LADD3:
2911d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0000000000000000, 0x0000000300000000
2912d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LADD4:
2913d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0000000000000000, 0x0000000400000000
2914d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LADD5:
2915d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0000000000000000, 0x0000000500000000
2916d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LADD6:
2917d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0000000000000000, 0x0000000600000000
2918d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LADD7:
2919d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0000000000000000, 0x0000000700000000
2920d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LADD8:
2921d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0000000000000000, 0x0000000800000000
2922d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_magic:
2923d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.long	0x87,0,1,0
2924d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lmasks:
2925d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0101010101010101, 0x0101010101010101
2926d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0202020202020202, 0x0202020202020202
2927d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0404040404040404, 0x0404040404040404
2928d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x0808080808080808, 0x0808080808080808
2929d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LM0:
2930d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2931d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.L63:
2932d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.quad	0x6363636363636363, 0x6363636363636363
2933d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2934d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	64
2935d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	_bsaes_const,.-_bsaes_const
2936d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
2937d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2938d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2939d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64-only: emits the shared structured-exception handler `se_handler`
# plus the .pdata/.xdata records that attach it to each bsaes entry point.
#
# Each .xdata record supplies three RVAs as HandlerData[]:
#   [0] body label, [1] epilogue label, [2] tail label.
# The handler classifies context->Rip against them:
#   * Rip <= body label or Rip >= epilogue label: only context->Rsp is
#     rewritten (from context->Rax); no registers are recovered.
#   * Rip >= tail label: context->Rax is used directly as the address just
#     above the saved general-purpose registers.
#   * otherwise (function body): the frame is located through context->Rbp,
#     the ten saved %xmm6-%xmm15 values at 0x40(%rbp) are copied into
#     context.Xmm6.., and %rax is advanced by 0xa0+0x78 to the same address.
# The saved registers are then read from -48(%rax)..-8(%rax) using the same
# slot assignment the function tails use (-48 => %r15, -40 => %r14,
# -32 => %r13, -24 => %r12, -16 => %rbx, -8 => %rbp) and stored into the
# CONTEXT record.  Finally the CONTEXT is copied to disp->ContextRecord,
# RtlVirtualUnwind is invoked, and 1 (ExceptionContinueSearch) is returned.
2940d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyif ($win64) {
2941d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$rec="%rcx";
2942d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$frame="%rdx";
2943d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$context="%r8";
2944d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$disp="%r9";
2945d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2946d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
2947d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.extern	__imp_RtlVirtualUnwind
2948d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	se_handler,\@abi-omnipotent
2949d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
2950d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyse_handler:
2951d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rsi
2952d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rdi
2953d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbx
2954d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbp
2955d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r12
2956d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r13
2957d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r14
2958d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r15
2959d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pushfq
2960d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$64,%rsp
2961d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2962d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	120($context),%rax	# pull context->Rax
2963d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	248($context),%rbx	# pull context->Rip
2964d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2965d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	8($disp),%rsi		# disp->ImageBase
2966d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	56($disp),%r11		# disp->HandlerData
2967d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2968d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0(%r11),%r10d		# HandlerData[0]
2969d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	(%rsi,%r10),%r10	# prologue label
2970a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	cmp	%r10,%rbx		# context->Rip<=prologue label
2971a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	jbe	.Lin_prologue
2972d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2973d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	4(%r11),%r10d		# HandlerData[1]
2974d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	(%rsi,%r10),%r10	# epilogue label
2975d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	%r10,%rbx		# context->Rip>=epilogue label
2976d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jae	.Lin_prologue
2977d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2978a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	8(%r11),%r10d		# HandlerData[2]
2979a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	(%rsi,%r10),%r10	# tail label
2980a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	cmp	%r10,%rbx		# context->Rip>=tail label
2981a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	jae	.Lin_tail
2982a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan
2983d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	160($context),%rax	# pull context->Rbp
2984d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
2985d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	0x40(%rax),%rsi		# %xmm save area
2986d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	512($context),%rdi	# &context.Xmm6
2987d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2988d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.long	0xa548f3fc		# cld; rep movsq
2989a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer
2990a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan
2991a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan.Lin_tail:
2992a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-48(%rax),%r15		# slot order matches the function tails
2993a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-40(%rax),%r14
2994a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-32(%rax),%r13
2995a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-24(%rax),%r12
2996a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-16(%rax),%rbx
2997a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	mov	-8(%rax),%rbp
2998d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rbx,144($context)	# restore context->Rbx
2999d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rbp,160($context)	# restore context->Rbp
3000d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r12,216($context)	# restore context->R12
3001d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r13,224($context)	# restore context->R13
3002d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r14,232($context)	# restore context->R14
3003d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r15,240($context)	# restore context->R15
3004d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3005d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lin_prologue:
3006d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rax,152($context)	# restore context->Rsp
3007d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3008d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	40($disp),%rdi		# disp->ContextRecord
3009d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$context,%rsi		# context
3010d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
3011d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.long	0xa548f3fc		# cld; rep movsq
3012d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3013d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$disp,%rsi
3014d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3015d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3016d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3017d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3018d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	40(%rsi),%r10		# disp->ContextRecord
3019d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	56(%rsi),%r11		# &disp->HandlerData
3020d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3021d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r10,32(%rsp)		# arg5
3022d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r11,40(%rsp)		# arg6
3023d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r12,48(%rsp)		# arg7
3024d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rcx,56(%rsp)		# arg8, (NULL)
3025d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	*__imp_RtlVirtualUnwind(%rip)
3026d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3027d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	\$1,%eax		# ExceptionContinueSearch
3028d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$64,%rsp
3029d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	popfq
3030d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%r15
3031d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%r14
3032d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%r13
3033d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%r12
3034d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%rbp
3035d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%rbx
3036d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%rdi
3037d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%rsi
3038d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
3039d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	se_handler,.-se_handler
3040d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3041d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.section	.pdata
3042d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	4
3043d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
3044d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($ecb);
3045d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lecb_enc_prologue
3046d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lecb_enc_epilogue
3047d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lecb_enc_info
3048d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3049d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lecb_dec_prologue
3050d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lecb_dec_epilogue
3051d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lecb_dec_info
3052d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
3053d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
3054d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lcbc_dec_prologue
3055d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lcbc_dec_epilogue
3056d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lcbc_dec_info
3057d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3058d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lctr_enc_prologue
3059d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lctr_enc_epilogue
3060d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lctr_enc_info
3061d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3062d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lxts_enc_prologue
3063d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lxts_enc_epilogue
3064d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lxts_enc_info
3065d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3066d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lxts_dec_prologue
3067d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lxts_dec_epilogue
3068d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lxts_dec_info
3069d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3070d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.section	.xdata
3071d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	8
3072d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
3073d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($ecb);
3074d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_enc_info:
3075d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.byte	9,0,0,0
3076d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	se_handler
3077d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3078a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.rva	.Lecb_enc_tail
3079a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.long	0
3080d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lecb_dec_info:
3081d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.byte	9,0,0,0
3082d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	se_handler
3083d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3084a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.rva	.Lecb_dec_tail
3085a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.long	0
3086d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
3087d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
3088d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lcbc_dec_info:
3089d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.byte	9,0,0,0
3090d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	se_handler
3091d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3092a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.rva	.Lcbc_dec_tail
3093a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.long	0
3094d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lctr_enc_info:
3095d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.byte	9,0,0,0
3096d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	se_handler
3097d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3098a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.rva	.Lctr_enc_tail
3099a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.long	0
3100d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_enc_info:
3101d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.byte	9,0,0,0
3102d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	se_handler
3103d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3104a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.rva	.Lxts_enc_tail
3105a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.long	0
3106d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lxts_dec_info:
3107d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.byte	9,0,0,0
3108d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	se_handler
3109d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3110a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.rva	.Lxts_dec_tail
3111a94fe0531b3c196ad078174259af2201b2e3a246Robert Sloan	.long	0
3112d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
3113d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
3114d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Final pass: replace every `expr` span in the accumulated text with the
# result of evaluating expr as Perl (e.g. `1232/8`), then write the
# generated assembly to STDOUT.
3115d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3116d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
3117d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyprint $code;
3118d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Buffered write errors (full disk, closed pipe) are only reported when the
# handle is closed; fail loudly instead of leaving truncated output behind
# with a zero exit status.
3119d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyclose STDOUT or die "error closing STDOUT: $!";
3120