1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
13# details].
14#
15# Performance.
16#
17# Given aes(enc|dec) instructions' latency asymptotic performance for
18# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
19# processed with 128-bit key. And given their throughput asymptotic
20# performance for parallelizable modes is 1.25 cycles per byte. Being
21# asymptotic limit it's not something you commonly achieve in reality,
22# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
24# decryption.
25#
26#	16-byte     64-byte     256-byte    1-KB        8-KB
27# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
28# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
29# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
30# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
31# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
32# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
33#
34# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
35# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
36# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
37# The results were collected with specially crafted speed.c benchmark
38# in order to compare them with results reported in "Intel Advanced
39# Encryption Standard (AES) New Instruction Set" White Paper Revision
40# 3.0 dated May 2010. All above results are consistently better. This
41# module also provides better performance for block sizes smaller than
42# 128 bytes in points *not* represented in the above table.
43#
44# Looking at the results for 8-KB buffer.
45#
46# CFB and OFB results are far from the limit, because implementation
47# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
48# single-block aesni_encrypt, which is not the most optimal way to go.
49# CBC encrypt result is unexpectedly high and there is no documented
50# explanation for it. Seemingly there is a small penalty for feeding
51# the result back to AES unit the way it's done in CBC mode. There is
52# nothing one can do and the result appears optimal. CCM result is
53# identical to CBC, because CBC-MAC is essentially CBC encrypt without
54# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
57# disjointly. Parallelizable modes practically achieve the theoretical
58# limit.
59#
60# Looking at how results vary with buffer size.
61#
62# Curves are practically saturated at 1-KB buffer size. In most cases
63# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
64# CTR curve doesn't follow this pattern and is "slowest" changing one
65# with "256-byte" result being 87% of "8-KB." This is because overhead
66# in CTR mode is most computationally intensive. Small-block CCM
67# decrypt is slower than encrypt, because first CTR and last CBC-MAC
68# iterations can't be interleaved.
69#
70# Results for 192- and 256-bit keys.
71#
72# EVP-free results were observed to scale perfectly with number of
73# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
74# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
75# are a tad smaller, because the above mentioned penalty biases all
76# results by same constant value. In similar way function call
77# overhead affects small-block performance, as well as OFB and CFB
78# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
81
82# January 2011
83#
84# While Westmere processor features 6 cycles latency for aes[enc|dec]
85# instructions, which can be scheduled every second cycle, Sandy
86# Bridge spends 8 cycles per instruction, but it can schedule them
87# every cycle. This means that code targeting Westmere would perform
88# suboptimally on Sandy Bridge. Therefore this update.
89#
90# In addition, non-parallelizable CBC encrypt (as well as CCM) is
91# optimized. Relative improvement might appear modest, 8% on Westmere,
92# but in absolute terms it's 3.77 cycles per byte encrypted with
93# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
94# should be compared to asymptotic limits of 3.75 for Westmere and
95# 5.00 for Sandy Bridge. Actually, the fact that they get this close
96# to asymptotic limits is quite amazing. Indeed, the limit is
97# calculated as latency times number of rounds, 10 for 128-bit key,
98# and divided by 16, the number of bytes in block, or in other words
99# it accounts *solely* for aesenc instructions. But there are extra
100# instructions, and numbers so close to the asymptotic limits mean
101# that it's as if it takes as little as *one* additional cycle to
102# execute all of them. How is it possible? It is possible thanks to
103# out-of-order execution logic, which manages to overlap post-
104# processing of previous block, things like saving the output, with
105# actual encryption of current block, as well as pre-processing of
106# current block, things like fetching input and xor-ing it with
107# 0-round element of the key schedule, with actual encryption of
108# previous block. Keep this in mind...
109#
110# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
111# performance is achieved by interleaving instructions working on
112# independent blocks. In which case asymptotic limit for such modes
113# can be obtained by dividing above mentioned numbers by AES
114# instructions' interleave factor. Westmere can execute at most 3
115# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
117# interleave factor" means that increase of interleave factor does
118# not improve performance. The formula has proven to reflect reality
119# pretty well on Westmere... Sandy Bridge on the other hand can
120# execute up to 8 AES instructions at a time, so how does varying
121# interleave factor affect the performance? Here is table for ECB
122# (numbers are cycles per byte processed with 128-bit key):
123#
124# instruction interleave factor		3x	6x	8x
125# theoretical asymptotic limit		1.67	0.83	0.625
126# measured performance for 8KB block	1.05	0.86	0.84
127#
128# "as if" interleave factor		4.7x	5.8x	6.0x
129#
130# Further data for other parallelizable modes:
131#
132# CBC decrypt				1.16	0.93	0.93
133# CTR					1.14	0.91	n/a
134#
135# Well, given 3x column it's probably inappropriate to call the limit
136# asymptotic, if it can be surpassed, isn't it? What happens there?
137# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
138# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
142# of additional cycles, but in 8x case number of instructions must be
143# a tad too high for out-of-order logic to cope with, and AES unit
144# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
147#
148# Higher interleave factors do have negative impact on Westmere
149# performance. While for ECB mode it's negligible ~1.5%, other
150# parallelizables perform ~5% worse, which is outweighed by ~25%
151# improvement on Sandy Bridge. To balance regression on Westmere
152# CTR mode was implemented with 6x aesenc interleave factor.
153
154# April 2011
155#
156# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
157# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
158# in CTR mode AES instruction interleave factor was chosen to be 6x.
159
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line is "[flavour] output". A lone argument that contains a
# dot is taken to be the output file name and the flavour is left
# undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is selected either by the flavour or by a
# .asm output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the perlasm translator next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed from here on is piped through the translator.
# The interpreter and translator paths are quoted so that directories
# containing spaces survive the shell, and a failure to start the pipe
# is fatal instead of silently producing untranslated perlasm.
# $flavour and $output stay unquoted on purpose: either may be empty,
# and an empty quoted word would shift the translator's arguments.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";

# Both alternatives are currently the same instruction; the selector is
# kept so that the "AES" flavour can be given its own key-load opcode.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
182
# General-purpose register assignment. $rounds and $key are inputs to
# aesni_[en|de]cryptN and are changed by them, which is why backup
# copies ($rnds_, $key_) are reserved below.
$rounds="%eax";

# Natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...;
# $ivp is the extra pointer argument of cbc, ctr, ...
($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");

# Backup copies for $rounds and $key respectively.
($rnds_,$key_)=("%r10d","%r11");

# %xmm register layout: two round-key registers, then eight data lanes.
($rndkey0,$rndkey1)=map("%xmm$_",(0..1));
($inout0,$inout1,$inout2,$inout3,
 $inout4,$inout5,$inout6,$inout7)=map("%xmm$_",(2..9));

# Names used in CBC decrypt, CTR, ...; they alias $inout4..$inout7.
($in2,$in1,$in0,$iv)=map("%xmm$_",(6..9));
# Inline version of internal aesni_[en|de]crypt1.
#
# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends a one-block
# cipher sequence to $code:
#   $p      - "enc" or "dec", selects aes$p / aes${p}last;
#   $key    - register pointing at the key schedule, advanced by the
#             emitted code;
#   $rounds - register holding the round count, counted down to zero;
#   $inout  - data register, $inout0 when omitted;
#   $ivec   - optional register; when given, round key 0 is xor-ed into
#             $ivec and $ivec into $inout, instead of xor-ing round
#             key 0 straight into $inout.
# Every invocation gets its own loop label through the $sn counter.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;
my $loop;

$inout=$inout0 unless (defined($inout));
$loop=".Loop_${p}1_".(++$sn);

# fetch round keys 0 and 1
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
# apply round key 0, optionally folding $ivec in along the way
if (defined($ivec)) {
$code.=<<___;
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
} else {
$code.=<<___;
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
}
# one round per iteration, the final round after the loop
$code.=<<___;
$loop:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	$loop	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points. Both are declared abi-omnipotent, so the
# argument registers are taken from @_4args, which was filled in
# according to the target calling convention. Each function loads one
# block into $inout0, fetches the round count from offset 240 of the
# key structure, runs the inline 1x cipher and stores the result.
#
{ my ($inp,$out,$key) = @_4args;
  my ($enc,$dec) = ("${PREFIX}_encrypt","${PREFIX}_decrypt");

$code.=<<___;
.globl	$enc
.type	$enc,\@abi-omnipotent
.align	16
$enc:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	$enc,.-$enc

.globl	$dec
.type	$dec,\@abi-omnipotent
.align	16
$dec:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	$dec, .-$dec
___
}
267
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x...
#
# aesni_generate3($dir) appends _aesni_${dir}rypt3 to $code, $dir being
# "enc" or "dec". The emitted subroutine takes in $key and $rounds,
# which are *not* preserved; $inout[0-2] is cipher/clear text. $rounds
# is halved on entry because the loop applies two round keys per
# iteration.
sub aesni_generate3 {
my $dir=shift;
my ($aes,$aeslast)=("aes${dir}","aes${dir}last");
my ($func,$loop)=("_aesni_${dir}rypt3",".L${dir}_loop3");

$code.=<<___;
.type	$func,\@abi-omnipotent
.align	16
$func:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey		($key),$rndkey0

$loop:
	$aes	$rndkey1,$inout0
	$aes	$rndkey1,$inout1
	dec		$rounds
	$aes	$rndkey1,$inout2
	$movkey		16($key),$rndkey1
	$aes	$rndkey0,$inout0
	$aes	$rndkey0,$inout1
	lea		32($key),$key
	$aes	$rndkey0,$inout2
	$movkey		($key),$rndkey0
	jnz		$loop

	$aes	$rndkey1,$inout0
	$aes	$rndkey1,$inout1
	$aes	$rndkey1,$inout2
	$aeslast	$rndkey0,$inout0
	$aeslast	$rndkey0,$inout1
	$aeslast	$rndkey0,$inout2
	ret
.size	$func,.-$func
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4($dir) appends _aesni_${dir}rypt4 to $code, $dir being
# "enc" or "dec". The emitted subroutine takes in $key and $rounds,
# which are *not* preserved; $inout[0-3] is cipher/clear text.
sub aesni_generate4 {
my $dir=shift;
my ($aes,$aeslast)=("aes${dir}","aes${dir}last");
my ($func,$loop)=("_aesni_${dir}rypt4",".L${dir}_loop4");

$code.=<<___;
.type	$func,\@abi-omnipotent
.align	16
$func:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	($key),$rndkey0

$loop:
	$aes	$rndkey1,$inout0
	$aes	$rndkey1,$inout1
	dec		$rounds
	$aes	$rndkey1,$inout2
	$aes	$rndkey1,$inout3
	$movkey		16($key),$rndkey1
	$aes	$rndkey0,$inout0
	$aes	$rndkey0,$inout1
	lea		32($key),$key
	$aes	$rndkey0,$inout2
	$aes	$rndkey0,$inout3
	$movkey		($key),$rndkey0
	jnz		$loop

	$aes	$rndkey1,$inout0
	$aes	$rndkey1,$inout1
	$aes	$rndkey1,$inout2
	$aes	$rndkey1,$inout3
	$aeslast	$rndkey0,$inout0
	$aeslast	$rndkey0,$inout1
	$aeslast	$rndkey0,$inout2
	$aeslast	$rndkey0,$inout3
	ret
.size	$func,.-$func
___
}
# aesni_generate6($dir) appends _aesni_${dir}rypt6 to $code, $dir being
# "enc" or "dec". The emitted subroutine takes in $key and $rounds,
# which are *not* preserved; $inout[0-5] is cipher/clear text.
#
# Unlike the 3x and 4x variants, the first round is issued while the
# remaining lanes are still being xor-ed with round key 0, after which
# control jumps into the middle of the loop.
sub aesni_generate6 {
my $dir=shift;
my ($aes,$aeslast)=("aes${dir}","aes${dir}last");
my $func="_aesni_${dir}rypt6";
my ($loop,$enter)=(".L${dir}_loop6",".L${dir}_loop6_enter");

$code.=<<___;
.type	$func,\@abi-omnipotent
.align	16
$func:
	$movkey		($key),$rndkey0
	shr		\$1,$rounds
	$movkey		16($key),$rndkey1
	lea		32($key),$key
	xorps		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	$aes	$rndkey1,$inout0
	pxor		$rndkey0,$inout2
	$aes	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	$aes	$rndkey1,$inout2
	pxor		$rndkey0,$inout4
	$aes	$rndkey1,$inout3
	pxor		$rndkey0,$inout5
	dec		$rounds
	$aes	$rndkey1,$inout4
	$movkey		($key),$rndkey0
	$aes	$rndkey1,$inout5
	jmp		$enter
.align	16
$loop:
	$aes	$rndkey1,$inout0
	$aes	$rndkey1,$inout1
	dec		$rounds
	$aes	$rndkey1,$inout2
	$aes	$rndkey1,$inout3
	$aes	$rndkey1,$inout4
	$aes	$rndkey1,$inout5
$enter:				# happens to be 16-byte aligned
	$movkey		16($key),$rndkey1
	$aes	$rndkey0,$inout0
	$aes	$rndkey0,$inout1
	lea		32($key),$key
	$aes	$rndkey0,$inout2
	$aes	$rndkey0,$inout3
	$aes	$rndkey0,$inout4
	$aes	$rndkey0,$inout5
	$movkey		($key),$rndkey0
	jnz		$loop

	$aes	$rndkey1,$inout0
	$aes	$rndkey1,$inout1
	$aes	$rndkey1,$inout2
	$aes	$rndkey1,$inout3
	$aes	$rndkey1,$inout4
	$aes	$rndkey1,$inout5
	$aeslast	$rndkey0,$inout0
	$aeslast	$rndkey0,$inout1
	$aeslast	$rndkey0,$inout2
	$aeslast	$rndkey0,$inout3
	$aeslast	$rndkey0,$inout4
	$aeslast	$rndkey0,$inout5
	ret
.size	$func,.-$func
___
}
# aesni_generate8($dir) appends _aesni_${dir}rypt8 to $code, $dir being
# "enc" or "dec". The emitted subroutine takes in $key and $rounds,
# which are *not* preserved; $inout[0-7] is cipher/clear text.
#
# As in the 6x variant the first round overlaps with the xor of round
# key 0 into the remaining lanes, and the loop is entered in the middle.
# Here the odd round key is fetched before the entry label rather than
# after it.
sub aesni_generate8 {
my $dir=shift;
my ($aes,$aeslast)=("aes${dir}","aes${dir}last");
my $func="_aesni_${dir}rypt8";
my ($loop,$enter)=(".L${dir}_loop8",".L${dir}_loop8_enter");

$code.=<<___;
.type	$func,\@abi-omnipotent
.align	16
$func:
	$movkey		($key),$rndkey0
	shr		\$1,$rounds
	$movkey		16($key),$rndkey1
	lea		32($key),$key
	xorps		$rndkey0,$inout0
	xorps		$rndkey0,$inout1
	$aes	$rndkey1,$inout0
	pxor		$rndkey0,$inout2
	$aes	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	$aes	$rndkey1,$inout2
	pxor		$rndkey0,$inout4
	$aes	$rndkey1,$inout3
	pxor		$rndkey0,$inout5
	dec		$rounds
	$aes	$rndkey1,$inout4
	pxor		$rndkey0,$inout6
	$aes	$rndkey1,$inout5
	pxor		$rndkey0,$inout7
	$movkey		($key),$rndkey0
	$aes	$rndkey1,$inout6
	$aes	$rndkey1,$inout7
	$movkey		16($key),$rndkey1
	jmp		$enter
.align	16
$loop:
	$aes	$rndkey1,$inout0
	$aes	$rndkey1,$inout1
	dec		$rounds
	$aes	$rndkey1,$inout2
	$aes	$rndkey1,$inout3
	$aes	$rndkey1,$inout4
	$aes	$rndkey1,$inout5
	$aes	$rndkey1,$inout6
	$aes	$rndkey1,$inout7
	$movkey		16($key),$rndkey1
$enter:				# happens to be 16-byte aligned
	$aes	$rndkey0,$inout0
	$aes	$rndkey0,$inout1
	lea		32($key),$key
	$aes	$rndkey0,$inout2
	$aes	$rndkey0,$inout3
	$aes	$rndkey0,$inout4
	$aes	$rndkey0,$inout5
	$aes	$rndkey0,$inout6
	$aes	$rndkey0,$inout7
	$movkey		($key),$rndkey0
	jnz		$loop

	$aes	$rndkey1,$inout0
	$aes	$rndkey1,$inout1
	$aes	$rndkey1,$inout2
	$aes	$rndkey1,$inout3
	$aes	$rndkey1,$inout4
	$aes	$rndkey1,$inout5
	$aes	$rndkey1,$inout6
	$aes	$rndkey1,$inout7
	$aeslast	$rndkey0,$inout0
	$aeslast	$rndkey0,$inout1
	$aeslast	$rndkey0,$inout2
	$aeslast	$rndkey0,$inout3
	$aeslast	$rndkey0,$inout4
	$aeslast	$rndkey0,$inout5
	$aeslast	$rndkey0,$inout6
	$aeslast	$rndkey0,$inout7
	ret
.size	$func,.-$func
___
}
# Emit the private interleaved subroutines in order of interleave
# factor. The decrypt flavour is always generated; the encrypt flavour
# only when the module is built under its native "aesni" prefix.
foreach my $generate (\&aesni_generate3,\&aesni_generate4,
		      \&aesni_generate6,\&aesni_generate8) {
	$generate->("enc") if ($PREFIX eq "aesni");
	$generate->("dec");
}
516
517if ($PREFIX eq "aesni") {
########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#			  size_t length, const AES_KEY *key,
#			  int enc);
#
# ECB bulk routine. length is rounded down to a multiple of 16 and the
# routine returns at once when nothing is left. A non-zero enc selects
# the encrypt path, zero the decrypt path.
#
# Inputs of 0x80 bytes and more are processed eight blocks at a time by
# _aesni_[en|de]crypt8; inside the loop the stores of the previous
# group are interleaved with the loads of the next one. A remainder of
# 1..7 blocks is dispatched as follows: 1 - inline 1x code, 2 - 3x
# subroutine with the third lane zeroed, 3 - 3x, 4 - 4x, 5 - 6x with
# the sixth lane zeroed, 6 - 6x, 7 - 8x with only seven lanes stored.
#
# Cipher subroutines modify $key and $rounds, therefore both are backed
# up in $key_ and $rnds_ and restored before every subsequent call.
#
# NOTE(review): the 8x paths use $inout4..$inout7, i.e. %xmm6-%xmm9,
# and no Win64-specific save/restore of these registers is emitted in
# this function, whereas aesni_ccm64_*_blocks below does spill them
# under $win64 - confirm whether that is intended.
$code.=<<___;
.globl	aesni_ecb_encrypt
.type	aesni_ecb_encrypt,\@function,5
.align	16
aesni_ecb_encrypt:
	and	\$-16,$len
	jz	.Lecb_ret

	mov	240($key),$rounds	# key->rounds
	$movkey	($key),$rndkey0
	mov	$key,$key_		# backup $key
	mov	$rounds,$rnds_		# backup $rounds
	test	%r8d,%r8d		# 5th argument
	jz	.Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
	cmp	\$0x80,$len
	jb	.Lecb_enc_tail

	movdqu	($inp),$inout0
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
	sub	\$0x80,$len
	jmp	.Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
.Lecb_enc_loop8_enter:

	call	_aesni_encrypt8

	sub	\$0x80,$len
	jnc	.Lecb_enc_loop8

	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	add	\$0x80,$len
	jz	.Lecb_ret

.Lecb_enc_tail:
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_enc_one
	movups	0x10($inp),$inout1
	je	.Lecb_enc_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_enc_three
	movups	0x30($inp),$inout3
	je	.Lecb_enc_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_enc_five
	movups	0x50($inp),$inout5
	je	.Lecb_enc_six
	movdqu	0x60($inp),$inout6
	call	_aesni_encrypt8
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_one:
___
	# single trailing block: inline 1x encrypt operating on $inout0
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_two:
	xorps	$inout2,$inout2
	call	_aesni_encrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_three:
	call	_aesni_encrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_four:
	call	_aesni_encrypt4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_five:
	xorps	$inout5,$inout5
	call	_aesni_encrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_six:
	call	_aesni_encrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	jmp	.Lecb_ret
#--------------------------- ECB DECRYPT ------------------------------#
.align	16
.Lecb_decrypt:
	cmp	\$0x80,$len
	jb	.Lecb_dec_tail

	movdqu	($inp),$inout0
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
	sub	\$0x80,$len
	jmp	.Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp
.Lecb_dec_loop8_enter:

	call	_aesni_decrypt8

	$movkey	($key_),$rndkey0
	sub	\$0x80,$len
	jnc	.Lecb_dec_loop8

	movups	$inout0,($out)
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out
	add	\$0x80,$len
	jz	.Lecb_ret

.Lecb_dec_tail:
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_dec_one
	movups	0x10($inp),$inout1
	je	.Lecb_dec_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_dec_three
	movups	0x30($inp),$inout3
	je	.Lecb_dec_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_dec_five
	movups	0x50($inp),$inout5
	je	.Lecb_dec_six
	movups	0x60($inp),$inout6
	$movkey	($key),$rndkey0
	call	_aesni_decrypt8
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_one:
___
	# single trailing block: inline 1x decrypt operating on $inout0
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_two:
	xorps	$inout2,$inout2
	call	_aesni_decrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_three:
	call	_aesni_decrypt3
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_four:
	call	_aesni_decrypt4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_five:
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	jmp	.Lecb_ret
.align	16
.Lecb_dec_six:
	call	_aesni_decrypt6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)

.Lecb_ret:
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___
810
811{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Register roles in both functions: $inout0 carries the counter block
# being encrypted, $inout1 the running CBC-MAC value loaded from and
# written back to *cmac, $in0 the current data block and $iv the
# counter after a pshufb with .Lbswap_mask. The counter is advanced
# with paddq by .Lincrement64 and shuffled again to form the next
# $inout0 (both constants are defined outside this part of the file).
#
{
my $cmac="%r9";	# 6th argument

my $increment="%xmm6";
my $bswap_mask="%xmm7";

$code.=<<___;
.globl	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,\@function,6
.align	16
aesni_ccm64_encrypt_blocks:
___
# Win64 only: spill %xmm6-%xmm9, which serve as $increment, $bswap_mask,
# $in0 and $iv below, into a 0x58-byte stack frame.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lccm64_enc_body:
___
# Encrypt: every pass of .Lccm64_enc_outer handles one block. The
# counter block and the CBC-MAC value (xor-ed with the input block) are
# run through the cipher side by side in .Lccm64_enc2_loop, which
# applies two round keys per iteration - hence $rounds is halved once
# up front and that halved value is what $rnds_ keeps. $key_ keeps the
# start of the key schedule, $key walks through it.
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shr	\$1,$rounds
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	mov	$rounds,$rnds_
	pshufb	$bswap_mask,$iv
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	$rnds_,$rounds
	movups	($inp),$in0			# load inp

	xorps	$rndkey0,$inout0		# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	lea	32($key_),$key
	xorps	$rndkey0,$inout1		# cmac^=inp
	$movkey	($key),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	dec	$rounds
	aesenc	$rndkey1,$inout1
	$movkey	16($key),$rndkey1
	aesenc	$rndkey0,$inout0
	lea	32($key),$key
	aesenc	$rndkey0,$inout1
	$movkey	0($key),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	dec	$len
	lea	16($inp),$inp
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	lea	16($out),$out
	pshufb	$bswap_mask,$inout0
	jnz	.Lccm64_enc_outer

	movups	$inout1,($cmac)
___
# Win64 only: reload the spilled registers and release the frame.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
$code.=<<___;
.globl	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,\@function,6
.align	16
aesni_ccm64_decrypt_blocks:
___
# Win64 only: same register spill as in the encrypt function.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lccm64_dec_body:
___
# Decrypt: the MAC is computed over the decrypted output, which is not
# available until the counter block has been encrypted. Therefore the
# first counter block is encrypted on its own by the inline 1x code
# below. Here $rnds_ keeps the full round count; it is halved anew on
# each pass of the outer loop.
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	&aesni_generate1("enc",$key,$rounds);
# Every pass of .Lccm64_dec_outer produces one output block and, unless
# it was the last one, encrypts the next counter block side by side
# with the MAC update for the block just produced.
$code.=<<___;
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	mov	$rnds_,$rounds
	movups	$in0,($out)			# save output
	lea	16($out),$out
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len
	jz	.Lccm64_dec_break

	$movkey	($key_),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	lea	32($key_),$key
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1			# cmac^=out
	$movkey	($key),$rndkey0

.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	dec	$rounds
	aesenc	$rndkey1,$inout1
	$movkey	16($key),$rndkey1
	aesenc	$rndkey0,$inout0
	lea	32($key),$key
	aesenc	$rndkey0,$inout1
	$movkey	0($key),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	16($inp),$inp
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	#xorps	$in0,$inout1			# cmac^=out
___
	# Last MAC update, not interleaved with anything: passing $in0 as
	# the ivec argument makes the inline code xor round key 0 into $in0
	# and $in0 into $inout1, which is why the separate "cmac^=out" above
	# is commented out. $key_ is used as the key pointer and is
	# advanced by the emitted code; $rounds was restored from $rnds_ at
	# the top of the outer loop.
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
	movups	$inout1,($cmac)
___
# Win64 only: reload the spilled registers and release the frame.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
1000######################################################################
1001# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1002#                         size_t blocks, const AES_KEY *key,
1003#                         const char *ivec);
1004#
1005# Handles only complete blocks, operates on 32-bit counter and
1006# does not update *ivec! (see engine/eng_aesni.c for details)
1007#
{
# Code generator for aesni_ctr32_encrypt_blocks.
#
# The emitted routine produces key stream for six counter blocks per
# iteration with an inlined, interleaved copy of _aesni_encrypt6 and
# finishes one to five leftover blocks through the one/two/three/four
# tails or a final _aesni_encrypt6 call.  A request for exactly one
# block takes .Lctr32_one_shortcut and encrypts the caller's ivec as is.
#
# $reserved is the %rsp-relative offset of a 32-byte scratch area that
# keeps both counter vectors in host byte order ($iv0 at +0x00, $iv1 at
# +0x10).  On Win64 it is the bottom of the 0xc8-byte frame allocated
# in the prologue; otherwise no frame is allocated and the area lives
# at a negative offset from %rsp.
my $reserved = $win64?0:-0x28;
# Input blocks that get xor-ed with the key stream.
my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
# $iv0/$iv1 carry three consecutive 32-bit counters each; $ivec is the
# caller's IV with its counter dword wiped.
my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
my $bswap_mask="%xmm15";

# Symbol definition and entry label.
$code.=<<___;
.globl	aesni_ctr32_encrypt_blocks
.type	aesni_ctr32_encrypt_blocks,\@function,5
.align	16
aesni_ctr32_encrypt_blocks:
___
# Win64 only: allocate the frame and save %xmm6-%xmm15 above the
# counter scratch area.
$code.=<<___ if ($win64);
	lea	-0xc8(%rsp),%rsp
	movaps	%xmm6,0x20(%rsp)
	movaps	%xmm7,0x30(%rsp)
	movaps	%xmm8,0x40(%rsp)
	movaps	%xmm9,0x50(%rsp)
	movaps	%xmm10,0x60(%rsp)
	movaps	%xmm11,0x70(%rsp)
	movaps	%xmm12,0x80(%rsp)
	movaps	%xmm13,0x90(%rsp)
	movaps	%xmm14,0xa0(%rsp)
	movaps	%xmm15,0xb0(%rsp)
.Lctr32_body:
___
# Counter set-up, six-block main loop and the multi-block tails, up to
# and including the .Lctr32_one label.
$code.=<<___;
	cmp	\$1,$len
	je	.Lctr32_one_shortcut

	movdqu	($ivp),$ivec
	movdqa	.Lbswap_mask(%rip),$bswap_mask
	xor	$rounds,$rounds
	pextrd	\$3,$ivec,$rnds_		# pull 32-bit counter
	pinsrd	\$3,$rounds,$ivec		# wipe 32-bit counter

	mov	240($key),$rounds		# key->rounds
	bswap	$rnds_
	pxor	$iv0,$iv0			# vector of 3 32-bit counters
	pxor	$iv1,$iv1			# vector of 3 32-bit counters
	pinsrd	\$0,$rnds_,$iv0
	lea	3($rnds_),$key_
	pinsrd	\$0,$key_,$iv1
	inc	$rnds_
	pinsrd	\$1,$rnds_,$iv0
	inc	$key_
	pinsrd	\$1,$key_,$iv1
	inc	$rnds_
	pinsrd	\$2,$rnds_,$iv0
	inc	$key_
	pinsrd	\$2,$key_,$iv1
	movdqa	$iv0,$reserved(%rsp)
	pshufb	$bswap_mask,$iv0
	movdqa	$iv1,`$reserved+0x10`(%rsp)
	pshufb	$bswap_mask,$iv1

	pshufd	\$`3<<6`,$iv0,$inout0		# place counter to upper dword
	pshufd	\$`2<<6`,$iv0,$inout1
	pshufd	\$`1<<6`,$iv0,$inout2
	cmp	\$6,$len
	jb	.Lctr32_tail
	shr	\$1,$rounds
	mov	$key,$key_			# backup $key
	mov	$rounds,$rnds_			# backup $rounds
	sub	\$6,$len
	jmp	.Lctr32_loop6

.align	16
.Lctr32_loop6:
	pshufd	\$`3<<6`,$iv1,$inout3
	por	$ivec,$inout0			# merge counter-less ivec
	 $movkey	($key_),$rndkey0
	pshufd	\$`2<<6`,$iv1,$inout4
	por	$ivec,$inout1
	 $movkey	16($key_),$rndkey1
	pshufd	\$`1<<6`,$iv1,$inout5
	por	$ivec,$inout2
	por	$ivec,$inout3
	 xorps		$rndkey0,$inout0
	por	$ivec,$inout4
	por	$ivec,$inout5

	# inline _aesni_encrypt6 and interleave last rounds
	# with own code...

	pxor		$rndkey0,$inout1
	aesenc		$rndkey1,$inout0
	lea		32($key_),$key
	pxor		$rndkey0,$inout2
	aesenc		$rndkey1,$inout1
	 movdqa		.Lincrement32(%rip),$iv1
	pxor		$rndkey0,$inout3
	aesenc		$rndkey1,$inout2
	 movdqa		$reserved(%rsp),$iv0
	pxor		$rndkey0,$inout4
	aesenc		$rndkey1,$inout3
	pxor		$rndkey0,$inout5
	$movkey		($key),$rndkey0
	dec		$rounds
	aesenc		$rndkey1,$inout4
	aesenc		$rndkey1,$inout5
	jmp		.Lctr32_enc_loop6_enter
.align	16
.Lctr32_enc_loop6:
	aesenc		$rndkey1,$inout0
	aesenc		$rndkey1,$inout1
	dec		$rounds
	aesenc		$rndkey1,$inout2
	aesenc		$rndkey1,$inout3
	aesenc		$rndkey1,$inout4
	aesenc		$rndkey1,$inout5
.Lctr32_enc_loop6_enter:
	$movkey		16($key),$rndkey1
	aesenc		$rndkey0,$inout0
	aesenc		$rndkey0,$inout1
	lea		32($key),$key
	aesenc		$rndkey0,$inout2
	aesenc		$rndkey0,$inout3
	aesenc		$rndkey0,$inout4
	aesenc		$rndkey0,$inout5
	$movkey		($key),$rndkey0
	jnz		.Lctr32_enc_loop6

	aesenc		$rndkey1,$inout0
	 paddd		$iv1,$iv0		# increment counter vector
	aesenc		$rndkey1,$inout1
	 paddd		`$reserved+0x10`(%rsp),$iv1
	aesenc		$rndkey1,$inout2
	 movdqa		$iv0,$reserved(%rsp)	# save counter vector
	aesenc		$rndkey1,$inout3
	 movdqa		$iv1,`$reserved+0x10`(%rsp)
	aesenc		$rndkey1,$inout4
	 pshufb		$bswap_mask,$iv0	# byte swap
	aesenc		$rndkey1,$inout5
	 pshufb		$bswap_mask,$iv1

	aesenclast	$rndkey0,$inout0
	 movups		($inp),$in0		# load input
	aesenclast	$rndkey0,$inout1
	 movups		0x10($inp),$in1
	aesenclast	$rndkey0,$inout2
	 movups		0x20($inp),$in2
	aesenclast	$rndkey0,$inout3
	 movups		0x30($inp),$in3
	aesenclast	$rndkey0,$inout4
	 movups		0x40($inp),$rndkey1
	aesenclast	$rndkey0,$inout5
	 movups		0x50($inp),$rndkey0
	 lea	0x60($inp),$inp

	xorps	$inout0,$in0			# xor
	 pshufd	\$`3<<6`,$iv0,$inout0
	xorps	$inout1,$in1
	 pshufd	\$`2<<6`,$iv0,$inout1
	movups	$in0,($out)			# store output
	xorps	$inout2,$in2
	 pshufd	\$`1<<6`,$iv0,$inout2
	movups	$in1,0x10($out)
	xorps	$inout3,$in3
	movups	$in2,0x20($out)
	xorps	$inout4,$rndkey1
	movups	$in3,0x30($out)
	xorps	$inout5,$rndkey0
	movups	$rndkey1,0x40($out)
	movups	$rndkey0,0x50($out)
	lea	0x60($out),$out
	mov	$rnds_,$rounds
	sub	\$6,$len
	jnc	.Lctr32_loop6

	add	\$6,$len
	jz	.Lctr32_done
	mov	$key_,$key			# restore $key
	lea	1($rounds,$rounds),$rounds	# restore original value

.Lctr32_tail:
	por	$ivec,$inout0
	movups	($inp),$in0
	cmp	\$2,$len
	jb	.Lctr32_one

	por	$ivec,$inout1
	movups	0x10($inp),$in1
	je	.Lctr32_two

	pshufd	\$`3<<6`,$iv1,$inout3
	por	$ivec,$inout2
	movups	0x20($inp),$in2
	cmp	\$4,$len
	jb	.Lctr32_three

	pshufd	\$`2<<6`,$iv1,$inout4
	por	$ivec,$inout3
	movups	0x30($inp),$in3
	je	.Lctr32_four

	por	$ivec,$inout4
	xorps	$inout5,$inout5

	call	_aesni_encrypt6

	movups	0x40($inp),$rndkey1
	xorps	$inout0,$in0
	xorps	$inout1,$in1
	movups	$in0,($out)
	xorps	$inout2,$in2
	movups	$in1,0x10($out)
	xorps	$inout3,$in3
	movups	$in2,0x20($out)
	xorps	$inout4,$rndkey1
	movups	$in3,0x30($out)
	movups	$rndkey1,0x40($out)
	jmp	.Lctr32_done

.align	16
.Lctr32_one_shortcut:
	movups	($ivp),$inout0
	movups	($inp),$in0
	mov	240($key),$rounds		# key->rounds
.Lctr32_one:
___
	# Single-block encryption of the counter block, shared by the
	# shortcut entry and the one-block tail.
	&aesni_generate1("enc",$key,$rounds);
# One-block store followed by the two-, three- and four-block tails.
$code.=<<___;
	xorps	$inout0,$in0
	movups	$in0,($out)
	jmp	.Lctr32_done

.align	16
.Lctr32_two:
	xorps	$inout2,$inout2
	call	_aesni_encrypt3
	xorps	$inout0,$in0
	xorps	$inout1,$in1
	movups	$in0,($out)
	movups	$in1,0x10($out)
	jmp	.Lctr32_done

.align	16
.Lctr32_three:
	call	_aesni_encrypt3
	xorps	$inout0,$in0
	xorps	$inout1,$in1
	movups	$in0,($out)
	xorps	$inout2,$in2
	movups	$in1,0x10($out)
	movups	$in2,0x20($out)
	jmp	.Lctr32_done

.align	16
.Lctr32_four:
	call	_aesni_encrypt4
	xorps	$inout0,$in0
	xorps	$inout1,$in1
	movups	$in0,($out)
	xorps	$inout2,$in2
	movups	$in1,0x10($out)
	xorps	$inout3,$in3
	movups	$in2,0x20($out)
	movups	$in3,0x30($out)

.Lctr32_done:
___
# Win64 only: restore %xmm6-%xmm15 and release the frame.
$code.=<<___ if ($win64);
	movaps	0x20(%rsp),%xmm6
	movaps	0x30(%rsp),%xmm7
	movaps	0x40(%rsp),%xmm8
	movaps	0x50(%rsp),%xmm9
	movaps	0x60(%rsp),%xmm10
	movaps	0x70(%rsp),%xmm11
	movaps	0x80(%rsp),%xmm12
	movaps	0x90(%rsp),%xmm13
	movaps	0xa0(%rsp),%xmm14
	movaps	0xb0(%rsp),%xmm15
	lea	0xc8(%rsp),%rsp
.Lctr32_ret:
___
$code.=<<___;
	ret
.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}
1289
1290######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
1294#
1295{
# Register and frame assignments shared by the XTS encrypt and decrypt
# generators.
#
# @tweak holds six tweak values; its last element is the running tweak
# from which the following ones are derived.
my @tweak=map("%xmm$_",(10..15));
# $twtmp shares its register with $tweak[4]; a single element is taken
# with the scalar form rather than a one-element slice.
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",$tweak[4]);
# $ivp and $len_ are both %r9: the IV pointer is only read once to load
# the clear-text tweak, after which the register backs up the length.
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
# 0x60 bytes hold the six tweaks put aside at 16*0..16*5(%rsp); Win64
# adds 160 bytes for %xmm6-%xmm15, which are saved from 0x60(%rsp) up.
# NOTE(review): the remaining 8 bytes presumably keep %rsp 16-byte
# aligned for the movdqa/movaps accesses - confirm.
my $frame_size = 0x68 + ($win64?160:0);
1300
# Code generator for aesni_xts_encrypt.
#
# Flow of the emitted routine:
#   1. encrypt the caller's iv with key2 to obtain the initial tweak;
#   2. pre-compute four consecutive tweaks, then process six blocks per
#      .Lxts_enc_grandloop iteration, deriving the following tweaks
#      while the AES rounds are in flight;
#   3. handle one to five leftover whole blocks at .Lxts_enc_short;
#   4. if the length is not a multiple of 16, swap the trailing bytes
#      with the last output block at .Lxts_enc_steal and encrypt that
#      block once more.
$code.=<<___;
.globl	aesni_xts_encrypt
.type	aesni_xts_encrypt,\@function,6
.align	16
aesni_xts_encrypt:
	lea	-$frame_size(%rsp),%rsp
___
# Win64 only: save %xmm6-%xmm15 above the six tweak slots.
$code.=<<___ if ($win64);
	movaps	%xmm6,0x60(%rsp)
	movaps	%xmm7,0x70(%rsp)
	movaps	%xmm8,0x80(%rsp)
	movaps	%xmm9,0x90(%rsp)
	movaps	%xmm10,0xa0(%rsp)
	movaps	%xmm11,0xb0(%rsp)
	movaps	%xmm12,0xc0(%rsp)
	movaps	%xmm13,0xd0(%rsp)
	movaps	%xmm14,0xe0(%rsp)
	movaps	%xmm15,0xf0(%rsp)
.Lxts_enc_body:
___
# key2 is addressed through its symbolic name, as the decrypt generator
# does, instead of a hard-coded register.
$code.=<<___;
	movups	($ivp),@tweak[5]		# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
$code.=<<___;
	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	movdqa	.Lxts_magic(%rip),$twmask
	pxor	$twtmp,$twtmp
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
___
    # Emit the tweak-update sequence four times, filling tweak
    # registers 0..3 and leaving the fifth value in the running tweak.
    for my $i (0..3) {
    $code.=<<___;
	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[$i]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	pand	$twmask,$twres			# isolate carry and residue
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	pxor	$twres,@tweak[5]
___
    }
# Six-block main loop followed by the five-block tail, up to the point
# where the one-block tail needs a single-block encryption.
$code.=<<___;
	sub	\$16*6,$len
	jc	.Lxts_enc_short

	shr	\$1,$rounds
	sub	\$1,$rounds
	mov	$rounds,$rnds_
	jmp	.Lxts_enc_grandloop

.align	16
.Lxts_enc_grandloop:
	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	movdqu	`16*0`($inp),$inout0		# load input
	pand	$twmask,$twres			# isolate carry and residue
	movdqu	`16*1`($inp),$inout1
	pxor	$twres,@tweak[5]

	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[0],$inout0		# input^=tweak
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[2],$inout2
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp
	pxor	@tweak[3],$inout3
	$movkey		($key_),$rndkey0
	pxor	@tweak[4],$inout4
	pxor	@tweak[5],$inout5

	# inline _aesni_encrypt6 and interleave first and last rounds
	# with own code...
	$movkey		16($key_),$rndkey1
	pxor		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
	aesenc		$rndkey1,$inout0
	lea		32($key_),$key
	pxor		$rndkey0,$inout2
	 movdqa	@tweak[1],`16*1`(%rsp)
	aesenc		$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	 movdqa	@tweak[2],`16*2`(%rsp)
	aesenc		$rndkey1,$inout2
	pxor		$rndkey0,$inout4
	 movdqa	@tweak[3],`16*3`(%rsp)
	aesenc		$rndkey1,$inout3
	pxor		$rndkey0,$inout5
	$movkey		($key),$rndkey0
	dec		$rounds
	 movdqa	@tweak[4],`16*4`(%rsp)
	aesenc		$rndkey1,$inout4
	 movdqa	@tweak[5],`16*5`(%rsp)
	aesenc		$rndkey1,$inout5
	pxor	$twtmp,$twtmp
	pcmpgtd	@tweak[5],$twtmp
	jmp		.Lxts_enc_loop6_enter

.align	16
.Lxts_enc_loop6:
	aesenc		$rndkey1,$inout0
	aesenc		$rndkey1,$inout1
	dec		$rounds
	aesenc		$rndkey1,$inout2
	aesenc		$rndkey1,$inout3
	aesenc		$rndkey1,$inout4
	aesenc		$rndkey1,$inout5
.Lxts_enc_loop6_enter:
	$movkey		16($key),$rndkey1
	aesenc		$rndkey0,$inout0
	aesenc		$rndkey0,$inout1
	lea		32($key),$key
	aesenc		$rndkey0,$inout2
	aesenc		$rndkey0,$inout3
	aesenc		$rndkey0,$inout4
	aesenc		$rndkey0,$inout5
	$movkey		($key),$rndkey0
	jnz		.Lxts_enc_loop6

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesenc		$rndkey1,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesenc		$rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesenc		$rndkey1,$inout2
	pxor	$twres,@tweak[5]
	 aesenc		$rndkey1,$inout3
	 aesenc		$rndkey1,$inout4
	 aesenc		$rndkey1,$inout5
	 $movkey	16($key),$rndkey1

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesenc		$rndkey0,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesenc		$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesenc		$rndkey0,$inout2
	pxor	$twres,@tweak[5]
	 aesenc		$rndkey0,$inout3
	 aesenc		$rndkey0,$inout4
	 aesenc		$rndkey0,$inout5
	 $movkey	32($key),$rndkey0

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[1]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesenc		$rndkey1,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesenc		$rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesenc		$rndkey1,$inout2
	pxor	$twres,@tweak[5]
	 aesenc		$rndkey1,$inout3
	 aesenc		$rndkey1,$inout4
	 aesenc		$rndkey1,$inout5

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[2]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesenclast	$rndkey0,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesenclast	$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesenclast	$rndkey0,$inout2
	pxor	$twres,@tweak[5]
	 aesenclast	$rndkey0,$inout3
	 aesenclast	$rndkey0,$inout4
	 aesenclast	$rndkey0,$inout5

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[3]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
	pand	$twmask,$twres			# isolate carry and residue
	 xorps	`16*1`(%rsp),$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	pxor	$twres,@tweak[5]

	xorps	`16*2`(%rsp),$inout2
	movups	$inout0,`16*0`($out)		# write output
	xorps	`16*3`(%rsp),$inout3
	movups	$inout1,`16*1`($out)
	xorps	`16*4`(%rsp),$inout4
	movups	$inout2,`16*2`($out)
	xorps	`16*5`(%rsp),$inout5
	movups	$inout3,`16*3`($out)
	mov	$rnds_,$rounds			# restore $rounds
	movups	$inout4,`16*4`($out)
	movups	$inout5,`16*5`($out)
	lea	`16*6`($out),$out
	sub	\$16*6,$len
	jnc	.Lxts_enc_grandloop

	lea	3($rounds,$rounds),$rounds	# restore original value
	mov	$key_,$key			# restore $key
	mov	$rounds,$rnds_			# backup $rounds

.Lxts_enc_short:
	add	\$16*6,$len
	jz	.Lxts_enc_done

	cmp	\$0x20,$len
	jb	.Lxts_enc_one
	je	.Lxts_enc_two

	cmp	\$0x40,$len
	jb	.Lxts_enc_three
	je	.Lxts_enc_four

	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	 movdqu	($inp),$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 movdqu	16*1($inp),$inout1
	pxor	$twres,@tweak[5]

	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_encrypt6

	xorps	@tweak[0],$inout0
	movdqa	@tweak[5],@tweak[0]
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp
	xorps	@tweak[0],$inout0
___
	# One-block tail: single-block encryption with key1.
	&aesni_generate1("enc",$key,$rounds);
# Every tail leaves the next unused tweak in the first tweak register
# before reaching .Lxts_enc_done, where the stolen block consumes it.
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)
	lea	16*1($out),$out
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_encrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movups	$inout0,($out)
	movups	$inout1,16*1($out)
	lea	16*2($out),$out
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_encrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_encrypt4

	xorps	@tweak[0],$inout0
	movdqa	@tweak[5],@tweak[0]
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)
	xorps	@tweak[3],$inout3
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	movups	$inout3,16*3($out)
	lea	16*4($out),$out
	jmp	.Lxts_enc_done

.align	16
.Lxts_enc_done:
	and	\$15,$len_
	jz	.Lxts_enc_ret
	mov	$len_,$len

.Lxts_enc_steal:
	movzb	($inp),%eax			# borrow $rounds ...
	movzb	-16($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,-16($out)
	mov	%cl,0($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_enc_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	-16($out),$inout0
	xorps	@tweak[0],$inout0
___
	# Re-encrypt the block that now carries the stolen bytes.
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,-16($out)

.Lxts_enc_ret:
___
# Win64 only: restore %xmm6-%xmm15.
$code.=<<___ if ($win64);
	movaps	0x60(%rsp),%xmm6
	movaps	0x70(%rsp),%xmm7
	movaps	0x80(%rsp),%xmm8
	movaps	0x90(%rsp),%xmm9
	movaps	0xa0(%rsp),%xmm10
	movaps	0xb0(%rsp),%xmm11
	movaps	0xc0(%rsp),%xmm12
	movaps	0xd0(%rsp),%xmm13
	movaps	0xe0(%rsp),%xmm14
	movaps	0xf0(%rsp),%xmm15
___
$code.=<<___;
	lea	$frame_size(%rsp),%rsp
.Lxts_enc_epilogue:
	ret
.size	aesni_xts_encrypt,.-aesni_xts_encrypt
___
1690
# Code generator for aesni_xts_decrypt.
#
# Same structure as the encrypt routine, with two differences:
#   - when the length is not a multiple of 16, one whole block is held
#     back from the bulk processing (see "if (len%16) len-=16" below);
#   - every tail leaves the next two tweaks in the first two tweak
#     registers, because the held-back block is decrypted with the
#     second of them before the bytes are swapped at .Lxts_dec_steal,
#     and the block assembled by the swap is decrypted with the first.
$code.=<<___;
.globl	aesni_xts_decrypt
.type	aesni_xts_decrypt,\@function,6
.align	16
aesni_xts_decrypt:
	lea	-$frame_size(%rsp),%rsp
___
# Win64 only: save %xmm6-%xmm15 above the six tweak slots.
$code.=<<___ if ($win64);
	movaps	%xmm6,0x60(%rsp)
	movaps	%xmm7,0x70(%rsp)
	movaps	%xmm8,0x80(%rsp)
	movaps	%xmm9,0x90(%rsp)
	movaps	%xmm10,0xa0(%rsp)
	movaps	%xmm11,0xb0(%rsp)
	movaps	%xmm12,0xc0(%rsp)
	movaps	%xmm13,0xd0(%rsp)
	movaps	%xmm14,0xe0(%rsp)
	movaps	%xmm15,0xf0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	movups	($ivp),@tweak[5]		# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
$code.=<<___;
	xor	%eax,%eax			# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len

	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	movdqa	.Lxts_magic(%rip),$twmask
	pxor	$twtmp,$twtmp
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
___
    # Emit the tweak-update sequence four times, filling tweak
    # registers 0..3 and leaving the fifth value in the running tweak.
    for my $i (0..3) {
    $code.=<<___;
	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[$i]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	pand	$twmask,$twres			# isolate carry and residue
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	pxor	$twres,@tweak[5]
___
    }
# Six-block main loop followed by the five-block tail, up to the point
# where the one-block tail needs a single-block decryption.
$code.=<<___;
	sub	\$16*6,$len
	jc	.Lxts_dec_short

	shr	\$1,$rounds
	sub	\$1,$rounds
	mov	$rounds,$rnds_
	jmp	.Lxts_dec_grandloop

.align	16
.Lxts_dec_grandloop:
	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	movdqu	`16*0`($inp),$inout0		# load input
	pand	$twmask,$twres			# isolate carry and residue
	movdqu	`16*1`($inp),$inout1
	pxor	$twres,@tweak[5]

	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[0],$inout0		# input^=tweak
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[2],$inout2
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp
	pxor	@tweak[3],$inout3
	$movkey		($key_),$rndkey0
	pxor	@tweak[4],$inout4
	pxor	@tweak[5],$inout5

	# inline _aesni_decrypt6 and interleave first and last rounds
	# with own code...
	$movkey		16($key_),$rndkey1
	pxor		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
	aesdec		$rndkey1,$inout0
	lea		32($key_),$key
	pxor		$rndkey0,$inout2
	 movdqa	@tweak[1],`16*1`(%rsp)
	aesdec		$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	 movdqa	@tweak[2],`16*2`(%rsp)
	aesdec		$rndkey1,$inout2
	pxor		$rndkey0,$inout4
	 movdqa	@tweak[3],`16*3`(%rsp)
	aesdec		$rndkey1,$inout3
	pxor		$rndkey0,$inout5
	$movkey		($key),$rndkey0
	dec		$rounds
	 movdqa	@tweak[4],`16*4`(%rsp)
	aesdec		$rndkey1,$inout4
	 movdqa	@tweak[5],`16*5`(%rsp)
	aesdec		$rndkey1,$inout5
	pxor	$twtmp,$twtmp
	pcmpgtd	@tweak[5],$twtmp
	jmp		.Lxts_dec_loop6_enter

.align	16
.Lxts_dec_loop6:
	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	dec		$rounds
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	aesdec		$rndkey1,$inout4
	aesdec		$rndkey1,$inout5
.Lxts_dec_loop6_enter:
	$movkey		16($key),$rndkey1
	aesdec		$rndkey0,$inout0
	aesdec		$rndkey0,$inout1
	lea		32($key),$key
	aesdec		$rndkey0,$inout2
	aesdec		$rndkey0,$inout3
	aesdec		$rndkey0,$inout4
	aesdec		$rndkey0,$inout5
	$movkey		($key),$rndkey0
	jnz		.Lxts_dec_loop6

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesdec		$rndkey1,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesdec		$rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesdec		$rndkey1,$inout2
	pxor	$twres,@tweak[5]
	 aesdec		$rndkey1,$inout3
	 aesdec		$rndkey1,$inout4
	 aesdec		$rndkey1,$inout5
	 $movkey	16($key),$rndkey1

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesdec		$rndkey0,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesdec		$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesdec		$rndkey0,$inout2
	pxor	$twres,@tweak[5]
	 aesdec		$rndkey0,$inout3
	 aesdec		$rndkey0,$inout4
	 aesdec		$rndkey0,$inout5
	 $movkey	32($key),$rndkey0

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[1]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesdec		$rndkey1,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesdec		$rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesdec		$rndkey1,$inout2
	pxor	$twres,@tweak[5]
	 aesdec		$rndkey1,$inout3
	 aesdec		$rndkey1,$inout4
	 aesdec		$rndkey1,$inout5

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[2]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesdeclast	$rndkey0,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesdeclast	$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesdeclast	$rndkey0,$inout2
	pxor	$twres,@tweak[5]
	 aesdeclast	$rndkey0,$inout3
	 aesdeclast	$rndkey0,$inout4
	 aesdeclast	$rndkey0,$inout5

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[3]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
	pand	$twmask,$twres			# isolate carry and residue
	 xorps	`16*1`(%rsp),$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	pxor	$twres,@tweak[5]

	xorps	`16*2`(%rsp),$inout2
	movups	$inout0,`16*0`($out)		# write output
	xorps	`16*3`(%rsp),$inout3
	movups	$inout1,`16*1`($out)
	xorps	`16*4`(%rsp),$inout4
	movups	$inout2,`16*2`($out)
	xorps	`16*5`(%rsp),$inout5
	movups	$inout3,`16*3`($out)
	mov	$rnds_,$rounds			# restore $rounds
	movups	$inout4,`16*4`($out)
	movups	$inout5,`16*5`($out)
	lea	`16*6`($out),$out
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop

	lea	3($rounds,$rounds),$rounds	# restore original value
	mov	$key_,$key			# restore $key
	mov	$rounds,$rnds_			# backup $rounds

.Lxts_dec_short:
	add	\$16*6,$len
	jz	.Lxts_dec_done

	cmp	\$0x20,$len
	jb	.Lxts_dec_one
	je	.Lxts_dec_two

	cmp	\$0x40,$len
	jb	.Lxts_dec_three
	je	.Lxts_dec_four

	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	 movdqu	($inp),$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 movdqu	16*1($inp),$inout1
	pxor	$twres,@tweak[5]

	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	 pxor		$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	 pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out
	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	pand	$twmask,@tweak[1]		# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp
	xorps	@tweak[0],$inout0
___
	# One-block tail: single-block decryption with key1.
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)
	movups	$inout1,16*1($out)
	lea	16*2($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	 movups	($inp),$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 movups	16*1($inp),$inout1
	pxor	$twres,@tweak[5]

	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	xorps	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)
	xorps	@tweak[3],$inout3
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	movups	$inout3,16*3($out)
	lea	16*4($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	mov	$len_,$len
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	# Decrypt the held-back whole block with the second pending tweak.
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

.Lxts_dec_steal:
	movzb	16($inp),%eax			# borrow $rounds ...
	movzb	($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	# Decrypt the block assembled by the byte swap with the first
	# pending tweak.
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
___
# Win64 only: restore %xmm6-%xmm15.
$code.=<<___ if ($win64);
	movaps	0x60(%rsp),%xmm6
	movaps	0x70(%rsp),%xmm7
	movaps	0x80(%rsp),%xmm8
	movaps	0x90(%rsp),%xmm9
	movaps	0xa0(%rsp),%xmm10
	movaps	0xb0(%rsp),%xmm11
	movaps	0xc0(%rsp),%xmm12
	movaps	0xd0(%rsp),%xmm13
	movaps	0xe0(%rsp),%xmm14
	movaps	0xf0(%rsp),%xmm15
___
$code.=<<___;
	lea	$frame_size(%rsp),%rsp
.Lxts_dec_epilogue:
	ret
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
2116} }}
2117
2118########################################################################
2119# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2120#			    size_t length, const AES_KEY *key,
2121#			    unsigned char *ivp,const int enc);
#
# Emits the CBC entry point.  A zero length returns immediately.  The
# 6th argument (tested through %r9d) selects the direction: non-zero
# falls into the encrypt code, zero branches to .Lcbc_decrypt.
#
# Encrypt: one 16-byte block per iteration through aesni_generate1()
# (defined outside this chunk).  A trailing partial block is copied to
# the output buffer, zero-padded to 16 bytes and then run through the
# same loop in place ("one more spin").
#
# Decrypt: inputs longer than 0x70 bytes go through an 8-block
# (0x80-byte) main loop; whatever is left, at most seven blocks, is
# dispatched by .Lcbc_dec_tail to the one/two/.../six stubs or to
# _aesni_decrypt8.  The last input block is written back through $ivp.
# When the remaining length is not a multiple of 16 the last decrypted
# block is staged in the scratch slot and copied out with rep movsb.
# NOTE(review): the copy count there is 16-(len&15) -- confirm that this
# is the intended byte count for a trailing partial block.
2122{
# %rsp-relative offset of a 16-byte scratch slot used by the decrypt
# path (saved IV around the 8-block code, staging for the partial tail).
# Win64: 0x40, i.e. right above the four %xmm save slots inside the
# 0x58-byte frame allocated at .Lcbc_decrypt.  Elsewhere: -0x18, below
# the stack pointer with no frame allocated -- presumably relying on the
# SysV red zone, TODO confirm.
2123my $reserved = $win64?0x40:-0x18;	# used in decrypt
2124$code.=<<___;
2125.globl	${PREFIX}_cbc_encrypt
2126.type	${PREFIX}_cbc_encrypt,\@function,6
2127.align	16
2128${PREFIX}_cbc_encrypt:
2129	test	$len,$len		# check length
2130	jz	.Lcbc_ret
2131
2132	mov	240($key),$rnds_	# key->rounds
2133	mov	$key,$key_		# backup $key
2134	test	%r9d,%r9d		# 6th argument
2135	jz	.Lcbc_decrypt
2136#--------------------------- CBC ENCRYPT ------------------------------#
2137	movups	($ivp),$inout0		# load iv as initial state
2138	mov	$rnds_,$rounds
2139	cmp	\$16,$len
2140	jb	.Lcbc_enc_tail
2141	sub	\$16,$len
2142	jmp	.Lcbc_enc_loop
2143.align	16
2144.Lcbc_enc_loop:
2145	movups	($inp),$inout1		# load input
2146	lea	16($inp),$inp
2147	#xorps	$inout1,$inout0
2148___
	# One-block encryption of $inout0, with the freshly loaded input in
	# $inout1 passed as an extra operand.  The commented-out xorps above
	# suggests the helper folds the input XOR into its own code -- the
	# helper is outside this chunk, TODO confirm.
2149	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
2150$code.=<<___;
2151	mov	$rnds_,$rounds		# restore $rounds
2152	mov	$key_,$key		# restore $key
2153	movups	$inout0,0($out)		# store output
2154	lea	16($out),$out
2155	sub	\$16,$len
2156	jnc	.Lcbc_enc_loop
2157	add	\$16,$len
2158	jnz	.Lcbc_enc_tail
2159	movups	$inout0,($ivp)
2160	jmp	.Lcbc_ret
2161
2162.Lcbc_enc_tail:
2163	mov	$len,%rcx	# zaps $key
2164	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
2165	.long	0x9066A4F3	# rep movsb
2166	mov	\$16,%ecx	# zero tail
2167	sub	$len,%rcx
2168	xor	%eax,%eax
2169	.long	0x9066AAF3	# rep stosb
2170	lea	-16(%rdi),%rdi	# rewind $out by 1 block
2171	mov	$rnds_,$rounds	# restore $rounds
2172	mov	%rdi,%rsi	# $inp and $out are the same
2173	mov	$key_,$key	# restore $key
2174	xor	$len,$len	# len=16
2175	jmp	.Lcbc_enc_loop	# one more spin
2176#--------------------------- CBC DECRYPT ------------------------------#
2177.align	16
2178.Lcbc_decrypt:
2179___
# Win64 only: allocate 0x58 bytes and save %xmm6-%xmm9 at offsets
# 0x00-0x30.  .Lcbc_decrypt_body marks the point where the saves are
# complete; cbc_se_handler compares the faulting Rip against it.
2180$code.=<<___ if ($win64);
2181	lea	-0x58(%rsp),%rsp
2182	movaps	%xmm6,(%rsp)
2183	movaps	%xmm7,0x10(%rsp)
2184	movaps	%xmm8,0x20(%rsp)
2185	movaps	%xmm9,0x30(%rsp)
2186.Lcbc_decrypt_body:
2187___
# Decrypt proper: 8-block main loop, then the 1..7 block tail dispatch.
2188$code.=<<___;
2189	movups	($ivp),$iv
2190	mov	$rnds_,$rounds
2191	cmp	\$0x70,$len
2192	jbe	.Lcbc_dec_tail
2193	shr	\$1,$rnds_
2194	sub	\$0x70,$len
2195	mov	$rnds_,$rounds
2196	movaps	$iv,$reserved(%rsp)
2197	jmp	.Lcbc_dec_loop8_enter
2198.align	16
2199.Lcbc_dec_loop8:
2200	movaps	$rndkey0,$reserved(%rsp)	# save IV
2201	movups	$inout7,($out)
2202	lea	0x10($out),$out
2203.Lcbc_dec_loop8_enter:
2204	$movkey		($key),$rndkey0
2205	movups	($inp),$inout0			# load input
2206	movups	0x10($inp),$inout1
2207	$movkey		16($key),$rndkey1
2208
2209	lea		32($key),$key
2210	movdqu	0x20($inp),$inout2
2211	xorps		$rndkey0,$inout0
2212	movdqu	0x30($inp),$inout3
2213	xorps		$rndkey0,$inout1
2214	movdqu	0x40($inp),$inout4
2215	aesdec		$rndkey1,$inout0
2216	pxor		$rndkey0,$inout2
2217	movdqu	0x50($inp),$inout5
2218	aesdec		$rndkey1,$inout1
2219	pxor		$rndkey0,$inout3
2220	movdqu	0x60($inp),$inout6
2221	aesdec		$rndkey1,$inout2
2222	pxor		$rndkey0,$inout4
2223	movdqu	0x70($inp),$inout7
2224	aesdec		$rndkey1,$inout3
2225	pxor		$rndkey0,$inout5
2226	dec		$rounds
2227	aesdec		$rndkey1,$inout4
2228	pxor		$rndkey0,$inout6
2229	aesdec		$rndkey1,$inout5
2230	pxor		$rndkey0,$inout7
2231	$movkey		($key),$rndkey0
2232	aesdec		$rndkey1,$inout6
2233	aesdec		$rndkey1,$inout7
2234	$movkey		16($key),$rndkey1
2235
2236	call		.Ldec_loop8_enter
2237
2238	movups	($inp),$rndkey1		# re-load input
2239	movups	0x10($inp),$rndkey0
2240	xorps	$reserved(%rsp),$inout0	# ^= IV
2241	xorps	$rndkey1,$inout1
2242	movups	0x20($inp),$rndkey1
2243	xorps	$rndkey0,$inout2
2244	movups	0x30($inp),$rndkey0
2245	xorps	$rndkey1,$inout3
2246	movups	0x40($inp),$rndkey1
2247	xorps	$rndkey0,$inout4
2248	movups	0x50($inp),$rndkey0
2249	xorps	$rndkey1,$inout5
2250	movups	0x60($inp),$rndkey1
2251	xorps	$rndkey0,$inout6
2252	movups	0x70($inp),$rndkey0	# IV
2253	xorps	$rndkey1,$inout7
2254	movups	$inout0,($out)
2255	movups	$inout1,0x10($out)
2256	movups	$inout2,0x20($out)
2257	movups	$inout3,0x30($out)
2258	mov	$rnds_,$rounds		# restore $rounds
2259	movups	$inout4,0x40($out)
2260	mov	$key_,$key		# restore $key
2261	movups	$inout5,0x50($out)
2262	lea	0x80($inp),$inp
2263	movups	$inout6,0x60($out)
2264	lea	0x70($out),$out
2265	sub	\$0x80,$len
2266	ja	.Lcbc_dec_loop8
2267
2268	movaps	$inout7,$inout0
2269	movaps	$rndkey0,$iv
2270	add	\$0x70,$len
2271	jle	.Lcbc_dec_tail_collected
2272	movups	$inout0,($out)
2273	lea	1($rnds_,$rnds_),$rounds
2274	lea	0x10($out),$out
2275.Lcbc_dec_tail:
2276	movups	($inp),$inout0
2277	movaps	$inout0,$in0
2278	cmp	\$0x10,$len
2279	jbe	.Lcbc_dec_one
2280
2281	movups	0x10($inp),$inout1
2282	movaps	$inout1,$in1
2283	cmp	\$0x20,$len
2284	jbe	.Lcbc_dec_two
2285
2286	movups	0x20($inp),$inout2
2287	movaps	$inout2,$in2
2288	cmp	\$0x30,$len
2289	jbe	.Lcbc_dec_three
2290
2291	movups	0x30($inp),$inout3
2292	cmp	\$0x40,$len
2293	jbe	.Lcbc_dec_four
2294
2295	movups	0x40($inp),$inout4
2296	cmp	\$0x50,$len
2297	jbe	.Lcbc_dec_five
2298
2299	movups	0x50($inp),$inout5
2300	cmp	\$0x60,$len
2301	jbe	.Lcbc_dec_six
2302
2303	movups	0x60($inp),$inout6
2304	movaps	$iv,$reserved(%rsp)	# save IV
2305	call	_aesni_decrypt8
2306	movups	($inp),$rndkey1
2307	movups	0x10($inp),$rndkey0
2308	xorps	$reserved(%rsp),$inout0	# ^= IV
2309	xorps	$rndkey1,$inout1
2310	movups	0x20($inp),$rndkey1
2311	xorps	$rndkey0,$inout2
2312	movups	0x30($inp),$rndkey0
2313	xorps	$rndkey1,$inout3
2314	movups	0x40($inp),$rndkey1
2315	xorps	$rndkey0,$inout4
2316	movups	0x50($inp),$rndkey0
2317	xorps	$rndkey1,$inout5
2318	movups	0x60($inp),$iv		# IV
2319	xorps	$rndkey0,$inout6
2320	movups	$inout0,($out)
2321	movups	$inout1,0x10($out)
2322	movups	$inout2,0x20($out)
2323	movups	$inout3,0x30($out)
2324	movups	$inout4,0x40($out)
2325	movups	$inout5,0x50($out)
2326	lea	0x60($out),$out
2327	movaps	$inout6,$inout0
2328	sub	\$0x70,$len
2329	jmp	.Lcbc_dec_tail_collected
2330.align	16
2331.Lcbc_dec_one:
2332___
	# Single-block decrypt of $inout0 through the shared one-block
	# generator (defined outside this chunk).
2333	&aesni_generate1("dec",$key,$rounds);
2334$code.=<<___;
2335	xorps	$iv,$inout0
2336	movaps	$in0,$iv
2337	sub	\$0x10,$len
2338	jmp	.Lcbc_dec_tail_collected
2339.align	16
2340.Lcbc_dec_two:
2341	xorps	$inout2,$inout2
2342	call	_aesni_decrypt3
2343	xorps	$iv,$inout0
2344	xorps	$in0,$inout1
2345	movups	$inout0,($out)
2346	movaps	$in1,$iv
2347	movaps	$inout1,$inout0
2348	lea	0x10($out),$out
2349	sub	\$0x20,$len
2350	jmp	.Lcbc_dec_tail_collected
2351.align	16
2352.Lcbc_dec_three:
2353	call	_aesni_decrypt3
2354	xorps	$iv,$inout0
2355	xorps	$in0,$inout1
2356	movups	$inout0,($out)
2357	xorps	$in1,$inout2
2358	movups	$inout1,0x10($out)
2359	movaps	$in2,$iv
2360	movaps	$inout2,$inout0
2361	lea	0x20($out),$out
2362	sub	\$0x30,$len
2363	jmp	.Lcbc_dec_tail_collected
2364.align	16
2365.Lcbc_dec_four:
2366	call	_aesni_decrypt4
2367	xorps	$iv,$inout0
2368	movups	0x30($inp),$iv
2369	xorps	$in0,$inout1
2370	movups	$inout0,($out)
2371	xorps	$in1,$inout2
2372	movups	$inout1,0x10($out)
2373	xorps	$in2,$inout3
2374	movups	$inout2,0x20($out)
2375	movaps	$inout3,$inout0
2376	lea	0x30($out),$out
2377	sub	\$0x40,$len
2378	jmp	.Lcbc_dec_tail_collected
2379.align	16
2380.Lcbc_dec_five:
2381	xorps	$inout5,$inout5
2382	call	_aesni_decrypt6
2383	movups	0x10($inp),$rndkey1
2384	movups	0x20($inp),$rndkey0
2385	xorps	$iv,$inout0
2386	xorps	$in0,$inout1
2387	xorps	$rndkey1,$inout2
2388	movups	0x30($inp),$rndkey1
2389	xorps	$rndkey0,$inout3
2390	movups	0x40($inp),$iv
2391	xorps	$rndkey1,$inout4
2392	movups	$inout0,($out)
2393	movups	$inout1,0x10($out)
2394	movups	$inout2,0x20($out)
2395	movups	$inout3,0x30($out)
2396	lea	0x40($out),$out
2397	movaps	$inout4,$inout0
2398	sub	\$0x50,$len
2399	jmp	.Lcbc_dec_tail_collected
2400.align	16
2401.Lcbc_dec_six:
2402	call	_aesni_decrypt6
2403	movups	0x10($inp),$rndkey1
2404	movups	0x20($inp),$rndkey0
2405	xorps	$iv,$inout0
2406	xorps	$in0,$inout1
2407	xorps	$rndkey1,$inout2
2408	movups	0x30($inp),$rndkey1
2409	xorps	$rndkey0,$inout3
2410	movups	0x40($inp),$rndkey0
2411	xorps	$rndkey1,$inout4
2412	movups	0x50($inp),$iv
2413	xorps	$rndkey0,$inout5
2414	movups	$inout0,($out)
2415	movups	$inout1,0x10($out)
2416	movups	$inout2,0x20($out)
2417	movups	$inout3,0x30($out)
2418	movups	$inout4,0x40($out)
2419	lea	0x50($out),$out
2420	movaps	$inout5,$inout0
2421	sub	\$0x60,$len
2422	jmp	.Lcbc_dec_tail_collected
2423.align	16
2424.Lcbc_dec_tail_collected:
2425	and	\$15,$len
2426	movups	$iv,($ivp)
2427	jnz	.Lcbc_dec_tail_partial
2428	movups	$inout0,($out)
2429	jmp	.Lcbc_dec_ret
2430.align	16
2431.Lcbc_dec_tail_partial:
2432	movaps	$inout0,$reserved(%rsp)
2433	mov	\$16,%rcx
2434	mov	$out,%rdi
2435	sub	$len,%rcx
2436	lea	$reserved(%rsp),%rsi
2437	.long	0x9066A4F3	# rep movsb
2438
2439.Lcbc_dec_ret:
2440___
# Win64 only: restore %xmm6-%xmm9 and release the 0x58-byte frame.
2441$code.=<<___ if ($win64);
2442	movaps	(%rsp),%xmm6
2443	movaps	0x10(%rsp),%xmm7
2444	movaps	0x20(%rsp),%xmm8
2445	movaps	0x30(%rsp),%xmm9
2446	lea	0x58(%rsp),%rsp
2447___
# Common exit, shared by the zero-length, encrypt and decrypt paths.
2448$code.=<<___;
2449.Lcbc_ret:
2450	ret
2451.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
2452___
2453}
2454# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
2455#				int bits, AES_KEY *key)
#
# Return value (in %rax/%eax), as set by the code below:
#	 0	success
#	-1	userKey or key is NULL
#	-2	bits is not 128, 192 or 256
#
# On success the word at offset 240 of *key is set to 9, 11 or 13 for
# 128-, 192- and 256-bit keys respectively.
#
# Argument registers come from @_4args (defined outside this chunk);
# $bits is rewritten from its %r.. name to the %e.. form.
2456{ my ($inp,$bits,$key) = @_4args;
2457  $bits =~ s/%r/%e/;
2458
# set_decrypt_key: build the encryption schedule first, then, if that
# succeeded, reverse the order of the round keys in place.  The first
# and last keys are only swapped; every key in between is additionally
# passed through aesimc.  Both entry points open with a hand-encoded
# "sub rsp,8", which is what .LSEH_info_key describes on Win64.
2459$code.=<<___;
2460.globl	${PREFIX}_set_decrypt_key
2461.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
2462.align	16
2463${PREFIX}_set_decrypt_key:
2464	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
2465	call	__aesni_set_encrypt_key
2466	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
2467	test	%eax,%eax
2468	jnz	.Ldec_key_ret
2469	lea	16($key,$bits),$inp	# points at the end of key schedule
2470
2471	$movkey	($key),%xmm0		# just swap
2472	$movkey	($inp),%xmm1
2473	$movkey	%xmm0,($inp)
2474	$movkey	%xmm1,($key)
2475	lea	16($key),$key
2476	lea	-16($inp),$inp
2477
2478.Ldec_key_inverse:
2479	$movkey	($key),%xmm0		# swap and inverse
2480	$movkey	($inp),%xmm1
2481	aesimc	%xmm0,%xmm0
2482	aesimc	%xmm1,%xmm1
2483	lea	16($key),$key
2484	lea	-16($inp),$inp
2485	$movkey	%xmm0,16($inp)
2486	$movkey	%xmm1,-16($key)
2487	cmp	$key,$inp
2488	ja	.Ldec_key_inverse
2489
2490	$movkey	($key),%xmm0		# inverse middle
2491	aesimc	%xmm0,%xmm0
2492	$movkey	%xmm0,($inp)
2493.Ldec_key_ret:
2494	add	\$8,%rsp
2495	ret
2496.LSEH_end_set_decrypt_key:
2497.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
2498___
2499
2500# This is based on submission by
2501#
2502#	Huang Ying <ying.huang@intel.com>
2503#	Vinodh Gopal <vinodh.gopal@intel.com>
2504#	Kahraman Akdemir
2505#
2506# Aggressively optimized in respect to aeskeygenassist's critical path
2507# and is contained in %xmm0-5 to meet Win64 ABI requirement.
2508#
# Layout of the generated code: the entry point validates its arguments
# and dispatches on $bits to .L10rounds/.L12rounds/.L14rounds.  Each of
# those is a straight-line sequence of aeskeygenassist + call pairs; the
# called .Lkey_expansion_* stubs store the previously computed round key
# through %rax, advance %rax and derive the next key.  The *_cold entry
# of a stub skips the store and is used for the first call only.
2509$code.=<<___;
2510.globl	${PREFIX}_set_encrypt_key
2511.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
2512.align	16
2513${PREFIX}_set_encrypt_key:
2514__aesni_set_encrypt_key:
2515	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
2516	mov	\$-1,%rax
2517	test	$inp,$inp
2518	jz	.Lenc_key_ret
2519	test	$key,$key
2520	jz	.Lenc_key_ret
2521
2522	movups	($inp),%xmm0		# pull first 128 bits of *userKey
2523	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
2524	lea	16($key),%rax
2525	cmp	\$256,$bits
2526	je	.L14rounds
2527	cmp	\$192,$bits
2528	je	.L12rounds
2529	cmp	\$128,$bits
2530	jne	.Lbad_keybits
2531
2532.L10rounds:
2533	mov	\$9,$bits			# 10 rounds for 128-bit key
2534	$movkey	%xmm0,($key)			# round 0
2535	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
2536	call		.Lkey_expansion_128_cold
2537	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
2538	call		.Lkey_expansion_128
2539	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
2540	call		.Lkey_expansion_128
2541	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
2542	call		.Lkey_expansion_128
2543	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
2544	call		.Lkey_expansion_128
2545	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
2546	call		.Lkey_expansion_128
2547	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
2548	call		.Lkey_expansion_128
2549	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
2550	call		.Lkey_expansion_128
2551	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
2552	call		.Lkey_expansion_128
2553	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
2554	call		.Lkey_expansion_128
2555	$movkey	%xmm0,(%rax)
2556	mov	$bits,80(%rax)	# 240(%rdx)
2557	xor	%eax,%eax
2558	jmp	.Lenc_key_ret
2559
2560.align	16
2561.L12rounds:
2562	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
2563	mov	\$11,$bits			# 12 rounds for 192
2564	$movkey	%xmm0,($key)			# round 0
2565	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
2566	call		.Lkey_expansion_192a_cold
2567	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
2568	call		.Lkey_expansion_192b
2569	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
2570	call		.Lkey_expansion_192a
2571	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
2572	call		.Lkey_expansion_192b
2573	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
2574	call		.Lkey_expansion_192a
2575	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
2576	call		.Lkey_expansion_192b
2577	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
2578	call		.Lkey_expansion_192a
2579	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
2580	call		.Lkey_expansion_192b
2581	$movkey	%xmm0,(%rax)
2582	mov	$bits,48(%rax)	# 240(%rdx)
2583	xor	%rax, %rax
2584	jmp	.Lenc_key_ret
2585
2586.align	16
2587.L14rounds:
2588	movups	16($inp),%xmm2			# remaning half of *userKey
2589	mov	\$13,$bits			# 14 rounds for 256
2590	lea	16(%rax),%rax
2591	$movkey	%xmm0,($key)			# round 0
2592	$movkey	%xmm2,16($key)			# round 1
2593	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
2594	call		.Lkey_expansion_256a_cold
2595	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
2596	call		.Lkey_expansion_256b
2597	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
2598	call		.Lkey_expansion_256a
2599	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
2600	call		.Lkey_expansion_256b
2601	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
2602	call		.Lkey_expansion_256a
2603	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
2604	call		.Lkey_expansion_256b
2605	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
2606	call		.Lkey_expansion_256a
2607	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
2608	call		.Lkey_expansion_256b
2609	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
2610	call		.Lkey_expansion_256a
2611	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
2612	call		.Lkey_expansion_256b
2613	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
2614	call		.Lkey_expansion_256a
2615	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
2616	call		.Lkey_expansion_256b
2617	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
2618	call		.Lkey_expansion_256a
2619	$movkey	%xmm0,(%rax)
2620	mov	$bits,16(%rax)	# 240(%rdx)
2621	xor	%rax,%rax
2622	jmp	.Lenc_key_ret
2623
2624.align	16
2625.Lbad_keybits:
2626	mov	\$-2,%rax
2627.Lenc_key_ret:
2628	add	\$8,%rsp
2629	ret
2630.LSEH_end_set_encrypt_key:
2631
2632.align	16
2633.Lkey_expansion_128:
2634	$movkey	%xmm0,(%rax)
2635	lea	16(%rax),%rax
2636.Lkey_expansion_128_cold:
2637	shufps	\$0b00010000,%xmm0,%xmm4
2638	xorps	%xmm4, %xmm0
2639	shufps	\$0b10001100,%xmm0,%xmm4
2640	xorps	%xmm4, %xmm0
2641	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
2642	xorps	%xmm1,%xmm0
2643	ret
2644
2645.align 16
2646.Lkey_expansion_192a:
2647	$movkey	%xmm0,(%rax)
2648	lea	16(%rax),%rax
2649.Lkey_expansion_192a_cold:
2650	movaps	%xmm2, %xmm5
2651.Lkey_expansion_192b_warm:
2652	shufps	\$0b00010000,%xmm0,%xmm4
2653	movdqa	%xmm2,%xmm3
2654	xorps	%xmm4,%xmm0
2655	shufps	\$0b10001100,%xmm0,%xmm4
2656	pslldq	\$4,%xmm3
2657	xorps	%xmm4,%xmm0
2658	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
2659	pxor	%xmm3,%xmm2
2660	pxor	%xmm1,%xmm0
2661	pshufd	\$0b11111111,%xmm0,%xmm3
2662	pxor	%xmm3,%xmm2
2663	ret
2664
2665.align 16
2666.Lkey_expansion_192b:
2667	movaps	%xmm0,%xmm3
2668	shufps	\$0b01000100,%xmm0,%xmm5
2669	$movkey	%xmm5,(%rax)
2670	shufps	\$0b01001110,%xmm2,%xmm3
2671	$movkey	%xmm3,16(%rax)
2672	lea	32(%rax),%rax
2673	jmp	.Lkey_expansion_192b_warm
2674
2675.align	16
2676.Lkey_expansion_256a:
2677	$movkey	%xmm2,(%rax)
2678	lea	16(%rax),%rax
2679.Lkey_expansion_256a_cold:
2680	shufps	\$0b00010000,%xmm0,%xmm4
2681	xorps	%xmm4,%xmm0
2682	shufps	\$0b10001100,%xmm0,%xmm4
2683	xorps	%xmm4,%xmm0
2684	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
2685	xorps	%xmm1,%xmm0
2686	ret
2687
2688.align 16
2689.Lkey_expansion_256b:
2690	$movkey	%xmm0,(%rax)
2691	lea	16(%rax),%rax
2692
2693	shufps	\$0b00010000,%xmm2,%xmm4
2694	xorps	%xmm4,%xmm2
2695	shufps	\$0b10001100,%xmm2,%xmm4
2696	xorps	%xmm4,%xmm2
2697	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
2698	xorps	%xmm1,%xmm2
2699	ret
2700.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
2701.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
2702___
2703}
2704
# Constant pool, aligned to 64 bytes, followed by the module's
# identification string and another 64-byte alignment.
# .Lbswap_mask lists byte indices 15..0, i.e. a byte-reversal table.
# The other labels are referenced from code outside this chunk; going
# by their names they are counter increments and the XTS tweak
# constant -- TODO confirm against the code that loads them.
2705$code.=<<___;
2706.align	64
2707.Lbswap_mask:
2708	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
2709.Lincrement32:
2710	.long	6,6,6,0
2711.Lincrement64:
2712	.long	1,0,0,0
2713.Lxts_magic:
2714	.long	0x87,0,1,0
2715
2716.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
2717.align	64
2718___
2719
2720# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2721#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception support, generated only when $win64 is set.
# Each handler works out what context->Rsp was on entry to the
# interrupted function and, where that function had already saved %xmm
# registers on its stack, copies them back into the CONTEXT starting at
# Xmm6.  All handlers finish in .Lcommon_seh_tail, which patches
# Rsp/Rsi/Rdi in the CONTEXT, copies it to disp->ContextRecord, calls
# RtlVirtualUnwind and returns ExceptionContinueSearch.  The .pdata and
# .xdata tables that follow bind each function to its handler.
2722if ($win64) {
# Handler arguments, in the order of the prototype above.
2723$rec="%rcx";
2724$frame="%rdx";
2725$context="%r8";
2726$disp="%r9";
2727
2728$code.=<<___;
2729.extern	__imp_RtlVirtualUnwind
2730___
# Handlers for the ECB, CCM64, CTR32 and XTS routines.  They are emitted
# only for the "aesni" prefix, like the .pdata entries that use them.
# ccm64_se_handler and xts_se_handler take their prologue/epilogue
# bounds from HandlerData[]; ctr32_se_handler uses fixed labels.
2731$code.=<<___ if ($PREFIX eq "aesni");
2732.type	ecb_se_handler,\@abi-omnipotent
2733.align	16
2734ecb_se_handler:
2735	push	%rsi
2736	push	%rdi
2737	push	%rbx
2738	push	%rbp
2739	push	%r12
2740	push	%r13
2741	push	%r14
2742	push	%r15
2743	pushfq
2744	sub	\$64,%rsp
2745
2746	mov	152($context),%rax	# pull context->Rsp
2747
2748	jmp	.Lcommon_seh_tail
2749.size	ecb_se_handler,.-ecb_se_handler
2750
2751.type	ccm64_se_handler,\@abi-omnipotent
2752.align	16
2753ccm64_se_handler:
2754	push	%rsi
2755	push	%rdi
2756	push	%rbx
2757	push	%rbp
2758	push	%r12
2759	push	%r13
2760	push	%r14
2761	push	%r15
2762	pushfq
2763	sub	\$64,%rsp
2764
2765	mov	120($context),%rax	# pull context->Rax
2766	mov	248($context),%rbx	# pull context->Rip
2767
2768	mov	8($disp),%rsi		# disp->ImageBase
2769	mov	56($disp),%r11		# disp->HandlerData
2770
2771	mov	0(%r11),%r10d		# HandlerData[0]
2772	lea	(%rsi,%r10),%r10	# prologue label
2773	cmp	%r10,%rbx		# context->Rip<prologue label
2774	jb	.Lcommon_seh_tail
2775
2776	mov	152($context),%rax	# pull context->Rsp
2777
2778	mov	4(%r11),%r10d		# HandlerData[1]
2779	lea	(%rsi,%r10),%r10	# epilogue label
2780	cmp	%r10,%rbx		# context->Rip>=epilogue label
2781	jae	.Lcommon_seh_tail
2782
2783	lea	0(%rax),%rsi		# %xmm save area
2784	lea	512($context),%rdi	# &context.Xmm6
2785	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
2786	.long	0xa548f3fc		# cld; rep movsq
2787	lea	0x58(%rax),%rax		# adjust stack pointer
2788
2789	jmp	.Lcommon_seh_tail
2790.size	ccm64_se_handler,.-ccm64_se_handler
2791
2792.type	ctr32_se_handler,\@abi-omnipotent
2793.align	16
2794ctr32_se_handler:
2795	push	%rsi
2796	push	%rdi
2797	push	%rbx
2798	push	%rbp
2799	push	%r12
2800	push	%r13
2801	push	%r14
2802	push	%r15
2803	pushfq
2804	sub	\$64,%rsp
2805
2806	mov	120($context),%rax	# pull context->Rax
2807	mov	248($context),%rbx	# pull context->Rip
2808
2809	lea	.Lctr32_body(%rip),%r10
2810	cmp	%r10,%rbx		# context->Rip<"prologue" label
2811	jb	.Lcommon_seh_tail
2812
2813	mov	152($context),%rax	# pull context->Rsp
2814
2815	lea	.Lctr32_ret(%rip),%r10
2816	cmp	%r10,%rbx
2817	jae	.Lcommon_seh_tail
2818
2819	lea	0x20(%rax),%rsi		# %xmm save area
2820	lea	512($context),%rdi	# &context.Xmm6
2821	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2822	.long	0xa548f3fc		# cld; rep movsq
2823	lea	0xc8(%rax),%rax		# adjust stack pointer
2824
2825	jmp	.Lcommon_seh_tail
2826.size	ctr32_se_handler,.-ctr32_se_handler
2827
2828.type	xts_se_handler,\@abi-omnipotent
2829.align	16
2830xts_se_handler:
2831	push	%rsi
2832	push	%rdi
2833	push	%rbx
2834	push	%rbp
2835	push	%r12
2836	push	%r13
2837	push	%r14
2838	push	%r15
2839	pushfq
2840	sub	\$64,%rsp
2841
2842	mov	120($context),%rax	# pull context->Rax
2843	mov	248($context),%rbx	# pull context->Rip
2844
2845	mov	8($disp),%rsi		# disp->ImageBase
2846	mov	56($disp),%r11		# disp->HandlerData
2847
2848	mov	0(%r11),%r10d		# HandlerData[0]
2849	lea	(%rsi,%r10),%r10	# prologue lable
2850	cmp	%r10,%rbx		# context->Rip<prologue label
2851	jb	.Lcommon_seh_tail
2852
2853	mov	152($context),%rax	# pull context->Rsp
2854
2855	mov	4(%r11),%r10d		# HandlerData[1]
2856	lea	(%rsi,%r10),%r10	# epilogue label
2857	cmp	%r10,%rbx		# context->Rip>=epilogue label
2858	jae	.Lcommon_seh_tail
2859
2860	lea	0x60(%rax),%rsi		# %xmm save area
2861	lea	512($context),%rdi	# & context.Xmm6
2862	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2863	.long	0xa548f3fc		# cld; rep movsq
2864	lea	0x68+160(%rax),%rax	# adjust stack pointer
2865
2866	jmp	.Lcommon_seh_tail
2867.size	xts_se_handler,.-xts_se_handler
2868___
# cbc_se_handler and the shared .Lcommon_seh_tail are emitted for every
# prefix.  The CBC handler distinguishes three regions: before
# .Lcbc_decrypt (nothing to undo), between .Lcbc_decrypt and
# .Lcbc_decrypt_body (takes the .Lrestore_cbc_rax path) and from there
# up to .Lcbc_ret (four %xmm registers copied back from the top of the
# stack, stack pointer adjusted by 0x58).
2869$code.=<<___;
2870.type	cbc_se_handler,\@abi-omnipotent
2871.align	16
2872cbc_se_handler:
2873	push	%rsi
2874	push	%rdi
2875	push	%rbx
2876	push	%rbp
2877	push	%r12
2878	push	%r13
2879	push	%r14
2880	push	%r15
2881	pushfq
2882	sub	\$64,%rsp
2883
2884	mov	152($context),%rax	# pull context->Rsp
2885	mov	248($context),%rbx	# pull context->Rip
2886
2887	lea	.Lcbc_decrypt(%rip),%r10
2888	cmp	%r10,%rbx		# context->Rip<"prologue" label
2889	jb	.Lcommon_seh_tail
2890
2891	lea	.Lcbc_decrypt_body(%rip),%r10
2892	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
2893	jb	.Lrestore_cbc_rax
2894
2895	lea	.Lcbc_ret(%rip),%r10
2896	cmp	%r10,%rbx		# context->Rip>="epilogue" label
2897	jae	.Lcommon_seh_tail
2898
2899	lea	0(%rax),%rsi		# top of stack
2900	lea	512($context),%rdi	# &context.Xmm6
2901	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
2902	.long	0xa548f3fc		# cld; rep movsq
2903	lea	0x58(%rax),%rax		# adjust stack pointer
2904	jmp	.Lcommon_seh_tail
2905
2906.Lrestore_cbc_rax:
2907	mov	120($context),%rax
2908
2909.Lcommon_seh_tail:
2910	mov	8(%rax),%rdi
2911	mov	16(%rax),%rsi
2912	mov	%rax,152($context)	# restore context->Rsp
2913	mov	%rsi,168($context)	# restore context->Rsi
2914	mov	%rdi,176($context)	# restore context->Rdi
2915
2916	mov	40($disp),%rdi		# disp->ContextRecord
2917	mov	$context,%rsi		# context
2918	mov	\$154,%ecx		# sizeof(CONTEXT)
2919	.long	0xa548f3fc		# cld; rep movsq
2920
2921	mov	$disp,%rsi
2922	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2923	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2924	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2925	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2926	mov	40(%rsi),%r10		# disp->ContextRecord
2927	lea	56(%rsi),%r11		# &disp->HandlerData
2928	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2929	mov	%r10,32(%rsp)		# arg5
2930	mov	%r11,40(%rsp)		# arg6
2931	mov	%r12,48(%rsp)		# arg7
2932	mov	%rcx,56(%rsp)		# arg8, (NULL)
2933	call	*__imp_RtlVirtualUnwind(%rip)
2934
2935	mov	\$1,%eax		# ExceptionContinueSearch
2936	add	\$64,%rsp
2937	popfq
2938	pop	%r15
2939	pop	%r14
2940	pop	%r13
2941	pop	%r12
2942	pop	%rbp
2943	pop	%rbx
2944	pop	%rdi
2945	pop	%rsi
2946	ret
2947.size	cbc_se_handler,.-cbc_se_handler
2948
2949.section	.pdata
2950.align	4
2951___
# .pdata: one (begin, end, unwind-info) triple of image-relative
# addresses per function.  Entries for the "aesni"-only routines first.
2952$code.=<<___ if ($PREFIX eq "aesni");
2953	.rva	.LSEH_begin_aesni_ecb_encrypt
2954	.rva	.LSEH_end_aesni_ecb_encrypt
2955	.rva	.LSEH_info_ecb
2956
2957	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
2958	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
2959	.rva	.LSEH_info_ccm64_enc
2960
2961	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
2962	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
2963	.rva	.LSEH_info_ccm64_dec
2964
2965	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
2966	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
2967	.rva	.LSEH_info_ctr32
2968
2969	.rva	.LSEH_begin_aesni_xts_encrypt
2970	.rva	.LSEH_end_aesni_xts_encrypt
2971	.rva	.LSEH_info_xts_enc
2972
2973	.rva	.LSEH_begin_aesni_xts_decrypt
2974	.rva	.LSEH_end_aesni_xts_decrypt
2975	.rva	.LSEH_info_xts_dec
2976___
# .pdata entries emitted for every prefix: CBC and the two key-setup
# routines (which share .LSEH_info_key).  The .xdata section is opened
# at the end of this heredoc.
2977$code.=<<___;
2978	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
2979	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
2980	.rva	.LSEH_info_cbc
2981
2982	.rva	${PREFIX}_set_decrypt_key
2983	.rva	.LSEH_end_set_decrypt_key
2984	.rva	.LSEH_info_key
2985
2986	.rva	${PREFIX}_set_encrypt_key
2987	.rva	.LSEH_end_set_encrypt_key
2988	.rva	.LSEH_info_key
2989.section	.xdata
2990.align	8
2991___
# .xdata records for the "aesni"-only routines: a 4-byte header, the
# handler's address and, for ccm64/xts, the HandlerData[] label pair
# read by the handler.
2992$code.=<<___ if ($PREFIX eq "aesni");
2993.LSEH_info_ecb:
2994	.byte	9,0,0,0
2995	.rva	ecb_se_handler
2996.LSEH_info_ccm64_enc:
2997	.byte	9,0,0,0
2998	.rva	ccm64_se_handler
2999	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
3000.LSEH_info_ccm64_dec:
3001	.byte	9,0,0,0
3002	.rva	ccm64_se_handler
3003	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
3004.LSEH_info_ctr32:
3005	.byte	9,0,0,0
3006	.rva	ctr32_se_handler
3007.LSEH_info_xts_enc:
3008	.byte	9,0,0,0
3009	.rva	xts_se_handler
3010	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3011.LSEH_info_xts_dec:
3012	.byte	9,0,0,0
3013	.rva	xts_se_handler
3014	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3015___
# .xdata records emitted for every prefix.  .LSEH_info_key names no
# handler; its bytes describe the "sub rsp,8" that opens both key-setup
# routines.
3016$code.=<<___;
3017.LSEH_info_cbc:
3018	.byte	9,0,0,0
3019	.rva	cbc_se_handler
3020.LSEH_info_key:
3021	.byte	0x01,0x04,0x01,0x00
3022	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
3023___
3024}
3025
# rex(\@opcode, $dst, $src)
#
# Appends a REX prefix byte to the opcode array referenced by the first
# argument when either register number is 8 or above; otherwise the
# array is left untouched.
#
#   \@opcode	reference to the array of opcode bytes being assembled
#   $dst	register number that ends up in the ModR/M "reg" field
#   $src	register number that ends up in the ModR/M "r/m" field
#
# The array is modified in place through the reference.  Callers ignore
# the return value.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);	# REX.R extends ModR/M reg
    $rex|=0x01			if($src>=8);	# REX.B extends ModR/M r/m
    push @$opcode,$rex|0x40	if($rex);	# 0x40 is the fixed REX base
}
3035
# aesni($line)
#
# Translates one AES-NI instruction written in AT&T syntax with
# register-only operands into an equivalent ".byte" directive, so the
# output can be assembled by tools that do not know the mnemonics.
#
# Handled forms:
#   aeskeygenassist $imm,%xmmS,%xmmD	-> 66 [REX] 0F 3A DF /r ib
#   aesimc|aesenc|aesenclast|aesdec|aesdeclast %xmmS,%xmmD
#					-> 66 [REX] 0F 38 xx /r
#
# Returns the ".byte\t..." string for a recognised instruction.  Any
# other input, including an "aes*" mnemonic that is not in the table, is
# returned unchanged so that the assembler gets to see (and, if need be,
# reject) it rather than having the line silently dropped.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);			# mandatory operand-size prefix

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures right away; later matches would clobber them.
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# Immediates with a leading 0 (0x.., 0b.., octal) go through oct();
	# plain decimal is emitted as written.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: pass the line through untouched.
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
3062
# Post-process the accumulated assembly text and write it out.
#
# 1. Every `...` span is replaced by the result of eval()ing its
#    contents as Perl.
# 2. Every line fragment that starts with "aes" at a word boundary and
#    contains an %xmm register is handed to aesni(); the fragment, along
#    with anything that follows the last %xmmN on that line, is replaced
#    by what aesni() returns.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

# Output is buffered, so a failed or short write may only be reported
# when the handle is closed.  Fail loudly instead of leaving a truncated
# assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
3069