1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
21#
22# Performance.
23#
# Given the latency of the aes(enc|dec) instructions, the asymptotic
# performance for non-parallelizable modes such as CBC encrypt is 3.75
# cycles per byte processed with a 128-bit key. Given their throughput,
# the asymptotic performance for parallelizable modes is 1.25 cycles
# per byte. Being asymptotic limits, these are not figures one commonly
# achieves in reality, but how close does one get? Below are results
# collected for different modes and block sizes. Pairs of numbers are
# for en-/decryption.
32#
33#	16-byte     64-byte     256-byte    1-KB        8-KB
34# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
35# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
36# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
37# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
38# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
39# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
40#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that the otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with a specially crafted speed.c benchmark
# in order to compare them with the results reported in the "Intel
# Advanced Encryption Standard (AES) New Instruction Set" White Paper,
# Revision 3.0, dated May 2010. All of the above results are
# consistently better. This module also provides better performance for
# block sizes smaller than 128 bytes at points *not* represented in the
# above table.
50#
51# Looking at the results for 8-KB buffer.
52#
# CFB and OFB results are far from the limit, because the
# implementation uses the "generic" CRYPTO_[c|o]fb128_encrypt
# interfaces relying on single-block aesni_encrypt, which is not the
# optimal way to go. The CBC encrypt result is higher than one might
# expect; seemingly there is a small penalty for feeding the result
# back to the AES unit the way it's done in CBC mode. There is nothing
# one can do about it, and the result appears optimal. The CCM result
# is identical to CBC, because CBC-MAC is essentially CBC encrypt
# without saving the output. The CCM CTR component "stays invisible,"
# because it's neatly interleaved with CBC-MAC. This provides ~30%
# improvement over a "straightforward" CCM implementation with CTR and
# CBC-MAC performed disjointly. Parallelizable modes practically
# achieve the theoretical limit.
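#
# To illustrate what gets interleaved, one CCM encryption step looks
# conceptually like the C sketch below (not this module's code;
# xor_16() and ctr64_inc() are hypothetical helpers, AES_encrypt
# stands for any single-block encryption):
#
#	AES_encrypt(ctr_block, keystream, key);		/* E(ctr)		*/
#	xor_16(cmac, cmac, plaintext);			/* cmac ^= pt		*/
#	AES_encrypt(cmac, cmac, key);			/* cmac = E(cmac^pt)	*/
#	xor_16(ciphertext, plaintext, keystream);	/* ct = pt ^ E(ctr)	*/
#	ctr64_inc(ctr_block);				/* bump 64-bit counter	*/
#
# In the generated code the two block encryptions above do not run back
# to back; their rounds are issued in lockstep, which is where the ~30%
# saving comes from.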
66#
67# Looking at how results vary with buffer size.
68#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90%, of the "8-KB"
# one. The CTR curve doesn't follow this pattern and is the slowest-
# changing one, with the "256-byte" result being 87% of "8-KB." This
# is because the overhead in CTR mode is the most computationally
# intensive. Small-block CCM decrypt is slower than encrypt, because
# the first CTR and last CBC-MAC iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
# EVP-free results were observed to scale perfectly with the number of
# rounds for larger block sizes, i.e. the 192-bit result being 10/12
# times lower and the 256-bit one 10/14 times lower. In the CBC encrypt
# case the differences are a tad smaller, because the above mentioned
# penalty biases all results by the same constant value. In a similar
# way function call overhead affects small-block performance, as well
# as OFB and CFB results. The differences are not large; the most
# common coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0
# and 10/14.0), but one can observe even 10/11.2 and 10/12.4 (CTR,
# OFB, CFB)...
88
89# January 2011
90#
# While the Westmere processor features a 6-cycle latency for
# aes[enc|dec] instructions, which can be scheduled every second cycle,
# Sandy Bridge spends 8 cycles per instruction, but can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Hence this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
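#
# For reference, the arithmetic behind these limits in a minimal C
# sketch (illustration only; 6 and 8 are the per-instruction latencies
# discussed above, 10 is the round count for a 128-bit key):
#
#	double westmere_cbc_limit = 6.0 * 10 / 16;	/* 3.75 cycles/byte */
#	double bridge_cbc_limit   = 8.0 * 10 / 16;	/* 5.00 cycles/byte */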
116#
# For parallelizable modes, such as ECB, CBC decrypt and CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In that case the asymptotic limit for such modes
# can be obtained by dividing the above mentioned numbers by the AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that the optimal interleave factor
# is 3, and that's where the "magic" number of 1.25 comes from.
# "Optimal interleave factor" means that a further increase of the
# interleave factor does not improve performance. The formula has
# proven to reflect reality pretty well on Westmere... Sandy Bridge on
# the other hand can execute up to 8 AES instructions at a time, so how
# does varying the interleave factor affect performance? Here is a
# table for ECB (numbers are cycles per byte processed with a 128-bit
# key):
130#
131# instruction interleave factor		3x	6x	8x
132# theoretical asymptotic limit		1.67	0.83	0.625
133# measured performance for 8KB block	1.05	0.86	0.84
134#
135# "as if" interleave factor		4.7x	5.8x	6.0x
136#
137# Further data for other parallelizable modes:
138#
139# CBC decrypt				1.16	0.93	0.74
140# CTR					1.14	0.91	0.74
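#
# The "as if" factors in the ECB table are consistent with dividing the
# non-interleaved Sandy Bridge limit by the measured result, e.g. (a C
# sketch of the arithmetic only):
#
#	double serial_limit = 8.0 * 10 / 16;	/* 5.00 cycles/byte	*/
#	double as_if_3x = serial_limit / 1.05;	/* ~4.7x		*/
#	double as_if_6x = serial_limit / 0.86;	/* ~5.8x		*/
#	double as_if_8x = serial_limit / 0.84;	/* ~6.0x		*/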
141#
# Well, given the 3x column it's probably inappropriate to call the
# limit asymptotic if it can be surpassed, isn't it? What happens
# there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
# execution magic is responsible for this. The processor overlaps not
# only the additional instructions with the AES ones, but even AES
# instructions processing adjacent triplets of independent blocks. In
# the 6x case the additional instructions still claim a
# disproportionally small amount of additional cycles, but in the 8x
# case the number of instructions must be a tad too high for the
# out-of-order logic to cope with, and the AES unit remains
# underutilized... As you can see 8x interleave is hardly justifiable,
# so there is no need to feel bad that 32-bit aesni-x86.pl utilizes 6x
# interleave because of limited register bank capacity.
154#
# Higher interleave factors do have a negative impact on Westmere
# performance. While for ECB mode it's a negligible ~1.5%, other
# parallelizable modes perform ~5% worse, which is outweighed by the
# ~25% improvement on Sandy Bridge. To balance the regression on
# Westmere, CTR mode was implemented with a 6x aesenc interleave
# factor.
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
172######################################################################
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
176#		CBC en-/decrypt	CTR	XTS	ECB	OCB
177# Westmere	3.77/1.25	1.25	1.25	1.26
178# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
179# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
180# Skylake	2.62/0.63	0.63	0.63	0.63
181# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
182# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
183# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
184#
185# (*)	Atom Silvermont ECB result is suboptimal because of penalties
186#	incurred by operations on %xmm8-15. As ECB is not considered
187#	critical, nothing was done to mitigate the problem.
188
189$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
190			# generates drop-in replacement for
191			# crypto/aes/asm/aes-x86_64.pl:-)
192
193$flavour = shift;
194$output  = shift;
195if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
196
197$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
198
199$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
200( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
201( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
202die "can't locate x86_64-xlate.pl";
203
204open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
205*STDOUT=*OUT;
206
207$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
208@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
209		("%rdi","%rsi","%rdx","%rcx");	# Unix order
210
211$code=".text\n";
212$code.=".extern	OPENSSL_ia32cap_P\n";
213
214$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
215# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
216$inp="%rdi";
217$out="%rsi";
218$len="%rdx";
219$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
220$ivp="%r8";	# cbc, ctr, ...
221
222$rnds_="%r10d";	# backup copy for $rounds
223$key_="%r11";	# backup copy for $key
224
225# %xmm register layout
226$rndkey0="%xmm0";	$rndkey1="%xmm1";
227$inout0="%xmm2";	$inout1="%xmm3";
228$inout2="%xmm4";	$inout3="%xmm5";
229$inout4="%xmm6";	$inout5="%xmm7";
230$inout6="%xmm8";	$inout7="%xmm9";
231
232$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
233$in0="%xmm8";		$iv="%xmm9";
234
235# Inline version of internal aesni_[en|de]crypt1.
236#
237# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
238# cycles which take care of loop variables...
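#
# For orientation, a single 128-bit-key block run looks roughly like
# this C intrinsics sketch (an illustration of the aesenc/aesenclast
# pattern only, not code used by this module):
#
#	#include <wmmintrin.h>		/* AES-NI, compile with -maes */
#
#	static __m128i enc1_128(__m128i block, const __m128i rk[11])
#	{
#		int i;
#		block = _mm_xor_si128(block, rk[0]);	/* 0-round key */
#		for (i = 1; i < 10; i++)
#			block = _mm_aesenc_si128(block, rk[i]);
#		return _mm_aesenclast_si128(block, rk[10]);
#	}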
239{ my $sn;
240sub aesni_generate1 {
241my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
242++$sn;
243$code.=<<___;
244	$movkey	($key),$rndkey0
245	$movkey	16($key),$rndkey1
246___
247$code.=<<___ if (defined($ivec));
248	xorps	$rndkey0,$ivec
249	lea	32($key),$key
250	xorps	$ivec,$inout
251___
252$code.=<<___ if (!defined($ivec));
253	lea	32($key),$key
254	xorps	$rndkey0,$inout
255___
256$code.=<<___;
257.Loop_${p}1_$sn:
258	aes${p}	$rndkey1,$inout
259	dec	$rounds
260	$movkey	($key),$rndkey1
261	lea	16($key),$key
262	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
263	aes${p}last	$rndkey1,$inout
264___
265}}
266# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
267#
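# A caller-side sketch, assuming the AES_KEY has been expanded by this
# module's own key schedule routine (fragment for illustration only):
#
#	unsigned char in[16] = {0}, out[16];
#	aesni_encrypt(in, out, &key);		/* one 16-byte block */
#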
268{ my ($inp,$out,$key) = @_4args;
269
270$code.=<<___;
271.globl	${PREFIX}_encrypt
272.type	${PREFIX}_encrypt,\@abi-omnipotent
273.align	16
274${PREFIX}_encrypt:
275	movups	($inp),$inout0		# load input
276	mov	240($key),$rounds	# key->rounds
277___
278	&aesni_generate1("enc",$key,$rounds);
279$code.=<<___;
280	 pxor	$rndkey0,$rndkey0	# clear register bank
281	 pxor	$rndkey1,$rndkey1
282	movups	$inout0,($out)		# output
283	 pxor	$inout0,$inout0
284	ret
285.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
286
287.globl	${PREFIX}_decrypt
288.type	${PREFIX}_decrypt,\@abi-omnipotent
289.align	16
290${PREFIX}_decrypt:
291	movups	($inp),$inout0		# load input
292	mov	240($key),$rounds	# key->rounds
293___
294	&aesni_generate1("dec",$key,$rounds);
295$code.=<<___;
296	 pxor	$rndkey0,$rndkey0	# clear register bank
297	 pxor	$rndkey1,$rndkey1
298	movups	$inout0,($out)		# output
299	 pxor	$inout0,$inout0
300	ret
301.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
302___
303}
304
# _aesni_[en|de]cryptN are private interfaces, where N denotes the
# interleave factor. Why was the 3x subroutine originally used in
# loops? Even though aes[enc|dec] latency was originally 6, it could be
# scheduled only every *2nd* cycle. Thus 3x interleave was the one
# providing optimal utilization, i.e. the point at which the
# subroutine's per-block throughput is virtually the same as that of
# the non-interleaved subroutine [for up to 3 input blocks]. This is
# why it originally made no sense to implement a 2x subroutine. But
# times change and it became appropriate to spend an extra 192 bytes
# on a 2x subroutine on Atom Silvermont's account. For processors that
# can schedule aes[enc|dec] every cycle the optimal interleave factor
# equals the corresponding instruction's latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...
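#
# In C intrinsics terms, a 2x interleave amounts to the sketch below
# (pattern illustration only, fixed to a 10-round schedule; the real
# subroutines additionally walk the key schedule through a negative
# byte offset in %rax so one loop body serves all key sizes):
#
#	#include <wmmintrin.h>
#
#	static void enc2_128(__m128i *b0, __m128i *b1, const __m128i rk[11])
#	{
#		__m128i x0 = _mm_xor_si128(*b0, rk[0]);
#		__m128i x1 = _mm_xor_si128(*b1, rk[0]);
#		int i;
#		for (i = 1; i < 10; i++) {	/* rounds issued back to back */
#			x0 = _mm_aesenc_si128(x0, rk[i]);
#			x1 = _mm_aesenc_si128(x1, rk[i]);
#		}
#		*b0 = _mm_aesenclast_si128(x0, rk[10]);
#		*b1 = _mm_aesenclast_si128(x1, rk[10]);
#	}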
317
318sub aesni_generate2 {
319my $dir=shift;
320# As already mentioned it takes in $key and $rounds, which are *not*
321# preserved. $inout[0-1] is cipher/clear text...
322$code.=<<___;
323.type	_aesni_${dir}rypt2,\@abi-omnipotent
324.align	16
325_aesni_${dir}rypt2:
326	$movkey	($key),$rndkey0
327	shl	\$4,$rounds
328	$movkey	16($key),$rndkey1
329	xorps	$rndkey0,$inout0
330	xorps	$rndkey0,$inout1
331	$movkey	32($key),$rndkey0
332	lea	32($key,$rounds),$key
333	neg	%rax				# $rounds
334	add	\$16,%rax
335
336.L${dir}_loop2:
337	aes${dir}	$rndkey1,$inout0
338	aes${dir}	$rndkey1,$inout1
339	$movkey		($key,%rax),$rndkey1
340	add		\$32,%rax
341	aes${dir}	$rndkey0,$inout0
342	aes${dir}	$rndkey0,$inout1
343	$movkey		-16($key,%rax),$rndkey0
344	jnz		.L${dir}_loop2
345
346	aes${dir}	$rndkey1,$inout0
347	aes${dir}	$rndkey1,$inout1
348	aes${dir}last	$rndkey0,$inout0
349	aes${dir}last	$rndkey0,$inout1
350	ret
351.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
352___
353}
354sub aesni_generate3 {
355my $dir=shift;
356# As already mentioned it takes in $key and $rounds, which are *not*
357# preserved. $inout[0-2] is cipher/clear text...
358$code.=<<___;
359.type	_aesni_${dir}rypt3,\@abi-omnipotent
360.align	16
361_aesni_${dir}rypt3:
362	$movkey	($key),$rndkey0
363	shl	\$4,$rounds
364	$movkey	16($key),$rndkey1
365	xorps	$rndkey0,$inout0
366	xorps	$rndkey0,$inout1
367	xorps	$rndkey0,$inout2
368	$movkey	32($key),$rndkey0
369	lea	32($key,$rounds),$key
370	neg	%rax				# $rounds
371	add	\$16,%rax
372
373.L${dir}_loop3:
374	aes${dir}	$rndkey1,$inout0
375	aes${dir}	$rndkey1,$inout1
376	aes${dir}	$rndkey1,$inout2
377	$movkey		($key,%rax),$rndkey1
378	add		\$32,%rax
379	aes${dir}	$rndkey0,$inout0
380	aes${dir}	$rndkey0,$inout1
381	aes${dir}	$rndkey0,$inout2
382	$movkey		-16($key,%rax),$rndkey0
383	jnz		.L${dir}_loop3
384
385	aes${dir}	$rndkey1,$inout0
386	aes${dir}	$rndkey1,$inout1
387	aes${dir}	$rndkey1,$inout2
388	aes${dir}last	$rndkey0,$inout0
389	aes${dir}last	$rndkey0,$inout1
390	aes${dir}last	$rndkey0,$inout2
391	ret
392.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
393___
394}
# 4x interleave is implemented to improve small-block performance,
# most notably [and naturally] 4-block performance, by ~30%. One can
# argue that 5x should have been implemented as well, but the
# improvement would be <20%, so it's not worth it...
399sub aesni_generate4 {
400my $dir=shift;
401# As already mentioned it takes in $key and $rounds, which are *not*
402# preserved. $inout[0-3] is cipher/clear text...
403$code.=<<___;
404.type	_aesni_${dir}rypt4,\@abi-omnipotent
405.align	16
406_aesni_${dir}rypt4:
407	$movkey	($key),$rndkey0
408	shl	\$4,$rounds
409	$movkey	16($key),$rndkey1
410	xorps	$rndkey0,$inout0
411	xorps	$rndkey0,$inout1
412	xorps	$rndkey0,$inout2
413	xorps	$rndkey0,$inout3
414	$movkey	32($key),$rndkey0
415	lea	32($key,$rounds),$key
416	neg	%rax				# $rounds
417	.byte	0x0f,0x1f,0x00
418	add	\$16,%rax
419
420.L${dir}_loop4:
421	aes${dir}	$rndkey1,$inout0
422	aes${dir}	$rndkey1,$inout1
423	aes${dir}	$rndkey1,$inout2
424	aes${dir}	$rndkey1,$inout3
425	$movkey		($key,%rax),$rndkey1
426	add		\$32,%rax
427	aes${dir}	$rndkey0,$inout0
428	aes${dir}	$rndkey0,$inout1
429	aes${dir}	$rndkey0,$inout2
430	aes${dir}	$rndkey0,$inout3
431	$movkey		-16($key,%rax),$rndkey0
432	jnz		.L${dir}_loop4
433
434	aes${dir}	$rndkey1,$inout0
435	aes${dir}	$rndkey1,$inout1
436	aes${dir}	$rndkey1,$inout2
437	aes${dir}	$rndkey1,$inout3
438	aes${dir}last	$rndkey0,$inout0
439	aes${dir}last	$rndkey0,$inout1
440	aes${dir}last	$rndkey0,$inout2
441	aes${dir}last	$rndkey0,$inout3
442	ret
443.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
444___
445}
446sub aesni_generate6 {
447my $dir=shift;
448# As already mentioned it takes in $key and $rounds, which are *not*
449# preserved. $inout[0-5] is cipher/clear text...
450$code.=<<___;
451.type	_aesni_${dir}rypt6,\@abi-omnipotent
452.align	16
453_aesni_${dir}rypt6:
454	$movkey		($key),$rndkey0
455	shl		\$4,$rounds
456	$movkey		16($key),$rndkey1
457	xorps		$rndkey0,$inout0
458	pxor		$rndkey0,$inout1
459	pxor		$rndkey0,$inout2
460	aes${dir}	$rndkey1,$inout0
461	lea		32($key,$rounds),$key
462	neg		%rax			# $rounds
463	aes${dir}	$rndkey1,$inout1
464	pxor		$rndkey0,$inout3
465	pxor		$rndkey0,$inout4
466	aes${dir}	$rndkey1,$inout2
467	pxor		$rndkey0,$inout5
468	$movkey		($key,%rax),$rndkey0
469	add		\$16,%rax
470	jmp		.L${dir}_loop6_enter
471.align	16
472.L${dir}_loop6:
473	aes${dir}	$rndkey1,$inout0
474	aes${dir}	$rndkey1,$inout1
475	aes${dir}	$rndkey1,$inout2
476.L${dir}_loop6_enter:
477	aes${dir}	$rndkey1,$inout3
478	aes${dir}	$rndkey1,$inout4
479	aes${dir}	$rndkey1,$inout5
480	$movkey		($key,%rax),$rndkey1
481	add		\$32,%rax
482	aes${dir}	$rndkey0,$inout0
483	aes${dir}	$rndkey0,$inout1
484	aes${dir}	$rndkey0,$inout2
485	aes${dir}	$rndkey0,$inout3
486	aes${dir}	$rndkey0,$inout4
487	aes${dir}	$rndkey0,$inout5
488	$movkey		-16($key,%rax),$rndkey0
489	jnz		.L${dir}_loop6
490
491	aes${dir}	$rndkey1,$inout0
492	aes${dir}	$rndkey1,$inout1
493	aes${dir}	$rndkey1,$inout2
494	aes${dir}	$rndkey1,$inout3
495	aes${dir}	$rndkey1,$inout4
496	aes${dir}	$rndkey1,$inout5
497	aes${dir}last	$rndkey0,$inout0
498	aes${dir}last	$rndkey0,$inout1
499	aes${dir}last	$rndkey0,$inout2
500	aes${dir}last	$rndkey0,$inout3
501	aes${dir}last	$rndkey0,$inout4
502	aes${dir}last	$rndkey0,$inout5
503	ret
504.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
505___
506}
507sub aesni_generate8 {
508my $dir=shift;
509# As already mentioned it takes in $key and $rounds, which are *not*
510# preserved. $inout[0-7] is cipher/clear text...
511$code.=<<___;
512.type	_aesni_${dir}rypt8,\@abi-omnipotent
513.align	16
514_aesni_${dir}rypt8:
515	$movkey		($key),$rndkey0
516	shl		\$4,$rounds
517	$movkey		16($key),$rndkey1
518	xorps		$rndkey0,$inout0
519	xorps		$rndkey0,$inout1
520	pxor		$rndkey0,$inout2
521	pxor		$rndkey0,$inout3
522	pxor		$rndkey0,$inout4
523	lea		32($key,$rounds),$key
524	neg		%rax			# $rounds
525	aes${dir}	$rndkey1,$inout0
526	pxor		$rndkey0,$inout5
527	pxor		$rndkey0,$inout6
528	aes${dir}	$rndkey1,$inout1
529	pxor		$rndkey0,$inout7
530	$movkey		($key,%rax),$rndkey0
531	add		\$16,%rax
532	jmp		.L${dir}_loop8_inner
533.align	16
534.L${dir}_loop8:
535	aes${dir}	$rndkey1,$inout0
536	aes${dir}	$rndkey1,$inout1
537.L${dir}_loop8_inner:
538	aes${dir}	$rndkey1,$inout2
539	aes${dir}	$rndkey1,$inout3
540	aes${dir}	$rndkey1,$inout4
541	aes${dir}	$rndkey1,$inout5
542	aes${dir}	$rndkey1,$inout6
543	aes${dir}	$rndkey1,$inout7
544.L${dir}_loop8_enter:
545	$movkey		($key,%rax),$rndkey1
546	add		\$32,%rax
547	aes${dir}	$rndkey0,$inout0
548	aes${dir}	$rndkey0,$inout1
549	aes${dir}	$rndkey0,$inout2
550	aes${dir}	$rndkey0,$inout3
551	aes${dir}	$rndkey0,$inout4
552	aes${dir}	$rndkey0,$inout5
553	aes${dir}	$rndkey0,$inout6
554	aes${dir}	$rndkey0,$inout7
555	$movkey		-16($key,%rax),$rndkey0
556	jnz		.L${dir}_loop8
557
558	aes${dir}	$rndkey1,$inout0
559	aes${dir}	$rndkey1,$inout1
560	aes${dir}	$rndkey1,$inout2
561	aes${dir}	$rndkey1,$inout3
562	aes${dir}	$rndkey1,$inout4
563	aes${dir}	$rndkey1,$inout5
564	aes${dir}	$rndkey1,$inout6
565	aes${dir}	$rndkey1,$inout7
566	aes${dir}last	$rndkey0,$inout0
567	aes${dir}last	$rndkey0,$inout1
568	aes${dir}last	$rndkey0,$inout2
569	aes${dir}last	$rndkey0,$inout3
570	aes${dir}last	$rndkey0,$inout4
571	aes${dir}last	$rndkey0,$inout5
572	aes${dir}last	$rndkey0,$inout6
573	aes${dir}last	$rndkey0,$inout7
574	ret
575.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
576___
577}
578&aesni_generate2("enc") if ($PREFIX eq "aesni");
579&aesni_generate2("dec");
580&aesni_generate3("enc") if ($PREFIX eq "aesni");
581&aesni_generate3("dec");
582&aesni_generate4("enc") if ($PREFIX eq "aesni");
583&aesni_generate4("dec");
584&aesni_generate6("enc") if ($PREFIX eq "aesni");
585&aesni_generate6("dec");
586&aesni_generate8("enc") if ($PREFIX eq "aesni");
587&aesni_generate8("dec");
588
589if ($PREFIX eq "aesni") {
590########################################################################
591# void aesni_ecb_encrypt (const void *in, void *out,
592#			  size_t length, const AES_KEY *key,
593#			  int enc);
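#
# The control flow below, as a hedged C sketch (encrypt8 and
# encrypt_tail are hypothetical stand-ins for the 8x subroutine and the
# 1..7-block tail handling; the 5th argument selects the encrypt or
# decrypt path):
#
#	len &= ~(size_t)15;			/* whole blocks only	*/
#	while (len >= 8*16) {
#		encrypt8(in, out, key);		/* or decrypt8		*/
#		in += 8*16; out += 8*16; len -= 8*16;
#	}
#	if (len)				/* 1..7 leftover blocks	*/
#		encrypt_tail(in, out, len/16, key);
#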
594$code.=<<___;
595.globl	aesni_ecb_encrypt
596.type	aesni_ecb_encrypt,\@function,5
597.align	16
598aesni_ecb_encrypt:
599___
600$code.=<<___ if ($win64);
601	lea	-0x58(%rsp),%rsp
602	movaps	%xmm6,(%rsp)		# offload $inout4..7
603	movaps	%xmm7,0x10(%rsp)
604	movaps	%xmm8,0x20(%rsp)
605	movaps	%xmm9,0x30(%rsp)
606.Lecb_enc_body:
607___
608$code.=<<___;
609	and	\$-16,$len		# if ($len<16)
610	jz	.Lecb_ret		# return
611
612	mov	240($key),$rounds	# key->rounds
613	$movkey	($key),$rndkey0
614	mov	$key,$key_		# backup $key
615	mov	$rounds,$rnds_		# backup $rounds
616	test	%r8d,%r8d		# 5th argument
617	jz	.Lecb_decrypt
618#--------------------------- ECB ENCRYPT ------------------------------#
619	cmp	\$0x80,$len		# if ($len<8*16)
620	jb	.Lecb_enc_tail		# short input
621
622	movdqu	($inp),$inout0		# load 8 input blocks
623	movdqu	0x10($inp),$inout1
624	movdqu	0x20($inp),$inout2
625	movdqu	0x30($inp),$inout3
626	movdqu	0x40($inp),$inout4
627	movdqu	0x50($inp),$inout5
628	movdqu	0x60($inp),$inout6
629	movdqu	0x70($inp),$inout7
630	lea	0x80($inp),$inp		# $inp+=8*16
631	sub	\$0x80,$len		# $len-=8*16 (can be zero)
632	jmp	.Lecb_enc_loop8_enter
633.align 16
634.Lecb_enc_loop8:
635	movups	$inout0,($out)		# store 8 output blocks
636	mov	$key_,$key		# restore $key
637	movdqu	($inp),$inout0		# load 8 input blocks
638	mov	$rnds_,$rounds		# restore $rounds
639	movups	$inout1,0x10($out)
640	movdqu	0x10($inp),$inout1
641	movups	$inout2,0x20($out)
642	movdqu	0x20($inp),$inout2
643	movups	$inout3,0x30($out)
644	movdqu	0x30($inp),$inout3
645	movups	$inout4,0x40($out)
646	movdqu	0x40($inp),$inout4
647	movups	$inout5,0x50($out)
648	movdqu	0x50($inp),$inout5
649	movups	$inout6,0x60($out)
650	movdqu	0x60($inp),$inout6
651	movups	$inout7,0x70($out)
652	lea	0x80($out),$out		# $out+=8*16
653	movdqu	0x70($inp),$inout7
654	lea	0x80($inp),$inp		# $inp+=8*16
655.Lecb_enc_loop8_enter:
656
657	call	_aesni_encrypt8
658
659	sub	\$0x80,$len
660	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow
661
662	movups	$inout0,($out)		# store 8 output blocks
663	mov	$key_,$key		# restore $key
664	movups	$inout1,0x10($out)
665	mov	$rnds_,$rounds		# restore $rounds
666	movups	$inout2,0x20($out)
667	movups	$inout3,0x30($out)
668	movups	$inout4,0x40($out)
669	movups	$inout5,0x50($out)
670	movups	$inout6,0x60($out)
671	movups	$inout7,0x70($out)
672	lea	0x80($out),$out		# $out+=8*16
673	add	\$0x80,$len		# restore real remaining $len
674	jz	.Lecb_ret		# done if ($len==0)
675
676.Lecb_enc_tail:				# $len is less than 8*16
677	movups	($inp),$inout0
678	cmp	\$0x20,$len
679	jb	.Lecb_enc_one
680	movups	0x10($inp),$inout1
681	je	.Lecb_enc_two
682	movups	0x20($inp),$inout2
683	cmp	\$0x40,$len
684	jb	.Lecb_enc_three
685	movups	0x30($inp),$inout3
686	je	.Lecb_enc_four
687	movups	0x40($inp),$inout4
688	cmp	\$0x60,$len
689	jb	.Lecb_enc_five
690	movups	0x50($inp),$inout5
691	je	.Lecb_enc_six
692	movdqu	0x60($inp),$inout6
693	xorps	$inout7,$inout7
694	call	_aesni_encrypt8
695	movups	$inout0,($out)		# store 7 output blocks
696	movups	$inout1,0x10($out)
697	movups	$inout2,0x20($out)
698	movups	$inout3,0x30($out)
699	movups	$inout4,0x40($out)
700	movups	$inout5,0x50($out)
701	movups	$inout6,0x60($out)
702	jmp	.Lecb_ret
703.align	16
704.Lecb_enc_one:
705___
706	&aesni_generate1("enc",$key,$rounds);
707$code.=<<___;
708	movups	$inout0,($out)		# store one output block
709	jmp	.Lecb_ret
710.align	16
711.Lecb_enc_two:
712	call	_aesni_encrypt2
713	movups	$inout0,($out)		# store 2 output blocks
714	movups	$inout1,0x10($out)
715	jmp	.Lecb_ret
716.align	16
717.Lecb_enc_three:
718	call	_aesni_encrypt3
719	movups	$inout0,($out)		# store 3 output blocks
720	movups	$inout1,0x10($out)
721	movups	$inout2,0x20($out)
722	jmp	.Lecb_ret
723.align	16
724.Lecb_enc_four:
725	call	_aesni_encrypt4
726	movups	$inout0,($out)		# store 4 output blocks
727	movups	$inout1,0x10($out)
728	movups	$inout2,0x20($out)
729	movups	$inout3,0x30($out)
730	jmp	.Lecb_ret
731.align	16
732.Lecb_enc_five:
733	xorps	$inout5,$inout5
734	call	_aesni_encrypt6
735	movups	$inout0,($out)		# store 5 output blocks
736	movups	$inout1,0x10($out)
737	movups	$inout2,0x20($out)
738	movups	$inout3,0x30($out)
739	movups	$inout4,0x40($out)
740	jmp	.Lecb_ret
741.align	16
742.Lecb_enc_six:
743	call	_aesni_encrypt6
744	movups	$inout0,($out)		# store 6 output blocks
745	movups	$inout1,0x10($out)
746	movups	$inout2,0x20($out)
747	movups	$inout3,0x30($out)
748	movups	$inout4,0x40($out)
749	movups	$inout5,0x50($out)
750	jmp	.Lecb_ret
751#--------------------------- ECB DECRYPT ------------------------------#
752.align	16
753.Lecb_decrypt:
754	cmp	\$0x80,$len		# if ($len<8*16)
755	jb	.Lecb_dec_tail		# short input
756
757	movdqu	($inp),$inout0		# load 8 input blocks
758	movdqu	0x10($inp),$inout1
759	movdqu	0x20($inp),$inout2
760	movdqu	0x30($inp),$inout3
761	movdqu	0x40($inp),$inout4
762	movdqu	0x50($inp),$inout5
763	movdqu	0x60($inp),$inout6
764	movdqu	0x70($inp),$inout7
765	lea	0x80($inp),$inp		# $inp+=8*16
766	sub	\$0x80,$len		# $len-=8*16 (can be zero)
767	jmp	.Lecb_dec_loop8_enter
768.align 16
769.Lecb_dec_loop8:
770	movups	$inout0,($out)		# store 8 output blocks
771	mov	$key_,$key		# restore $key
772	movdqu	($inp),$inout0		# load 8 input blocks
773	mov	$rnds_,$rounds		# restore $rounds
774	movups	$inout1,0x10($out)
775	movdqu	0x10($inp),$inout1
776	movups	$inout2,0x20($out)
777	movdqu	0x20($inp),$inout2
778	movups	$inout3,0x30($out)
779	movdqu	0x30($inp),$inout3
780	movups	$inout4,0x40($out)
781	movdqu	0x40($inp),$inout4
782	movups	$inout5,0x50($out)
783	movdqu	0x50($inp),$inout5
784	movups	$inout6,0x60($out)
785	movdqu	0x60($inp),$inout6
786	movups	$inout7,0x70($out)
787	lea	0x80($out),$out		# $out+=8*16
788	movdqu	0x70($inp),$inout7
789	lea	0x80($inp),$inp		# $inp+=8*16
790.Lecb_dec_loop8_enter:
791
792	call	_aesni_decrypt8
793
794	$movkey	($key_),$rndkey0
795	sub	\$0x80,$len
796	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow
797
798	movups	$inout0,($out)		# store 8 output blocks
799	 pxor	$inout0,$inout0		# clear register bank
800	mov	$key_,$key		# restore $key
801	movups	$inout1,0x10($out)
802	 pxor	$inout1,$inout1
803	mov	$rnds_,$rounds		# restore $rounds
804	movups	$inout2,0x20($out)
805	 pxor	$inout2,$inout2
806	movups	$inout3,0x30($out)
807	 pxor	$inout3,$inout3
808	movups	$inout4,0x40($out)
809	 pxor	$inout4,$inout4
810	movups	$inout5,0x50($out)
811	 pxor	$inout5,$inout5
812	movups	$inout6,0x60($out)
813	 pxor	$inout6,$inout6
814	movups	$inout7,0x70($out)
815	 pxor	$inout7,$inout7
816	lea	0x80($out),$out		# $out+=8*16
817	add	\$0x80,$len		# restore real remaining $len
818	jz	.Lecb_ret		# done if ($len==0)
819
820.Lecb_dec_tail:
821	movups	($inp),$inout0
822	cmp	\$0x20,$len
823	jb	.Lecb_dec_one
824	movups	0x10($inp),$inout1
825	je	.Lecb_dec_two
826	movups	0x20($inp),$inout2
827	cmp	\$0x40,$len
828	jb	.Lecb_dec_three
829	movups	0x30($inp),$inout3
830	je	.Lecb_dec_four
831	movups	0x40($inp),$inout4
832	cmp	\$0x60,$len
833	jb	.Lecb_dec_five
834	movups	0x50($inp),$inout5
835	je	.Lecb_dec_six
836	movups	0x60($inp),$inout6
837	$movkey	($key),$rndkey0
838	xorps	$inout7,$inout7
839	call	_aesni_decrypt8
840	movups	$inout0,($out)		# store 7 output blocks
841	 pxor	$inout0,$inout0		# clear register bank
842	movups	$inout1,0x10($out)
843	 pxor	$inout1,$inout1
844	movups	$inout2,0x20($out)
845	 pxor	$inout2,$inout2
846	movups	$inout3,0x30($out)
847	 pxor	$inout3,$inout3
848	movups	$inout4,0x40($out)
849	 pxor	$inout4,$inout4
850	movups	$inout5,0x50($out)
851	 pxor	$inout5,$inout5
852	movups	$inout6,0x60($out)
853	 pxor	$inout6,$inout6
854	 pxor	$inout7,$inout7
855	jmp	.Lecb_ret
856.align	16
857.Lecb_dec_one:
858___
859	&aesni_generate1("dec",$key,$rounds);
860$code.=<<___;
861	movups	$inout0,($out)		# store one output block
862	 pxor	$inout0,$inout0		# clear register bank
863	jmp	.Lecb_ret
864.align	16
865.Lecb_dec_two:
866	call	_aesni_decrypt2
867	movups	$inout0,($out)		# store 2 output blocks
868	 pxor	$inout0,$inout0		# clear register bank
869	movups	$inout1,0x10($out)
870	 pxor	$inout1,$inout1
871	jmp	.Lecb_ret
872.align	16
873.Lecb_dec_three:
874	call	_aesni_decrypt3
875	movups	$inout0,($out)		# store 3 output blocks
876	 pxor	$inout0,$inout0		# clear register bank
877	movups	$inout1,0x10($out)
878	 pxor	$inout1,$inout1
879	movups	$inout2,0x20($out)
880	 pxor	$inout2,$inout2
881	jmp	.Lecb_ret
882.align	16
883.Lecb_dec_four:
884	call	_aesni_decrypt4
885	movups	$inout0,($out)		# store 4 output blocks
886	 pxor	$inout0,$inout0		# clear register bank
887	movups	$inout1,0x10($out)
888	 pxor	$inout1,$inout1
889	movups	$inout2,0x20($out)
890	 pxor	$inout2,$inout2
891	movups	$inout3,0x30($out)
892	 pxor	$inout3,$inout3
893	jmp	.Lecb_ret
894.align	16
895.Lecb_dec_five:
896	xorps	$inout5,$inout5
897	call	_aesni_decrypt6
898	movups	$inout0,($out)		# store 5 output blocks
899	 pxor	$inout0,$inout0		# clear register bank
900	movups	$inout1,0x10($out)
901	 pxor	$inout1,$inout1
902	movups	$inout2,0x20($out)
903	 pxor	$inout2,$inout2
904	movups	$inout3,0x30($out)
905	 pxor	$inout3,$inout3
906	movups	$inout4,0x40($out)
907	 pxor	$inout4,$inout4
908	 pxor	$inout5,$inout5
909	jmp	.Lecb_ret
910.align	16
911.Lecb_dec_six:
912	call	_aesni_decrypt6
913	movups	$inout0,($out)		# store 6 output blocks
914	 pxor	$inout0,$inout0		# clear register bank
915	movups	$inout1,0x10($out)
916	 pxor	$inout1,$inout1
917	movups	$inout2,0x20($out)
918	 pxor	$inout2,$inout2
919	movups	$inout3,0x30($out)
920	 pxor	$inout3,$inout3
921	movups	$inout4,0x40($out)
922	 pxor	$inout4,$inout4
923	movups	$inout5,0x50($out)
924	 pxor	$inout5,$inout5
925
926.Lecb_ret:
927	xorps	$rndkey0,$rndkey0	# %xmm0
928	pxor	$rndkey1,$rndkey1
929___
930$code.=<<___ if ($win64);
931	movaps	(%rsp),%xmm6
932	movaps	%xmm0,(%rsp)		# clear stack
933	movaps	0x10(%rsp),%xmm7
934	movaps	%xmm0,0x10(%rsp)
935	movaps	0x20(%rsp),%xmm8
936	movaps	%xmm0,0x20(%rsp)
937	movaps	0x30(%rsp),%xmm9
938	movaps	%xmm0,0x30(%rsp)
939	lea	0x58(%rsp),%rsp
940.Lecb_enc_ret:
941___
942$code.=<<___;
943	ret
944.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
945___
946
947{
948######################################################################
949# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
950#                         size_t blocks, const AES_KEY *key,
951#                         const char *ivec,char *cmac);
952#
953# Handles only complete blocks, operates on 64-bit counter and
954# does not update *ivec! Nor does it finalize CMAC value
955# (see engine/eng_aesni.c for details)
956#
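# As a hedged scalar sketch of the per-block semantics (AES_encrypt
# stands for a single-block encryption, xor_16/ctr64_inc are
# hypothetical helpers; the counter is a local copy, so the caller's
# ivec is never written back):
#
#	for (i = 0; i < blocks; i++) {
#		AES_encrypt(ctr, ks, key);		/* keystream		*/
#		xor_16(cmac, cmac, in + 16*i);		/* absorb plaintext	*/
#		AES_encrypt(cmac, cmac, key);		/* CBC-MAC step		*/
#		xor_16(out + 16*i, in + 16*i, ks);	/* ct = pt ^ keystream	*/
#		ctr64_inc(ctr);				/* 64-bit counter only	*/
#	}
#
# Decryption differs only in that the CBC-MAC absorbs the recovered
# plaintext: pt = ct ^ E(ctr), then cmac = E(cmac ^ pt).
#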
957{
958my $cmac="%r9";	# 6th argument
959
960my $increment="%xmm9";
961my $iv="%xmm6";
962my $bswap_mask="%xmm7";
963
964$code.=<<___;
965.globl	aesni_ccm64_encrypt_blocks
966.type	aesni_ccm64_encrypt_blocks,\@function,6
967.align	16
968aesni_ccm64_encrypt_blocks:
969___
970$code.=<<___ if ($win64);
971	lea	-0x58(%rsp),%rsp
972	movaps	%xmm6,(%rsp)		# $iv
973	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
974	movaps	%xmm8,0x20(%rsp)	# $in0
975	movaps	%xmm9,0x30(%rsp)	# $increment
976.Lccm64_enc_body:
977___
978$code.=<<___;
979	mov	240($key),$rounds		# key->rounds
980	movdqu	($ivp),$iv
981	movdqa	.Lincrement64(%rip),$increment
982	movdqa	.Lbswap_mask(%rip),$bswap_mask
983
984	shl	\$4,$rounds
985	mov	\$16,$rnds_
986	lea	0($key),$key_
987	movdqu	($cmac),$inout1
988	movdqa	$iv,$inout0
989	lea	32($key,$rounds),$key		# end of key schedule
990	pshufb	$bswap_mask,$iv
991	sub	%rax,%r10			# twisted $rounds
992	jmp	.Lccm64_enc_outer
993.align	16
994.Lccm64_enc_outer:
995	$movkey	($key_),$rndkey0
996	mov	%r10,%rax
997	movups	($inp),$in0			# load inp
998
999	xorps	$rndkey0,$inout0		# counter
1000	$movkey	16($key_),$rndkey1
1001	xorps	$in0,$rndkey0
1002	xorps	$rndkey0,$inout1		# cmac^=inp
1003	$movkey	32($key_),$rndkey0
1004
1005.Lccm64_enc2_loop:
1006	aesenc	$rndkey1,$inout0
1007	aesenc	$rndkey1,$inout1
1008	$movkey	($key,%rax),$rndkey1
1009	add	\$32,%rax
1010	aesenc	$rndkey0,$inout0
1011	aesenc	$rndkey0,$inout1
1012	$movkey	-16($key,%rax),$rndkey0
1013	jnz	.Lccm64_enc2_loop
1014	aesenc	$rndkey1,$inout0
1015	aesenc	$rndkey1,$inout1
1016	paddq	$increment,$iv
1017	dec	$len				# $len-- ($len is in blocks)
1018	aesenclast	$rndkey0,$inout0
1019	aesenclast	$rndkey0,$inout1
1020
1021	lea	16($inp),$inp
1022	xorps	$inout0,$in0			# inp ^= E(iv)
1023	movdqa	$iv,$inout0
1024	movups	$in0,($out)			# save output
1025	pshufb	$bswap_mask,$inout0
1026	lea	16($out),$out			# $out+=16
1027	jnz	.Lccm64_enc_outer		# loop if ($len!=0)
1028
1029	 pxor	$rndkey0,$rndkey0		# clear register bank
1030	 pxor	$rndkey1,$rndkey1
1031	 pxor	$inout0,$inout0
1032	movups	$inout1,($cmac)			# store resulting mac
1033	 pxor	$inout1,$inout1
1034	 pxor	$in0,$in0
1035	 pxor	$iv,$iv
1036___
1037$code.=<<___ if ($win64);
1038	movaps	(%rsp),%xmm6
1039	movaps	%xmm0,(%rsp)			# clear stack
1040	movaps	0x10(%rsp),%xmm7
1041	movaps	%xmm0,0x10(%rsp)
1042	movaps	0x20(%rsp),%xmm8
1043	movaps	%xmm0,0x20(%rsp)
1044	movaps	0x30(%rsp),%xmm9
1045	movaps	%xmm0,0x30(%rsp)
1046	lea	0x58(%rsp),%rsp
1047.Lccm64_enc_ret:
1048___
1049$code.=<<___;
1050	ret
1051.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1052___
1053######################################################################
1054$code.=<<___;
1055.globl	aesni_ccm64_decrypt_blocks
1056.type	aesni_ccm64_decrypt_blocks,\@function,6
1057.align	16
1058aesni_ccm64_decrypt_blocks:
1059___
1060$code.=<<___ if ($win64);
1061	lea	-0x58(%rsp),%rsp
1062	movaps	%xmm6,(%rsp)		# $iv
1063	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
1064	movaps	%xmm8,0x20(%rsp)	# $in8
1065	movaps	%xmm9,0x30(%rsp)	# $increment
1066.Lccm64_dec_body:
1067___
1068$code.=<<___;
1069	mov	240($key),$rounds		# key->rounds
1070	movups	($ivp),$iv
1071	movdqu	($cmac),$inout1
1072	movdqa	.Lincrement64(%rip),$increment
1073	movdqa	.Lbswap_mask(%rip),$bswap_mask
1074
1075	movaps	$iv,$inout0
1076	mov	$rounds,$rnds_
1077	mov	$key,$key_
1078	pshufb	$bswap_mask,$iv
1079___
1080	&aesni_generate1("enc",$key,$rounds);
1081$code.=<<___;
1082	shl	\$4,$rnds_
1083	mov	\$16,$rounds
1084	movups	($inp),$in0			# load inp
1085	paddq	$increment,$iv
1086	lea	16($inp),$inp			# $inp+=16
1087	sub	%r10,%rax			# twisted $rounds
1088	lea	32($key_,$rnds_),$key		# end of key schedule
1089	mov	%rax,%r10
1090	jmp	.Lccm64_dec_outer
1091.align	16
1092.Lccm64_dec_outer:
1093	xorps	$inout0,$in0			# inp ^= E(iv)
1094	movdqa	$iv,$inout0
1095	movups	$in0,($out)			# save output
1096	lea	16($out),$out			# $out+=16
1097	pshufb	$bswap_mask,$inout0
1098
1099	sub	\$1,$len			# $len-- ($len is in blocks)
1100	jz	.Lccm64_dec_break		# if ($len==0) break
1101
1102	$movkey	($key_),$rndkey0
1103	mov	%r10,%rax
1104	$movkey	16($key_),$rndkey1
1105	xorps	$rndkey0,$in0
1106	xorps	$rndkey0,$inout0
1107	xorps	$in0,$inout1			# cmac^=out
1108	$movkey	32($key_),$rndkey0
1109	jmp	.Lccm64_dec2_loop
1110.align	16
1111.Lccm64_dec2_loop:
1112	aesenc	$rndkey1,$inout0
1113	aesenc	$rndkey1,$inout1
1114	$movkey	($key,%rax),$rndkey1
1115	add	\$32,%rax
1116	aesenc	$rndkey0,$inout0
1117	aesenc	$rndkey0,$inout1
1118	$movkey	-16($key,%rax),$rndkey0
1119	jnz	.Lccm64_dec2_loop
1120	movups	($inp),$in0			# load input
1121	paddq	$increment,$iv
1122	aesenc	$rndkey1,$inout0
1123	aesenc	$rndkey1,$inout1
1124	aesenclast	$rndkey0,$inout0
1125	aesenclast	$rndkey0,$inout1
1126	lea	16($inp),$inp			# $inp+=16
1127	jmp	.Lccm64_dec_outer
1128
1129.align	16
1130.Lccm64_dec_break:
1131	#xorps	$in0,$inout1			# cmac^=out
1132	mov	240($key_),$rounds
1133___
1134	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
1135$code.=<<___;
1136	 pxor	$rndkey0,$rndkey0		# clear register bank
1137	 pxor	$rndkey1,$rndkey1
1138	 pxor	$inout0,$inout0
1139	movups	$inout1,($cmac)			# store resulting mac
1140	 pxor	$inout1,$inout1
1141	 pxor	$in0,$in0
1142	 pxor	$iv,$iv
1143___
1144$code.=<<___ if ($win64);
1145	movaps	(%rsp),%xmm6
1146	movaps	%xmm0,(%rsp)			# clear stack
1147	movaps	0x10(%rsp),%xmm7
1148	movaps	%xmm0,0x10(%rsp)
1149	movaps	0x20(%rsp),%xmm8
1150	movaps	%xmm0,0x20(%rsp)
1151	movaps	0x30(%rsp),%xmm9
1152	movaps	%xmm0,0x30(%rsp)
1153	lea	0x58(%rsp),%rsp
1154.Lccm64_dec_ret:
1155___
1156$code.=<<___;
1157	ret
1158.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1159___
1160}
1161######################################################################
1162# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1163#                         size_t blocks, const AES_KEY *key,
1164#                         const char *ivec);
1165#
1166# Handles only complete blocks, operates on 32-bit counter and
1167# does not update *ivec! (see crypto/modes/ctr128.c for details)
1168#
1169# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1170# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1171# Keywords are full unroll and modulo-schedule counter calculations
1172# with zero-round key xor.
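#
# The "zero-round key xor" means the counter blocks parked on the stack
# are stored already xor-ed with round key 0, so fetching one doubles
# as the first AddRoundKey. A hedged C sketch of that setup (load_be32,
# store_be32 and xor_16 are hypothetical helpers):
#
#	uint32_t ctr = load_be32(ivec + 12);	/* 32-bit BE counter	*/
#	memcpy(blk, ivec, 16);
#	for (i = 0; i < 8; i++) {
#		store_be32(blk + 12, ctr + i);
#		xor_16(state[i], blk, rk0);	/* counter ^ round[0]	*/
#	}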
1173{
1174my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1175my ($key0,$ctr)=("%ebp","${ivp}d");
1176my $frame_size = 0x80 + ($win64?160:0);
1177
1178$code.=<<___;
1179.globl	aesni_ctr32_encrypt_blocks
1180.type	aesni_ctr32_encrypt_blocks,\@function,5
1181.align	16
1182aesni_ctr32_encrypt_blocks:
1183	cmp	\$1,$len
1184	jne	.Lctr32_bulk
1185
1186	# handle single block without allocating stack frame,
1187	# useful when handling edges
1188	movups	($ivp),$inout0
1189	movups	($inp),$inout1
1190	mov	240($key),%edx			# key->rounds
1191___
1192	&aesni_generate1("enc",$key,"%edx");
1193$code.=<<___;
1194	 pxor	$rndkey0,$rndkey0		# clear register bank
1195	 pxor	$rndkey1,$rndkey1
1196	xorps	$inout1,$inout0
1197	 pxor	$inout1,$inout1
1198	movups	$inout0,($out)
1199	 xorps	$inout0,$inout0
1200	jmp	.Lctr32_epilogue
1201
1202.align	16
1203.Lctr32_bulk:
1204	lea	(%rsp),$key_			# use $key_ as frame pointer
1205	push	%rbp
1206	sub	\$$frame_size,%rsp
1207	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1208___
1209$code.=<<___ if ($win64);
1210	movaps	%xmm6,-0xa8($key_)		# offload everything
1211	movaps	%xmm7,-0x98($key_)
1212	movaps	%xmm8,-0x88($key_)
1213	movaps	%xmm9,-0x78($key_)
1214	movaps	%xmm10,-0x68($key_)
1215	movaps	%xmm11,-0x58($key_)
1216	movaps	%xmm12,-0x48($key_)
1217	movaps	%xmm13,-0x38($key_)
1218	movaps	%xmm14,-0x28($key_)
1219	movaps	%xmm15,-0x18($key_)
1220.Lctr32_body:
1221___
1222$code.=<<___;
1223
1224	# 8 16-byte words on top of stack are counter values
1225	# xor-ed with zero-round key
1226
1227	movdqu	($ivp),$inout0
1228	movdqu	($key),$rndkey0
1229	mov	12($ivp),$ctr			# counter LSB
1230	pxor	$rndkey0,$inout0
1231	mov	12($key),$key0			# 0-round key LSB
1232	movdqa	$inout0,0x00(%rsp)		# populate counter block
1233	bswap	$ctr
1234	movdqa	$inout0,$inout1
1235	movdqa	$inout0,$inout2
1236	movdqa	$inout0,$inout3
1237	movdqa	$inout0,0x40(%rsp)
1238	movdqa	$inout0,0x50(%rsp)
1239	movdqa	$inout0,0x60(%rsp)
1240	mov	%rdx,%r10			# about to borrow %rdx
1241	movdqa	$inout0,0x70(%rsp)
1242
1243	lea	1($ctr),%rax
1244	 lea	2($ctr),%rdx
1245	bswap	%eax
1246	 bswap	%edx
1247	xor	$key0,%eax
1248	 xor	$key0,%edx
1249	pinsrd	\$3,%eax,$inout1
1250	lea	3($ctr),%rax
1251	movdqa	$inout1,0x10(%rsp)
1252	 pinsrd	\$3,%edx,$inout2
1253	bswap	%eax
1254	 mov	%r10,%rdx			# restore %rdx
1255	 lea	4($ctr),%r10
1256	 movdqa	$inout2,0x20(%rsp)
1257	xor	$key0,%eax
1258	 bswap	%r10d
1259	pinsrd	\$3,%eax,$inout3
1260	 xor	$key0,%r10d
1261	movdqa	$inout3,0x30(%rsp)
1262	lea	5($ctr),%r9
1263	 mov	%r10d,0x40+12(%rsp)
1264	bswap	%r9d
1265	 lea	6($ctr),%r10
1266	mov	240($key),$rounds		# key->rounds
1267	xor	$key0,%r9d
1268	 bswap	%r10d
1269	mov	%r9d,0x50+12(%rsp)
1270	 xor	$key0,%r10d
1271	lea	7($ctr),%r9
1272	 mov	%r10d,0x60+12(%rsp)
1273	bswap	%r9d
1274	leaq	OPENSSL_ia32cap_P(%rip),%r10
1275	 mov	4(%r10),%r10d
1276	xor	$key0,%r9d
1277	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
1278	mov	%r9d,0x70+12(%rsp)
1279
1280	$movkey	0x10($key),$rndkey1
1281
1282	movdqa	0x40(%rsp),$inout4
1283	movdqa	0x50(%rsp),$inout5
1284
1285	cmp	\$8,$len		# $len is in blocks
1286	jb	.Lctr32_tail		# short input if ($len<8)
1287
1288	sub	\$6,$len		# $len is biased by -6
1289	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
1290	je	.Lctr32_6x		# [which denotes Atom Silvermont]
1291
1292	lea	0x80($key),$key		# size optimization
1293	sub	\$2,$len		# $len is biased by -8
1294	jmp	.Lctr32_loop8
1295
1296.align	16
1297.Lctr32_6x:
1298	shl	\$4,$rounds
1299	mov	\$48,$rnds_
1300	bswap	$key0
1301	lea	32($key,$rounds),$key	# end of key schedule
1302	sub	%rax,%r10		# twisted $rounds
1303	jmp	.Lctr32_loop6
1304
1305.align	16
1306.Lctr32_loop6:
1307	 add	\$6,$ctr		# next counter value
1308	$movkey	-48($key,$rnds_),$rndkey0
1309	aesenc	$rndkey1,$inout0
1310	 mov	$ctr,%eax
1311	 xor	$key0,%eax
1312	aesenc	$rndkey1,$inout1
1313	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
1314	 lea	1($ctr),%eax
1315	aesenc	$rndkey1,$inout2
1316	 xor	$key0,%eax
1317	 movbe	%eax,`0x10+12`(%rsp)
1318	aesenc	$rndkey1,$inout3
1319	 lea	2($ctr),%eax
1320	 xor	$key0,%eax
1321	aesenc	$rndkey1,$inout4
1322	 movbe	%eax,`0x20+12`(%rsp)
1323	 lea	3($ctr),%eax
1324	aesenc	$rndkey1,$inout5
1325	$movkey	-32($key,$rnds_),$rndkey1
1326	 xor	$key0,%eax
1327
1328	aesenc	$rndkey0,$inout0
1329	 movbe	%eax,`0x30+12`(%rsp)
1330	 lea	4($ctr),%eax
1331	aesenc	$rndkey0,$inout1
1332	 xor	$key0,%eax
1333	 movbe	%eax,`0x40+12`(%rsp)
1334	aesenc	$rndkey0,$inout2
1335	 lea	5($ctr),%eax
1336	 xor	$key0,%eax
1337	aesenc	$rndkey0,$inout3
1338	 movbe	%eax,`0x50+12`(%rsp)
1339	 mov	%r10,%rax		# mov	$rnds_,$rounds
1340	aesenc	$rndkey0,$inout4
1341	aesenc	$rndkey0,$inout5
1342	$movkey	-16($key,$rnds_),$rndkey0
1343
1344	call	.Lenc_loop6
1345
1346	movdqu	($inp),$inout6		# load 6 input blocks
1347	movdqu	0x10($inp),$inout7
1348	movdqu	0x20($inp),$in0
1349	movdqu	0x30($inp),$in1
1350	movdqu	0x40($inp),$in2
1351	movdqu	0x50($inp),$in3
1352	lea	0x60($inp),$inp		# $inp+=6*16
1353	$movkey	-64($key,$rnds_),$rndkey1
1354	pxor	$inout0,$inout6		# inp^=E(ctr)
1355	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
1356	pxor	$inout1,$inout7
1357	movaps	0x10(%rsp),$inout1
1358	pxor	$inout2,$in0
1359	movaps	0x20(%rsp),$inout2
1360	pxor	$inout3,$in1
1361	movaps	0x30(%rsp),$inout3
1362	pxor	$inout4,$in2
1363	movaps	0x40(%rsp),$inout4
1364	pxor	$inout5,$in3
1365	movaps	0x50(%rsp),$inout5
1366	movdqu	$inout6,($out)		# store 6 output blocks
1367	movdqu	$inout7,0x10($out)
1368	movdqu	$in0,0x20($out)
1369	movdqu	$in1,0x30($out)
1370	movdqu	$in2,0x40($out)
1371	movdqu	$in3,0x50($out)
1372	lea	0x60($out),$out		# $out+=6*16
1373
1374	sub	\$6,$len
1375	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
1376
1377	add	\$6,$len		# restore real remaining $len
1378	jz	.Lctr32_done		# done if ($len==0)
1379
1380	lea	-48($rnds_),$rounds
1381	lea	-80($key,$rnds_),$key	# restore $key
1382	neg	$rounds
1383	shr	\$4,$rounds		# restore $rounds
1384	jmp	.Lctr32_tail
1385
1386.align	32
1387.Lctr32_loop8:
1388	 add		\$8,$ctr		# next counter value
1389	movdqa		0x60(%rsp),$inout6
1390	aesenc		$rndkey1,$inout0
1391	 mov		$ctr,%r9d
1392	movdqa		0x70(%rsp),$inout7
1393	aesenc		$rndkey1,$inout1
1394	 bswap		%r9d
1395	$movkey		0x20-0x80($key),$rndkey0
1396	aesenc		$rndkey1,$inout2
1397	 xor		$key0,%r9d
1398	 nop
1399	aesenc		$rndkey1,$inout3
1400	 mov		%r9d,0x00+12(%rsp)	# store next counter value
1401	 lea		1($ctr),%r9
1402	aesenc		$rndkey1,$inout4
1403	aesenc		$rndkey1,$inout5
1404	aesenc		$rndkey1,$inout6
1405	aesenc		$rndkey1,$inout7
1406	$movkey		0x30-0x80($key),$rndkey1
1407___
1408for($i=2;$i<8;$i++) {
1409my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1410$code.=<<___;
1411	 bswap		%r9d
1412	aesenc		$rndkeyx,$inout0
1413	aesenc		$rndkeyx,$inout1
1414	 xor		$key0,%r9d
1415	 .byte		0x66,0x90
1416	aesenc		$rndkeyx,$inout2
1417	aesenc		$rndkeyx,$inout3
1418	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
1419	 lea		$i($ctr),%r9
1420	aesenc		$rndkeyx,$inout4
1421	aesenc		$rndkeyx,$inout5
1422	aesenc		$rndkeyx,$inout6
1423	aesenc		$rndkeyx,$inout7
1424	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
1425___
1426}
1427$code.=<<___;
1428	 bswap		%r9d
1429	aesenc		$rndkey0,$inout0
1430	aesenc		$rndkey0,$inout1
1431	aesenc		$rndkey0,$inout2
1432	 xor		$key0,%r9d
1433	 movdqu		0x00($inp),$in0		# start loading input
1434	aesenc		$rndkey0,$inout3
1435	 mov		%r9d,0x70+12(%rsp)
1436	 cmp		\$11,$rounds
1437	aesenc		$rndkey0,$inout4
1438	aesenc		$rndkey0,$inout5
1439	aesenc		$rndkey0,$inout6
1440	aesenc		$rndkey0,$inout7
1441	$movkey		0xa0-0x80($key),$rndkey0
1442
1443	jb		.Lctr32_enc_done
1444
1445	aesenc		$rndkey1,$inout0
1446	aesenc		$rndkey1,$inout1
1447	aesenc		$rndkey1,$inout2
1448	aesenc		$rndkey1,$inout3
1449	aesenc		$rndkey1,$inout4
1450	aesenc		$rndkey1,$inout5
1451	aesenc		$rndkey1,$inout6
1452	aesenc		$rndkey1,$inout7
1453	$movkey		0xb0-0x80($key),$rndkey1
1454
1455	aesenc		$rndkey0,$inout0
1456	aesenc		$rndkey0,$inout1
1457	aesenc		$rndkey0,$inout2
1458	aesenc		$rndkey0,$inout3
1459	aesenc		$rndkey0,$inout4
1460	aesenc		$rndkey0,$inout5
1461	aesenc		$rndkey0,$inout6
1462	aesenc		$rndkey0,$inout7
1463	$movkey		0xc0-0x80($key),$rndkey0
1464	je		.Lctr32_enc_done
1465
1466	aesenc		$rndkey1,$inout0
1467	aesenc		$rndkey1,$inout1
1468	aesenc		$rndkey1,$inout2
1469	aesenc		$rndkey1,$inout3
1470	aesenc		$rndkey1,$inout4
1471	aesenc		$rndkey1,$inout5
1472	aesenc		$rndkey1,$inout6
1473	aesenc		$rndkey1,$inout7
1474	$movkey		0xd0-0x80($key),$rndkey1
1475
1476	aesenc		$rndkey0,$inout0
1477	aesenc		$rndkey0,$inout1
1478	aesenc		$rndkey0,$inout2
1479	aesenc		$rndkey0,$inout3
1480	aesenc		$rndkey0,$inout4
1481	aesenc		$rndkey0,$inout5
1482	aesenc		$rndkey0,$inout6
1483	aesenc		$rndkey0,$inout7
1484	$movkey		0xe0-0x80($key),$rndkey0
1485	jmp		.Lctr32_enc_done
1486
1487.align	16
1488.Lctr32_enc_done:
1489	movdqu		0x10($inp),$in1
1490	pxor		$rndkey0,$in0		# input^=round[last]
1491	movdqu		0x20($inp),$in2
1492	pxor		$rndkey0,$in1
1493	movdqu		0x30($inp),$in3
1494	pxor		$rndkey0,$in2
1495	movdqu		0x40($inp),$in4
1496	pxor		$rndkey0,$in3
1497	movdqu		0x50($inp),$in5
1498	pxor		$rndkey0,$in4
1499	pxor		$rndkey0,$in5
1500	aesenc		$rndkey1,$inout0
1501	aesenc		$rndkey1,$inout1
1502	aesenc		$rndkey1,$inout2
1503	aesenc		$rndkey1,$inout3
1504	aesenc		$rndkey1,$inout4
1505	aesenc		$rndkey1,$inout5
1506	aesenc		$rndkey1,$inout6
1507	aesenc		$rndkey1,$inout7
1508	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
1509	lea		0x80($inp),$inp		# $inp+=8*16
1510
1511	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
1512	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
1513	movdqu		0x70-0x80($inp),$in0
1514	aesenclast	$in1,$inout1
1515	pxor		$rndkey0,$in0
1516	movdqa		0x00(%rsp),$in1		# load next counter block
1517	aesenclast	$in2,$inout2
1518	aesenclast	$in3,$inout3
1519	movdqa		0x10(%rsp),$in2
1520	movdqa		0x20(%rsp),$in3
1521	aesenclast	$in4,$inout4
1522	aesenclast	$in5,$inout5
1523	movdqa		0x30(%rsp),$in4
1524	movdqa		0x40(%rsp),$in5
1525	aesenclast	$rndkey1,$inout6
1526	movdqa		0x50(%rsp),$rndkey0
1527	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
1528	aesenclast	$in0,$inout7
1529
1530	movups		$inout0,($out)		# store 8 output blocks
1531	movdqa		$in1,$inout0
1532	movups		$inout1,0x10($out)
1533	movdqa		$in2,$inout1
1534	movups		$inout2,0x20($out)
1535	movdqa		$in3,$inout2
1536	movups		$inout3,0x30($out)
1537	movdqa		$in4,$inout3
1538	movups		$inout4,0x40($out)
1539	movdqa		$in5,$inout4
1540	movups		$inout5,0x50($out)
1541	movdqa		$rndkey0,$inout5
1542	movups		$inout6,0x60($out)
1543	movups		$inout7,0x70($out)
1544	lea		0x80($out),$out		# $out+=8*16
1545
1546	sub	\$8,$len
1547	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
1548
	add	\$8,$len			# restore real remaining $len
1550	jz	.Lctr32_done			# done if ($len==0)
1551	lea	-0x80($key),$key
1552
1553.Lctr32_tail:
1554	# note that at this point $inout0..5 are populated with
1555	# counter values xor-ed with 0-round key
1556	lea	16($key),$key
1557	cmp	\$4,$len
1558	jb	.Lctr32_loop3
1559	je	.Lctr32_loop4
1560
1561	# if ($len>4) compute 7 E(counter)
1562	shl		\$4,$rounds
1563	movdqa		0x60(%rsp),$inout6
1564	pxor		$inout7,$inout7
1565
1566	$movkey		16($key),$rndkey0
1567	aesenc		$rndkey1,$inout0
1568	aesenc		$rndkey1,$inout1
1569	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1570	neg		%rax
1571	aesenc		$rndkey1,$inout2
1572	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1573	 movups		($inp),$in0
1574	aesenc		$rndkey1,$inout3
1575	aesenc		$rndkey1,$inout4
1576	 movups		0x10($inp),$in1		# pre-load input
1577	 movups		0x20($inp),$in2
1578	aesenc		$rndkey1,$inout5
1579	aesenc		$rndkey1,$inout6
1580
1581	call            .Lenc_loop8_enter
1582
1583	movdqu	0x30($inp),$in3
1584	pxor	$in0,$inout0
1585	movdqu	0x40($inp),$in0
1586	pxor	$in1,$inout1
1587	movdqu	$inout0,($out)			# store output
1588	pxor	$in2,$inout2
1589	movdqu	$inout1,0x10($out)
1590	pxor	$in3,$inout3
1591	movdqu	$inout2,0x20($out)
1592	pxor	$in0,$inout4
1593	movdqu	$inout3,0x30($out)
1594	movdqu	$inout4,0x40($out)
1595	cmp	\$6,$len
1596	jb	.Lctr32_done			# $len was 5, stop store
1597
1598	movups	0x50($inp),$in1
1599	xorps	$in1,$inout5
1600	movups	$inout5,0x50($out)
1601	je	.Lctr32_done			# $len was 6, stop store
1602
1603	movups	0x60($inp),$in2
1604	xorps	$in2,$inout6
1605	movups	$inout6,0x60($out)
1606	jmp	.Lctr32_done			# $len was 7, stop store
1607
1608.align	32
1609.Lctr32_loop4:
1610	aesenc		$rndkey1,$inout0
1611	lea		16($key),$key
1612	dec		$rounds
1613	aesenc		$rndkey1,$inout1
1614	aesenc		$rndkey1,$inout2
1615	aesenc		$rndkey1,$inout3
1616	$movkey		($key),$rndkey1
1617	jnz		.Lctr32_loop4
1618	aesenclast	$rndkey1,$inout0
1619	aesenclast	$rndkey1,$inout1
1620	 movups		($inp),$in0		# load input
1621	 movups		0x10($inp),$in1
1622	aesenclast	$rndkey1,$inout2
1623	aesenclast	$rndkey1,$inout3
1624	 movups		0x20($inp),$in2
1625	 movups		0x30($inp),$in3
1626
1627	xorps	$in0,$inout0
1628	movups	$inout0,($out)			# store output
1629	xorps	$in1,$inout1
1630	movups	$inout1,0x10($out)
1631	pxor	$in2,$inout2
1632	movdqu	$inout2,0x20($out)
1633	pxor	$in3,$inout3
1634	movdqu	$inout3,0x30($out)
1635	jmp	.Lctr32_done			# $len was 4, stop store
1636
1637.align	32
1638.Lctr32_loop3:
1639	aesenc		$rndkey1,$inout0
1640	lea		16($key),$key
1641	dec		$rounds
1642	aesenc		$rndkey1,$inout1
1643	aesenc		$rndkey1,$inout2
1644	$movkey		($key),$rndkey1
1645	jnz		.Lctr32_loop3
1646	aesenclast	$rndkey1,$inout0
1647	aesenclast	$rndkey1,$inout1
1648	aesenclast	$rndkey1,$inout2
1649
1650	movups	($inp),$in0			# load input
1651	xorps	$in0,$inout0
1652	movups	$inout0,($out)			# store output
1653	cmp	\$2,$len
1654	jb	.Lctr32_done			# $len was 1, stop store
1655
1656	movups	0x10($inp),$in1
1657	xorps	$in1,$inout1
1658	movups	$inout1,0x10($out)
1659	je	.Lctr32_done			# $len was 2, stop store
1660
1661	movups	0x20($inp),$in2
1662	xorps	$in2,$inout2
1663	movups	$inout2,0x20($out)		# $len was 3, stop store
1664
1665.Lctr32_done:
	xorps	%xmm0,%xmm0			# clear register bank
1667	xor	$key0,$key0
1668	pxor	%xmm1,%xmm1
1669	pxor	%xmm2,%xmm2
1670	pxor	%xmm3,%xmm3
1671	pxor	%xmm4,%xmm4
1672	pxor	%xmm5,%xmm5
1673___
1674$code.=<<___ if (!$win64);
1675	pxor	%xmm6,%xmm6
1676	pxor	%xmm7,%xmm7
1677	movaps	%xmm0,0x00(%rsp)		# clear stack
1678	pxor	%xmm8,%xmm8
1679	movaps	%xmm0,0x10(%rsp)
1680	pxor	%xmm9,%xmm9
1681	movaps	%xmm0,0x20(%rsp)
1682	pxor	%xmm10,%xmm10
1683	movaps	%xmm0,0x30(%rsp)
1684	pxor	%xmm11,%xmm11
1685	movaps	%xmm0,0x40(%rsp)
1686	pxor	%xmm12,%xmm12
1687	movaps	%xmm0,0x50(%rsp)
1688	pxor	%xmm13,%xmm13
1689	movaps	%xmm0,0x60(%rsp)
1690	pxor	%xmm14,%xmm14
1691	movaps	%xmm0,0x70(%rsp)
1692	pxor	%xmm15,%xmm15
1693___
1694$code.=<<___ if ($win64);
1695	movaps	-0xa8($key_),%xmm6
1696	movaps	%xmm0,-0xa8($key_)		# clear stack
1697	movaps	-0x98($key_),%xmm7
1698	movaps	%xmm0,-0x98($key_)
1699	movaps	-0x88($key_),%xmm8
1700	movaps	%xmm0,-0x88($key_)
1701	movaps	-0x78($key_),%xmm9
1702	movaps	%xmm0,-0x78($key_)
1703	movaps	-0x68($key_),%xmm10
1704	movaps	%xmm0,-0x68($key_)
1705	movaps	-0x58($key_),%xmm11
1706	movaps	%xmm0,-0x58($key_)
1707	movaps	-0x48($key_),%xmm12
1708	movaps	%xmm0,-0x48($key_)
1709	movaps	-0x38($key_),%xmm13
1710	movaps	%xmm0,-0x38($key_)
1711	movaps	-0x28($key_),%xmm14
1712	movaps	%xmm0,-0x28($key_)
1713	movaps	-0x18($key_),%xmm15
1714	movaps	%xmm0,-0x18($key_)
1715	movaps	%xmm0,0x00(%rsp)
1716	movaps	%xmm0,0x10(%rsp)
1717	movaps	%xmm0,0x20(%rsp)
1718	movaps	%xmm0,0x30(%rsp)
1719	movaps	%xmm0,0x40(%rsp)
1720	movaps	%xmm0,0x50(%rsp)
1721	movaps	%xmm0,0x60(%rsp)
1722	movaps	%xmm0,0x70(%rsp)
1723___
1724$code.=<<___;
1725	mov	-8($key_),%rbp
1726	lea	($key_),%rsp
1727.Lctr32_epilogue:
1728	ret
1729.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1730___
1731}
1732
1733######################################################################
1734# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1735#	const AES_KEY *key1, const AES_KEY *key2
1736#	const unsigned char iv[16]);
1737#
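# The per-block tweak update is multiplication by x in GF(2^128)
# modulo x^128+x^7+x^2+x+1; a scalar C reference of that step
# (illustration only, the code below uses a vectorized equivalent):
#
#	static void xts_mul_x(unsigned char t[16])	/* LE tweak */
#	{
#		unsigned int carry = t[15] >> 7;
#		int i;
#		for (i = 15; i > 0; i--)
#			t[i] = (unsigned char)((t[i] << 1) | (t[i-1] >> 7));
#		t[0] = (unsigned char)(t[0] << 1);
#		if (carry)
#			t[0] ^= 0x87;
#	}
#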
1738{
1739my @tweak=map("%xmm$_",(10..15));
1740my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1741my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1742my $frame_size = 0x70 + ($win64?160:0);
1743my $key_ = "%rbp";	# override so that we can use %r11 as FP
1744
1745$code.=<<___;
1746.globl	aesni_xts_encrypt
1747.type	aesni_xts_encrypt,\@function,6
1748.align	16
1749aesni_xts_encrypt:
1750	lea	(%rsp),%r11			# frame pointer
1751	push	%rbp
1752	sub	\$$frame_size,%rsp
1753	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1754___
1755$code.=<<___ if ($win64);
1756	movaps	%xmm6,-0xa8(%r11)		# offload everything
1757	movaps	%xmm7,-0x98(%r11)
1758	movaps	%xmm8,-0x88(%r11)
1759	movaps	%xmm9,-0x78(%r11)
1760	movaps	%xmm10,-0x68(%r11)
1761	movaps	%xmm11,-0x58(%r11)
1762	movaps	%xmm12,-0x48(%r11)
1763	movaps	%xmm13,-0x38(%r11)
1764	movaps	%xmm14,-0x28(%r11)
1765	movaps	%xmm15,-0x18(%r11)
1766.Lxts_enc_body:
1767___
1768$code.=<<___;
1769	movups	($ivp),$inout0			# load clear-text tweak
1770	mov	240(%r8),$rounds		# key2->rounds
1771	mov	240($key),$rnds_		# key1->rounds
1772___
1773	# generate the tweak
1774	&aesni_generate1("enc",$key2,$rounds,$inout0);
1775$code.=<<___;
1776	$movkey	($key),$rndkey0			# zero round key
1777	mov	$key,$key_			# backup $key
1778	mov	$rnds_,$rounds			# backup $rounds
1779	shl	\$4,$rnds_
1780	mov	$len,$len_			# backup $len
1781	and	\$-16,$len
1782
1783	$movkey	16($key,$rnds_),$rndkey1	# last round key
1784
1785	movdqa	.Lxts_magic(%rip),$twmask
1786	movdqa	$inout0,@tweak[5]
1787	pshufd	\$0x5f,$inout0,$twres
1788	pxor	$rndkey0,$rndkey1
1789___
1790    # alternative tweak calculation algorithm is based on suggestions
1791    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1792    # and should help in the future...
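    # In the sequence below paddq doubles the tweak while the psrad
    # sign-broadcast AND-ed with .Lxts_magic supplies the carry and the
    # 0x87 reduction that pxor folds back in; each @tweak[i] is also
    # stored with round[0] pre-xor-ed, so "input^tweak" in the
    # grandloop doubles as the first AddRoundKey.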
1793    for ($i=0;$i<4;$i++) {
1794    $code.=<<___;
1795	movdqa	$twres,$twtmp
1796	paddd	$twres,$twres
1797	movdqa	@tweak[5],@tweak[$i]
1798	psrad	\$31,$twtmp			# broadcast upper bits
1799	paddq	@tweak[5],@tweak[5]
1800	pand	$twmask,$twtmp
1801	pxor	$rndkey0,@tweak[$i]
1802	pxor	$twtmp,@tweak[5]
1803___
1804    }
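    # Note: the loop above is the vectorized form of the doubling in
    # _xts_mul_alpha_ref: pshufd/psrad broadcast the top bit of each
    # 64-bit half into a mask, pand against .Lxts_magic turns that mask
    # into the reduction/carry term, and paddq+pxor double the tweak
    # and fold the correction in.  @tweak[0..3] also come out pre-XORed
    # with round[0], so the data XOR and the first AddRoundKey merge;
    # later each tweak is re-XORed with round[0]^round[last] (saved
    # just below) so that aesenclast's memory operand applies the last
    # round key and the output tweak XOR in a single step.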
1805$code.=<<___;
1806	movdqa	@tweak[5],@tweak[4]
1807	psrad	\$31,$twres
1808	paddq	@tweak[5],@tweak[5]
1809	pand	$twmask,$twres
1810	pxor	$rndkey0,@tweak[4]
1811	pxor	$twres,@tweak[5]
1812	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
1813
1814	sub	\$16*6,$len
1815	jc	.Lxts_enc_short			# if $len-=6*16 borrowed
1816
1817	mov	\$16+96,$rounds
1818	lea	32($key_,$rnds_),$key		# end of key schedule
1819	sub	%r10,%rax			# twisted $rounds
1820	$movkey	16($key_),$rndkey1
1821	mov	%rax,%r10			# backup twisted $rounds
1822	lea	.Lxts_magic(%rip),%r8
1823	jmp	.Lxts_enc_grandloop
1824
1825.align	32
1826.Lxts_enc_grandloop:
1827	movdqu	`16*0`($inp),$inout0		# load input
1828	movdqa	$rndkey0,$twmask
1829	movdqu	`16*1`($inp),$inout1
1830	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
1831	movdqu	`16*2`($inp),$inout2
1832	pxor	@tweak[1],$inout1
1833	 aesenc		$rndkey1,$inout0
1834	movdqu	`16*3`($inp),$inout3
1835	pxor	@tweak[2],$inout2
1836	 aesenc		$rndkey1,$inout1
1837	movdqu	`16*4`($inp),$inout4
1838	pxor	@tweak[3],$inout3
1839	 aesenc		$rndkey1,$inout2
1840	movdqu	`16*5`($inp),$inout5
1841	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
1842	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
1843	pxor	@tweak[4],$inout4
1844	 aesenc		$rndkey1,$inout3
1845	$movkey	32($key_),$rndkey0
1846	lea	`16*6`($inp),$inp
1847	pxor	$twmask,$inout5
1848
	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
1850	aesenc		$rndkey1,$inout4
1851	 pxor	$twres,@tweak[1]
1852	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
1853	aesenc		$rndkey1,$inout5
1854	$movkey		48($key_),$rndkey1
1855	 pxor	$twres,@tweak[2]
1856
1857	aesenc		$rndkey0,$inout0
1858	 pxor	$twres,@tweak[3]
1859	 movdqa	@tweak[1],`16*1`(%rsp)
1860	aesenc		$rndkey0,$inout1
1861	 pxor	$twres,@tweak[4]
1862	 movdqa	@tweak[2],`16*2`(%rsp)
1863	aesenc		$rndkey0,$inout2
1864	aesenc		$rndkey0,$inout3
1865	 pxor	$twres,$twmask
1866	 movdqa	@tweak[4],`16*4`(%rsp)
1867	aesenc		$rndkey0,$inout4
1868	aesenc		$rndkey0,$inout5
1869	$movkey		64($key_),$rndkey0
1870	 movdqa	$twmask,`16*5`(%rsp)
1871	pshufd	\$0x5f,@tweak[5],$twres
1872	jmp	.Lxts_enc_loop6
1873.align	32
1874.Lxts_enc_loop6:
1875	aesenc		$rndkey1,$inout0
1876	aesenc		$rndkey1,$inout1
1877	aesenc		$rndkey1,$inout2
1878	aesenc		$rndkey1,$inout3
1879	aesenc		$rndkey1,$inout4
1880	aesenc		$rndkey1,$inout5
1881	$movkey		-64($key,%rax),$rndkey1
1882	add		\$32,%rax
1883
1884	aesenc		$rndkey0,$inout0
1885	aesenc		$rndkey0,$inout1
1886	aesenc		$rndkey0,$inout2
1887	aesenc		$rndkey0,$inout3
1888	aesenc		$rndkey0,$inout4
1889	aesenc		$rndkey0,$inout5
1890	$movkey		-80($key,%rax),$rndkey0
1891	jnz		.Lxts_enc_loop6
1892
1893	movdqa	(%r8),$twmask			# start calculating next tweak
1894	movdqa	$twres,$twtmp
1895	paddd	$twres,$twres
1896	 aesenc		$rndkey1,$inout0
1897	paddq	@tweak[5],@tweak[5]
1898	psrad	\$31,$twtmp
1899	 aesenc		$rndkey1,$inout1
1900	pand	$twmask,$twtmp
1901	$movkey	($key_),@tweak[0]		# load round[0]
1902	 aesenc		$rndkey1,$inout2
1903	 aesenc		$rndkey1,$inout3
1904	 aesenc		$rndkey1,$inout4
1905	pxor	$twtmp,@tweak[5]
1906	movaps	@tweak[0],@tweak[1]		# copy round[0]
1907	 aesenc		$rndkey1,$inout5
1908	 $movkey	-64($key),$rndkey1
1909
1910	movdqa	$twres,$twtmp
1911	 aesenc		$rndkey0,$inout0
1912	paddd	$twres,$twres
1913	pxor	@tweak[5],@tweak[0]
1914	 aesenc		$rndkey0,$inout1
1915	psrad	\$31,$twtmp
1916	paddq	@tweak[5],@tweak[5]
1917	 aesenc		$rndkey0,$inout2
1918	 aesenc		$rndkey0,$inout3
1919	pand	$twmask,$twtmp
1920	movaps	@tweak[1],@tweak[2]
1921	 aesenc		$rndkey0,$inout4
1922	pxor	$twtmp,@tweak[5]
1923	movdqa	$twres,$twtmp
1924	 aesenc		$rndkey0,$inout5
1925	 $movkey	-48($key),$rndkey0
1926
1927	paddd	$twres,$twres
1928	 aesenc		$rndkey1,$inout0
1929	pxor	@tweak[5],@tweak[1]
1930	psrad	\$31,$twtmp
1931	 aesenc		$rndkey1,$inout1
1932	paddq	@tweak[5],@tweak[5]
1933	pand	$twmask,$twtmp
1934	 aesenc		$rndkey1,$inout2
1935	 aesenc		$rndkey1,$inout3
1936	 movdqa	@tweak[3],`16*3`(%rsp)
1937	pxor	$twtmp,@tweak[5]
1938	 aesenc		$rndkey1,$inout4
1939	movaps	@tweak[2],@tweak[3]
1940	movdqa	$twres,$twtmp
1941	 aesenc		$rndkey1,$inout5
1942	 $movkey	-32($key),$rndkey1
1943
1944	paddd	$twres,$twres
1945	 aesenc		$rndkey0,$inout0
1946	pxor	@tweak[5],@tweak[2]
1947	psrad	\$31,$twtmp
1948	 aesenc		$rndkey0,$inout1
1949	paddq	@tweak[5],@tweak[5]
1950	pand	$twmask,$twtmp
1951	 aesenc		$rndkey0,$inout2
1952	 aesenc		$rndkey0,$inout3
1953	 aesenc		$rndkey0,$inout4
1954	pxor	$twtmp,@tweak[5]
1955	movaps	@tweak[3],@tweak[4]
1956	 aesenc		$rndkey0,$inout5
1957
1958	movdqa	$twres,$rndkey0
1959	paddd	$twres,$twres
1960	 aesenc		$rndkey1,$inout0
1961	pxor	@tweak[5],@tweak[3]
1962	psrad	\$31,$rndkey0
1963	 aesenc		$rndkey1,$inout1
1964	paddq	@tweak[5],@tweak[5]
1965	pand	$twmask,$rndkey0
1966	 aesenc		$rndkey1,$inout2
1967	 aesenc		$rndkey1,$inout3
1968	pxor	$rndkey0,@tweak[5]
1969	$movkey		($key_),$rndkey0
1970	 aesenc		$rndkey1,$inout4
1971	 aesenc		$rndkey1,$inout5
1972	$movkey		16($key_),$rndkey1
1973
1974	pxor	@tweak[5],@tweak[4]
1975	 aesenclast	`16*0`(%rsp),$inout0
1976	psrad	\$31,$twres
1977	paddq	@tweak[5],@tweak[5]
1978	 aesenclast	`16*1`(%rsp),$inout1
1979	 aesenclast	`16*2`(%rsp),$inout2
1980	pand	$twmask,$twres
1981	mov	%r10,%rax			# restore $rounds
1982	 aesenclast	`16*3`(%rsp),$inout3
1983	 aesenclast	`16*4`(%rsp),$inout4
1984	 aesenclast	`16*5`(%rsp),$inout5
1985	pxor	$twres,@tweak[5]
1986
1987	lea	`16*6`($out),$out		# $out+=6*16
1988	movups	$inout0,`-16*6`($out)		# store 6 output blocks
1989	movups	$inout1,`-16*5`($out)
1990	movups	$inout2,`-16*4`($out)
1991	movups	$inout3,`-16*3`($out)
1992	movups	$inout4,`-16*2`($out)
1993	movups	$inout5,`-16*1`($out)
1994	sub	\$16*6,$len
1995	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow
1996
1997	mov	\$16+96,$rounds
1998	sub	$rnds_,$rounds
1999	mov	$key_,$key			# restore $key
2000	shr	\$4,$rounds			# restore original value
2001
2002.Lxts_enc_short:
	# at this point @tweak[0..5] are populated with tweak values
2004	mov	$rounds,$rnds_			# backup $rounds
2005	pxor	$rndkey0,@tweak[0]
2006	add	\$16*6,$len			# restore real remaining $len
2007	jz	.Lxts_enc_done			# done if ($len==0)
2008
2009	pxor	$rndkey0,@tweak[1]
2010	cmp	\$0x20,$len
2011	jb	.Lxts_enc_one			# $len is 1*16
2012	pxor	$rndkey0,@tweak[2]
2013	je	.Lxts_enc_two			# $len is 2*16
2014
2015	pxor	$rndkey0,@tweak[3]
2016	cmp	\$0x40,$len
2017	jb	.Lxts_enc_three			# $len is 3*16
2018	pxor	$rndkey0,@tweak[4]
2019	je	.Lxts_enc_four			# $len is 4*16
2020
2021	movdqu	($inp),$inout0			# $len is 5*16
2022	movdqu	16*1($inp),$inout1
2023	movdqu	16*2($inp),$inout2
2024	pxor	@tweak[0],$inout0
2025	movdqu	16*3($inp),$inout3
2026	pxor	@tweak[1],$inout1
2027	movdqu	16*4($inp),$inout4
2028	lea	16*5($inp),$inp			# $inp+=5*16
2029	pxor	@tweak[2],$inout2
2030	pxor	@tweak[3],$inout3
2031	pxor	@tweak[4],$inout4
2032	pxor	$inout5,$inout5
2033
2034	call	_aesni_encrypt6
2035
2036	xorps	@tweak[0],$inout0
2037	movdqa	@tweak[5],@tweak[0]
2038	xorps	@tweak[1],$inout1
2039	xorps	@tweak[2],$inout2
2040	movdqu	$inout0,($out)			# store 5 output blocks
2041	xorps	@tweak[3],$inout3
2042	movdqu	$inout1,16*1($out)
2043	xorps	@tweak[4],$inout4
2044	movdqu	$inout2,16*2($out)
2045	movdqu	$inout3,16*3($out)
2046	movdqu	$inout4,16*4($out)
2047	lea	16*5($out),$out			# $out+=5*16
2048	jmp	.Lxts_enc_done
2049
2050.align	16
2051.Lxts_enc_one:
2052	movups	($inp),$inout0
2053	lea	16*1($inp),$inp			# inp+=1*16
2054	xorps	@tweak[0],$inout0
2055___
2056	&aesni_generate1("enc",$key,$rounds);
2057$code.=<<___;
2058	xorps	@tweak[0],$inout0
2059	movdqa	@tweak[1],@tweak[0]
2060	movups	$inout0,($out)			# store one output block
2061	lea	16*1($out),$out			# $out+=1*16
2062	jmp	.Lxts_enc_done
2063
2064.align	16
2065.Lxts_enc_two:
2066	movups	($inp),$inout0
2067	movups	16($inp),$inout1
2068	lea	32($inp),$inp			# $inp+=2*16
2069	xorps	@tweak[0],$inout0
2070	xorps	@tweak[1],$inout1
2071
2072	call	_aesni_encrypt2
2073
2074	xorps	@tweak[0],$inout0
2075	movdqa	@tweak[2],@tweak[0]
2076	xorps	@tweak[1],$inout1
2077	movups	$inout0,($out)			# store 2 output blocks
2078	movups	$inout1,16*1($out)
2079	lea	16*2($out),$out			# $out+=2*16
2080	jmp	.Lxts_enc_done
2081
2082.align	16
2083.Lxts_enc_three:
2084	movups	($inp),$inout0
2085	movups	16*1($inp),$inout1
2086	movups	16*2($inp),$inout2
2087	lea	16*3($inp),$inp			# $inp+=3*16
2088	xorps	@tweak[0],$inout0
2089	xorps	@tweak[1],$inout1
2090	xorps	@tweak[2],$inout2
2091
2092	call	_aesni_encrypt3
2093
2094	xorps	@tweak[0],$inout0
2095	movdqa	@tweak[3],@tweak[0]
2096	xorps	@tweak[1],$inout1
2097	xorps	@tweak[2],$inout2
2098	movups	$inout0,($out)			# store 3 output blocks
2099	movups	$inout1,16*1($out)
2100	movups	$inout2,16*2($out)
2101	lea	16*3($out),$out			# $out+=3*16
2102	jmp	.Lxts_enc_done
2103
2104.align	16
2105.Lxts_enc_four:
2106	movups	($inp),$inout0
2107	movups	16*1($inp),$inout1
2108	movups	16*2($inp),$inout2
2109	xorps	@tweak[0],$inout0
2110	movups	16*3($inp),$inout3
2111	lea	16*4($inp),$inp			# $inp+=4*16
2112	xorps	@tweak[1],$inout1
2113	xorps	@tweak[2],$inout2
2114	xorps	@tweak[3],$inout3
2115
2116	call	_aesni_encrypt4
2117
2118	pxor	@tweak[0],$inout0
2119	movdqa	@tweak[4],@tweak[0]
2120	pxor	@tweak[1],$inout1
2121	pxor	@tweak[2],$inout2
2122	movdqu	$inout0,($out)			# store 4 output blocks
2123	pxor	@tweak[3],$inout3
2124	movdqu	$inout1,16*1($out)
2125	movdqu	$inout2,16*2($out)
2126	movdqu	$inout3,16*3($out)
2127	lea	16*4($out),$out			# $out+=4*16
2128	jmp	.Lxts_enc_done
2129
2130.align	16
2131.Lxts_enc_done:
2132	and	\$15,$len_			# see if $len%16 is 0
2133	jz	.Lxts_enc_ret
2134	mov	$len_,$len
2135
2136.Lxts_enc_steal:
2137	movzb	($inp),%eax			# borrow $rounds ...
2138	movzb	-16($out),%ecx			# ... and $key
2139	lea	1($inp),$inp
2140	mov	%al,-16($out)
2141	mov	%cl,0($out)
2142	lea	1($out),$out
2143	sub	\$1,$len
2144	jnz	.Lxts_enc_steal
2145
2146	sub	$len_,$out			# rewind $out
2147	mov	$key_,$key			# restore $key
2148	mov	$rnds_,$rounds			# restore $rounds
2149
2150	movups	-16($out),$inout0
2151	xorps	@tweak[0],$inout0
2152___
2153	&aesni_generate1("enc",$key,$rounds);
2154$code.=<<___;
2155	xorps	@tweak[0],$inout0
2156	movups	$inout0,-16($out)
2157
2158.Lxts_enc_ret:
2159	xorps	%xmm0,%xmm0			# clear register bank
2160	pxor	%xmm1,%xmm1
2161	pxor	%xmm2,%xmm2
2162	pxor	%xmm3,%xmm3
2163	pxor	%xmm4,%xmm4
2164	pxor	%xmm5,%xmm5
2165___
2166$code.=<<___ if (!$win64);
2167	pxor	%xmm6,%xmm6
2168	pxor	%xmm7,%xmm7
2169	movaps	%xmm0,0x00(%rsp)		# clear stack
2170	pxor	%xmm8,%xmm8
2171	movaps	%xmm0,0x10(%rsp)
2172	pxor	%xmm9,%xmm9
2173	movaps	%xmm0,0x20(%rsp)
2174	pxor	%xmm10,%xmm10
2175	movaps	%xmm0,0x30(%rsp)
2176	pxor	%xmm11,%xmm11
2177	movaps	%xmm0,0x40(%rsp)
2178	pxor	%xmm12,%xmm12
2179	movaps	%xmm0,0x50(%rsp)
2180	pxor	%xmm13,%xmm13
2181	movaps	%xmm0,0x60(%rsp)
2182	pxor	%xmm14,%xmm14
2183	pxor	%xmm15,%xmm15
2184___
2185$code.=<<___ if ($win64);
2186	movaps	-0xa8(%r11),%xmm6
2187	movaps	%xmm0,-0xa8(%r11)		# clear stack
2188	movaps	-0x98(%r11),%xmm7
2189	movaps	%xmm0,-0x98(%r11)
2190	movaps	-0x88(%r11),%xmm8
2191	movaps	%xmm0,-0x88(%r11)
2192	movaps	-0x78(%r11),%xmm9
2193	movaps	%xmm0,-0x78(%r11)
2194	movaps	-0x68(%r11),%xmm10
2195	movaps	%xmm0,-0x68(%r11)
2196	movaps	-0x58(%r11),%xmm11
2197	movaps	%xmm0,-0x58(%r11)
2198	movaps	-0x48(%r11),%xmm12
2199	movaps	%xmm0,-0x48(%r11)
2200	movaps	-0x38(%r11),%xmm13
2201	movaps	%xmm0,-0x38(%r11)
2202	movaps	-0x28(%r11),%xmm14
2203	movaps	%xmm0,-0x28(%r11)
2204	movaps	-0x18(%r11),%xmm15
2205	movaps	%xmm0,-0x18(%r11)
2206	movaps	%xmm0,0x00(%rsp)
2207	movaps	%xmm0,0x10(%rsp)
2208	movaps	%xmm0,0x20(%rsp)
2209	movaps	%xmm0,0x30(%rsp)
2210	movaps	%xmm0,0x40(%rsp)
2211	movaps	%xmm0,0x50(%rsp)
2212	movaps	%xmm0,0x60(%rsp)
2213___
2214$code.=<<___;
2215	mov	-8(%r11),%rbp
2216	lea	(%r11),%rsp
2217.Lxts_enc_epilogue:
2218	ret
2219.size	aesni_xts_encrypt,.-aesni_xts_encrypt
2220___
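
# Reference model of the ciphertext-stealing tail handled at
# .Lxts_enc_steal above (illustration only, never called): the m
# trailing plaintext bytes displace the first m bytes of the last full
# ciphertext block, the displaced bytes become the short final block,
# and the reassembled block is re-encrypted with the following tweak.
# $enc1 stands for an assumed single-block encrypt coderef for key1.
sub _xts_enc_steal_ref {
	my ($enc1,$tweak,$c_prev,$p_tail) = @_;	# $c_prev: 16 bytes, $p_tail: 1..15 bytes
	my $m = length($p_tail);
	my $c_last = substr($c_prev,0,$m);		# short final ciphertext block
	my $block  = $p_tail . substr($c_prev,$m);	# P_tail || stolen ciphertext
	my $c_prev_new = $enc1->($block ^ $tweak) ^ $tweak;
	return ($c_prev_new,$c_last);
}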
2221
2222$code.=<<___;
2223.globl	aesni_xts_decrypt
2224.type	aesni_xts_decrypt,\@function,6
2225.align	16
2226aesni_xts_decrypt:
2227	lea	(%rsp),%r11			# frame pointer
2228	push	%rbp
2229	sub	\$$frame_size,%rsp
2230	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2231___
2232$code.=<<___ if ($win64);
2233	movaps	%xmm6,-0xa8(%r11)		# offload everything
2234	movaps	%xmm7,-0x98(%r11)
2235	movaps	%xmm8,-0x88(%r11)
2236	movaps	%xmm9,-0x78(%r11)
2237	movaps	%xmm10,-0x68(%r11)
2238	movaps	%xmm11,-0x58(%r11)
2239	movaps	%xmm12,-0x48(%r11)
2240	movaps	%xmm13,-0x38(%r11)
2241	movaps	%xmm14,-0x28(%r11)
2242	movaps	%xmm15,-0x18(%r11)
2243.Lxts_dec_body:
2244___
2245$code.=<<___;
2246	movups	($ivp),$inout0			# load clear-text tweak
2247	mov	240($key2),$rounds		# key2->rounds
2248	mov	240($key),$rnds_		# key1->rounds
2249___
2250	# generate the tweak
2251	&aesni_generate1("enc",$key2,$rounds,$inout0);
2252$code.=<<___;
	xor	%eax,%eax			# if ($len%16) len-=16; hold back one full block for ciphertext stealing
2254	test	\$15,$len
2255	setnz	%al
2256	shl	\$4,%rax
2257	sub	%rax,$len
2258
2259	$movkey	($key),$rndkey0			# zero round key
2260	mov	$key,$key_			# backup $key
2261	mov	$rnds_,$rounds			# backup $rounds
2262	shl	\$4,$rnds_
2263	mov	$len,$len_			# backup $len
2264	and	\$-16,$len
2265
2266	$movkey	16($key,$rnds_),$rndkey1	# last round key
2267
2268	movdqa	.Lxts_magic(%rip),$twmask
2269	movdqa	$inout0,@tweak[5]
2270	pshufd	\$0x5f,$inout0,$twres
2271	pxor	$rndkey0,$rndkey1
2272___
2273    for ($i=0;$i<4;$i++) {
2274    $code.=<<___;
2275	movdqa	$twres,$twtmp
2276	paddd	$twres,$twres
2277	movdqa	@tweak[5],@tweak[$i]
2278	psrad	\$31,$twtmp			# broadcast upper bits
2279	paddq	@tweak[5],@tweak[5]
2280	pand	$twmask,$twtmp
2281	pxor	$rndkey0,@tweak[$i]
2282	pxor	$twtmp,@tweak[5]
2283___
2284    }
2285$code.=<<___;
2286	movdqa	@tweak[5],@tweak[4]
2287	psrad	\$31,$twres
2288	paddq	@tweak[5],@tweak[5]
2289	pand	$twmask,$twres
2290	pxor	$rndkey0,@tweak[4]
2291	pxor	$twres,@tweak[5]
2292	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
2293
2294	sub	\$16*6,$len
2295	jc	.Lxts_dec_short			# if $len-=6*16 borrowed
2296
2297	mov	\$16+96,$rounds
2298	lea	32($key_,$rnds_),$key		# end of key schedule
2299	sub	%r10,%rax			# twisted $rounds
2300	$movkey	16($key_),$rndkey1
2301	mov	%rax,%r10			# backup twisted $rounds
2302	lea	.Lxts_magic(%rip),%r8
2303	jmp	.Lxts_dec_grandloop
2304
2305.align	32
2306.Lxts_dec_grandloop:
2307	movdqu	`16*0`($inp),$inout0		# load input
2308	movdqa	$rndkey0,$twmask
2309	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
2311	movdqu	`16*2`($inp),$inout2
2312	pxor	@tweak[1],$inout1
2313	 aesdec		$rndkey1,$inout0
2314	movdqu	`16*3`($inp),$inout3
2315	pxor	@tweak[2],$inout2
2316	 aesdec		$rndkey1,$inout1
2317	movdqu	`16*4`($inp),$inout4
2318	pxor	@tweak[3],$inout3
2319	 aesdec		$rndkey1,$inout2
2320	movdqu	`16*5`($inp),$inout5
2321	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
2322	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
2323	pxor	@tweak[4],$inout4
2324	 aesdec		$rndkey1,$inout3
2325	$movkey	32($key_),$rndkey0
2326	lea	`16*6`($inp),$inp
2327	pxor	$twmask,$inout5
2328
	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
2330	aesdec		$rndkey1,$inout4
2331	 pxor	$twres,@tweak[1]
	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
2333	aesdec		$rndkey1,$inout5
2334	$movkey		48($key_),$rndkey1
2335	 pxor	$twres,@tweak[2]
2336
2337	aesdec		$rndkey0,$inout0
2338	 pxor	$twres,@tweak[3]
2339	 movdqa	@tweak[1],`16*1`(%rsp)
2340	aesdec		$rndkey0,$inout1
2341	 pxor	$twres,@tweak[4]
2342	 movdqa	@tweak[2],`16*2`(%rsp)
2343	aesdec		$rndkey0,$inout2
2344	aesdec		$rndkey0,$inout3
2345	 pxor	$twres,$twmask
2346	 movdqa	@tweak[4],`16*4`(%rsp)
2347	aesdec		$rndkey0,$inout4
2348	aesdec		$rndkey0,$inout5
2349	$movkey		64($key_),$rndkey0
2350	 movdqa	$twmask,`16*5`(%rsp)
2351	pshufd	\$0x5f,@tweak[5],$twres
2352	jmp	.Lxts_dec_loop6
2353.align	32
2354.Lxts_dec_loop6:
2355	aesdec		$rndkey1,$inout0
2356	aesdec		$rndkey1,$inout1
2357	aesdec		$rndkey1,$inout2
2358	aesdec		$rndkey1,$inout3
2359	aesdec		$rndkey1,$inout4
2360	aesdec		$rndkey1,$inout5
2361	$movkey		-64($key,%rax),$rndkey1
2362	add		\$32,%rax
2363
2364	aesdec		$rndkey0,$inout0
2365	aesdec		$rndkey0,$inout1
2366	aesdec		$rndkey0,$inout2
2367	aesdec		$rndkey0,$inout3
2368	aesdec		$rndkey0,$inout4
2369	aesdec		$rndkey0,$inout5
2370	$movkey		-80($key,%rax),$rndkey0
2371	jnz		.Lxts_dec_loop6
2372
2373	movdqa	(%r8),$twmask			# start calculating next tweak
2374	movdqa	$twres,$twtmp
2375	paddd	$twres,$twres
2376	 aesdec		$rndkey1,$inout0
2377	paddq	@tweak[5],@tweak[5]
2378	psrad	\$31,$twtmp
2379	 aesdec		$rndkey1,$inout1
2380	pand	$twmask,$twtmp
2381	$movkey	($key_),@tweak[0]		# load round[0]
2382	 aesdec		$rndkey1,$inout2
2383	 aesdec		$rndkey1,$inout3
2384	 aesdec		$rndkey1,$inout4
2385	pxor	$twtmp,@tweak[5]
2386	movaps	@tweak[0],@tweak[1]		# copy round[0]
2387	 aesdec		$rndkey1,$inout5
2388	 $movkey	-64($key),$rndkey1
2389
2390	movdqa	$twres,$twtmp
2391	 aesdec		$rndkey0,$inout0
2392	paddd	$twres,$twres
2393	pxor	@tweak[5],@tweak[0]
2394	 aesdec		$rndkey0,$inout1
2395	psrad	\$31,$twtmp
2396	paddq	@tweak[5],@tweak[5]
2397	 aesdec		$rndkey0,$inout2
2398	 aesdec		$rndkey0,$inout3
2399	pand	$twmask,$twtmp
2400	movaps	@tweak[1],@tweak[2]
2401	 aesdec		$rndkey0,$inout4
2402	pxor	$twtmp,@tweak[5]
2403	movdqa	$twres,$twtmp
2404	 aesdec		$rndkey0,$inout5
2405	 $movkey	-48($key),$rndkey0
2406
2407	paddd	$twres,$twres
2408	 aesdec		$rndkey1,$inout0
2409	pxor	@tweak[5],@tweak[1]
2410	psrad	\$31,$twtmp
2411	 aesdec		$rndkey1,$inout1
2412	paddq	@tweak[5],@tweak[5]
2413	pand	$twmask,$twtmp
2414	 aesdec		$rndkey1,$inout2
2415	 aesdec		$rndkey1,$inout3
2416	 movdqa	@tweak[3],`16*3`(%rsp)
2417	pxor	$twtmp,@tweak[5]
2418	 aesdec		$rndkey1,$inout4
2419	movaps	@tweak[2],@tweak[3]
2420	movdqa	$twres,$twtmp
2421	 aesdec		$rndkey1,$inout5
2422	 $movkey	-32($key),$rndkey1
2423
2424	paddd	$twres,$twres
2425	 aesdec		$rndkey0,$inout0
2426	pxor	@tweak[5],@tweak[2]
2427	psrad	\$31,$twtmp
2428	 aesdec		$rndkey0,$inout1
2429	paddq	@tweak[5],@tweak[5]
2430	pand	$twmask,$twtmp
2431	 aesdec		$rndkey0,$inout2
2432	 aesdec		$rndkey0,$inout3
2433	 aesdec		$rndkey0,$inout4
2434	pxor	$twtmp,@tweak[5]
2435	movaps	@tweak[3],@tweak[4]
2436	 aesdec		$rndkey0,$inout5
2437
2438	movdqa	$twres,$rndkey0
2439	paddd	$twres,$twres
2440	 aesdec		$rndkey1,$inout0
2441	pxor	@tweak[5],@tweak[3]
2442	psrad	\$31,$rndkey0
2443	 aesdec		$rndkey1,$inout1
2444	paddq	@tweak[5],@tweak[5]
2445	pand	$twmask,$rndkey0
2446	 aesdec		$rndkey1,$inout2
2447	 aesdec		$rndkey1,$inout3
2448	pxor	$rndkey0,@tweak[5]
2449	$movkey		($key_),$rndkey0
2450	 aesdec		$rndkey1,$inout4
2451	 aesdec		$rndkey1,$inout5
2452	$movkey		16($key_),$rndkey1
2453
2454	pxor	@tweak[5],@tweak[4]
2455	 aesdeclast	`16*0`(%rsp),$inout0
2456	psrad	\$31,$twres
2457	paddq	@tweak[5],@tweak[5]
2458	 aesdeclast	`16*1`(%rsp),$inout1
2459	 aesdeclast	`16*2`(%rsp),$inout2
2460	pand	$twmask,$twres
2461	mov	%r10,%rax			# restore $rounds
2462	 aesdeclast	`16*3`(%rsp),$inout3
2463	 aesdeclast	`16*4`(%rsp),$inout4
2464	 aesdeclast	`16*5`(%rsp),$inout5
2465	pxor	$twres,@tweak[5]
2466
2467	lea	`16*6`($out),$out		# $out+=6*16
2468	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2469	movups	$inout1,`-16*5`($out)
2470	movups	$inout2,`-16*4`($out)
2471	movups	$inout3,`-16*3`($out)
2472	movups	$inout4,`-16*2`($out)
2473	movups	$inout5,`-16*1`($out)
2474	sub	\$16*6,$len
2475	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow
2476
2477	mov	\$16+96,$rounds
2478	sub	$rnds_,$rounds
2479	mov	$key_,$key			# restore $key
2480	shr	\$4,$rounds			# restore original value
2481
2482.Lxts_dec_short:
	# at this point @tweak[0..5] are populated with tweak values
2484	mov	$rounds,$rnds_			# backup $rounds
2485	pxor	$rndkey0,@tweak[0]
2486	pxor	$rndkey0,@tweak[1]
2487	add	\$16*6,$len			# restore real remaining $len
2488	jz	.Lxts_dec_done			# done if ($len==0)
2489
2490	pxor	$rndkey0,@tweak[2]
2491	cmp	\$0x20,$len
2492	jb	.Lxts_dec_one			# $len is 1*16
2493	pxor	$rndkey0,@tweak[3]
2494	je	.Lxts_dec_two			# $len is 2*16
2495
2496	pxor	$rndkey0,@tweak[4]
2497	cmp	\$0x40,$len
2498	jb	.Lxts_dec_three			# $len is 3*16
2499	je	.Lxts_dec_four			# $len is 4*16
2500
2501	movdqu	($inp),$inout0			# $len is 5*16
2502	movdqu	16*1($inp),$inout1
2503	movdqu	16*2($inp),$inout2
2504	pxor	@tweak[0],$inout0
2505	movdqu	16*3($inp),$inout3
2506	pxor	@tweak[1],$inout1
2507	movdqu	16*4($inp),$inout4
2508	lea	16*5($inp),$inp			# $inp+=5*16
2509	pxor	@tweak[2],$inout2
2510	pxor	@tweak[3],$inout3
2511	pxor	@tweak[4],$inout4
2512
2513	call	_aesni_decrypt6
2514
2515	xorps	@tweak[0],$inout0
2516	xorps	@tweak[1],$inout1
2517	xorps	@tweak[2],$inout2
2518	movdqu	$inout0,($out)			# store 5 output blocks
2519	xorps	@tweak[3],$inout3
2520	movdqu	$inout1,16*1($out)
2521	xorps	@tweak[4],$inout4
2522	movdqu	$inout2,16*2($out)
2523	 pxor		$twtmp,$twtmp
2524	movdqu	$inout3,16*3($out)
2525	 pcmpgtd	@tweak[5],$twtmp
2526	movdqu	$inout4,16*4($out)
2527	lea	16*5($out),$out			# $out+=5*16
2528	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
2529	and	\$15,$len_
2530	jz	.Lxts_dec_ret
2531
2532	movdqa	@tweak[5],@tweak[0]
2533	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2534	pand	$twmask,@tweak[1]		# isolate carry and residue
2535	pxor	@tweak[5],@tweak[1]
2536	jmp	.Lxts_dec_done2
2537
2538.align	16
2539.Lxts_dec_one:
2540	movups	($inp),$inout0
2541	lea	16*1($inp),$inp			# $inp+=1*16
2542	xorps	@tweak[0],$inout0
2543___
2544	&aesni_generate1("dec",$key,$rounds);
2545$code.=<<___;
2546	xorps	@tweak[0],$inout0
2547	movdqa	@tweak[1],@tweak[0]
2548	movups	$inout0,($out)			# store one output block
2549	movdqa	@tweak[2],@tweak[1]
2550	lea	16*1($out),$out			# $out+=1*16
2551	jmp	.Lxts_dec_done
2552
2553.align	16
2554.Lxts_dec_two:
2555	movups	($inp),$inout0
2556	movups	16($inp),$inout1
2557	lea	32($inp),$inp			# $inp+=2*16
2558	xorps	@tweak[0],$inout0
2559	xorps	@tweak[1],$inout1
2560
2561	call	_aesni_decrypt2
2562
2563	xorps	@tweak[0],$inout0
2564	movdqa	@tweak[2],@tweak[0]
2565	xorps	@tweak[1],$inout1
2566	movdqa	@tweak[3],@tweak[1]
2567	movups	$inout0,($out)			# store 2 output blocks
2568	movups	$inout1,16*1($out)
2569	lea	16*2($out),$out			# $out+=2*16
2570	jmp	.Lxts_dec_done
2571
2572.align	16
2573.Lxts_dec_three:
2574	movups	($inp),$inout0
2575	movups	16*1($inp),$inout1
2576	movups	16*2($inp),$inout2
2577	lea	16*3($inp),$inp			# $inp+=3*16
2578	xorps	@tweak[0],$inout0
2579	xorps	@tweak[1],$inout1
2580	xorps	@tweak[2],$inout2
2581
2582	call	_aesni_decrypt3
2583
2584	xorps	@tweak[0],$inout0
2585	movdqa	@tweak[3],@tweak[0]
2586	xorps	@tweak[1],$inout1
2587	movdqa	@tweak[4],@tweak[1]
2588	xorps	@tweak[2],$inout2
2589	movups	$inout0,($out)			# store 3 output blocks
2590	movups	$inout1,16*1($out)
2591	movups	$inout2,16*2($out)
2592	lea	16*3($out),$out			# $out+=3*16
2593	jmp	.Lxts_dec_done
2594
2595.align	16
2596.Lxts_dec_four:
2597	movups	($inp),$inout0
2598	movups	16*1($inp),$inout1
2599	movups	16*2($inp),$inout2
2600	xorps	@tweak[0],$inout0
2601	movups	16*3($inp),$inout3
2602	lea	16*4($inp),$inp			# $inp+=4*16
2603	xorps	@tweak[1],$inout1
2604	xorps	@tweak[2],$inout2
2605	xorps	@tweak[3],$inout3
2606
2607	call	_aesni_decrypt4
2608
2609	pxor	@tweak[0],$inout0
2610	movdqa	@tweak[4],@tweak[0]
2611	pxor	@tweak[1],$inout1
2612	movdqa	@tweak[5],@tweak[1]
2613	pxor	@tweak[2],$inout2
2614	movdqu	$inout0,($out)			# store 4 output blocks
2615	pxor	@tweak[3],$inout3
2616	movdqu	$inout1,16*1($out)
2617	movdqu	$inout2,16*2($out)
2618	movdqu	$inout3,16*3($out)
2619	lea	16*4($out),$out			# $out+=4*16
2620	jmp	.Lxts_dec_done
2621
2622.align	16
2623.Lxts_dec_done:
2624	and	\$15,$len_			# see if $len%16 is 0
2625	jz	.Lxts_dec_ret
2626.Lxts_dec_done2:
2627	mov	$len_,$len
2628	mov	$key_,$key			# restore $key
2629	mov	$rnds_,$rounds			# restore $rounds
2630
2631	movups	($inp),$inout0
2632	xorps	@tweak[1],$inout0
2633___
2634	&aesni_generate1("dec",$key,$rounds);
2635$code.=<<___;
2636	xorps	@tweak[1],$inout0
2637	movups	$inout0,($out)
2638
2639.Lxts_dec_steal:
2640	movzb	16($inp),%eax			# borrow $rounds ...
2641	movzb	($out),%ecx			# ... and $key
2642	lea	1($inp),$inp
2643	mov	%al,($out)
2644	mov	%cl,16($out)
2645	lea	1($out),$out
2646	sub	\$1,$len
2647	jnz	.Lxts_dec_steal
2648
2649	sub	$len_,$out			# rewind $out
2650	mov	$key_,$key			# restore $key
2651	mov	$rnds_,$rounds			# restore $rounds
2652
2653	movups	($out),$inout0
2654	xorps	@tweak[0],$inout0
2655___
2656	&aesni_generate1("dec",$key,$rounds);
2657$code.=<<___;
2658	xorps	@tweak[0],$inout0
2659	movups	$inout0,($out)
2660
2661.Lxts_dec_ret:
2662	xorps	%xmm0,%xmm0			# clear register bank
2663	pxor	%xmm1,%xmm1
2664	pxor	%xmm2,%xmm2
2665	pxor	%xmm3,%xmm3
2666	pxor	%xmm4,%xmm4
2667	pxor	%xmm5,%xmm5
2668___
2669$code.=<<___ if (!$win64);
2670	pxor	%xmm6,%xmm6
2671	pxor	%xmm7,%xmm7
2672	movaps	%xmm0,0x00(%rsp)		# clear stack
2673	pxor	%xmm8,%xmm8
2674	movaps	%xmm0,0x10(%rsp)
2675	pxor	%xmm9,%xmm9
2676	movaps	%xmm0,0x20(%rsp)
2677	pxor	%xmm10,%xmm10
2678	movaps	%xmm0,0x30(%rsp)
2679	pxor	%xmm11,%xmm11
2680	movaps	%xmm0,0x40(%rsp)
2681	pxor	%xmm12,%xmm12
2682	movaps	%xmm0,0x50(%rsp)
2683	pxor	%xmm13,%xmm13
2684	movaps	%xmm0,0x60(%rsp)
2685	pxor	%xmm14,%xmm14
2686	pxor	%xmm15,%xmm15
2687___
2688$code.=<<___ if ($win64);
2689	movaps	-0xa8(%r11),%xmm6
2690	movaps	%xmm0,-0xa8(%r11)		# clear stack
2691	movaps	-0x98(%r11),%xmm7
2692	movaps	%xmm0,-0x98(%r11)
2693	movaps	-0x88(%r11),%xmm8
2694	movaps	%xmm0,-0x88(%r11)
2695	movaps	-0x78(%r11),%xmm9
2696	movaps	%xmm0,-0x78(%r11)
2697	movaps	-0x68(%r11),%xmm10
2698	movaps	%xmm0,-0x68(%r11)
2699	movaps	-0x58(%r11),%xmm11
2700	movaps	%xmm0,-0x58(%r11)
2701	movaps	-0x48(%r11),%xmm12
2702	movaps	%xmm0,-0x48(%r11)
2703	movaps	-0x38(%r11),%xmm13
2704	movaps	%xmm0,-0x38(%r11)
2705	movaps	-0x28(%r11),%xmm14
2706	movaps	%xmm0,-0x28(%r11)
2707	movaps	-0x18(%r11),%xmm15
2708	movaps	%xmm0,-0x18(%r11)
2709	movaps	%xmm0,0x00(%rsp)
2710	movaps	%xmm0,0x10(%rsp)
2711	movaps	%xmm0,0x20(%rsp)
2712	movaps	%xmm0,0x30(%rsp)
2713	movaps	%xmm0,0x40(%rsp)
2714	movaps	%xmm0,0x50(%rsp)
2715	movaps	%xmm0,0x60(%rsp)
2716___
2717$code.=<<___;
2718	mov	-8(%r11),%rbp
2719	lea	(%r11),%rsp
2720.Lxts_dec_epilogue:
2721	ret
2722.size	aesni_xts_decrypt,.-aesni_xts_decrypt
2723___
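
# Companion reference model for the decrypt tail (illustration only,
# never called): with a partial final block the last full ciphertext
# block is deciphered with the *next* tweak, the recovered bytes are
# swapped with the trailing ciphertext bytes, and the reassembled
# block is deciphered with the current tweak -- which is why the code
# above holds one full block back whenever $len%16 is non-zero.
# $dec1 stands for an assumed single-block decrypt coderef for key1.
sub _xts_dec_steal_ref {
	my ($dec1,$tweak,$tweak_next,$c_prev,$c_tail) = @_;
	my $m  = length($c_tail);
	my $pp = $dec1->($c_prev ^ $tweak_next) ^ $tweak_next;
	my $p_tail = substr($pp,0,$m);			# short final plaintext block
	my $block  = $c_tail . substr($pp,$m);		# C_tail || stolen plaintext
	my $p_prev = $dec1->($block ^ $tweak) ^ $tweak;
	return ($p_prev,$p_tail);
}
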
2724}
2725
2726######################################################################
2727# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2728#	const AES_KEY *key, unsigned int start_block_num,
2729#	unsigned char offset_i[16], const unsigned char L_[][16],
2730#	unsigned char checksum[16]);
2731#
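# Reference model of the OCB bookkeeping implemented below
# (illustration only, never called): offset_i = offset_{i-1} ^
# L_[ntz(i)] -- the bsf instructions below compute ntz(i) -- the
# checksum is the XOR of all plaintext blocks, and each ciphertext
# block is E_key(P_i ^ offset_i) ^ offset_i.  $enc stands for an
# assumed single-block encrypt coderef; $L is a ref to the L_[] table.
#
sub _ocb_ntz_ref {
	my $i = shift;			# block number, starting at 1
	my $n = 0;
	$n++ while ((($i>>$n)&1)==0);	# trailing zero bits, as bsf computes
	return $n;
}
sub _ocb_encrypt_blocks_ref {
	my ($enc,$offset,$checksum,$L,$blknum,@blocks) = @_;
	my @out;
	for my $p (@blocks) {
		$offset ^= $L->[_ocb_ntz_ref($blknum++)];
		$checksum ^= $p;
		push @out, $enc->($p ^ $offset) ^ $offset;
	}
	return ($offset,$checksum,@out);
}
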
2732{
2733my @offset=map("%xmm$_",(10..15));
2734my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2735my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
2736my ($L_p,$checksum_p) = ("%rbx","%rbp");
2737my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
2738my $seventh_arg = $win64 ? 56 : 8;
2739my $blocks = $len;
2740
2741$code.=<<___;
2742.globl	aesni_ocb_encrypt
2743.type	aesni_ocb_encrypt,\@function,6
2744.align	32
2745aesni_ocb_encrypt:
2746	lea	(%rsp),%rax
2747	push	%rbx
2748	push	%rbp
2749	push	%r12
2750	push	%r13
2751	push	%r14
2752___
2753$code.=<<___ if ($win64);
2754	lea	-0xa0(%rsp),%rsp
2755	movaps	%xmm6,0x00(%rsp)		# offload everything
2756	movaps	%xmm7,0x10(%rsp)
2757	movaps	%xmm8,0x20(%rsp)
2758	movaps	%xmm9,0x30(%rsp)
2759	movaps	%xmm10,0x40(%rsp)
2760	movaps	%xmm11,0x50(%rsp)
2761	movaps	%xmm12,0x60(%rsp)
2762	movaps	%xmm13,0x70(%rsp)
2763	movaps	%xmm14,0x80(%rsp)
2764	movaps	%xmm15,0x90(%rsp)
2765.Locb_enc_body:
2766___
2767$code.=<<___;
2768	mov	$seventh_arg(%rax),$L_p		# 7th argument
2769	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
2770
2771	mov	240($key),$rnds_
2772	mov	$key,$key_
2773	shl	\$4,$rnds_
2774	$movkey	($key),$rndkey0l		# round[0]
2775	$movkey	16($key,$rnds_),$rndkey1	# round[last]
2776
2777	movdqu	($offset_p),@offset[5]		# load last offset_i
2778	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
2779	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
2780
2781	mov	\$16+32,$rounds
2782	lea	32($key_,$rnds_),$key
2783	$movkey	16($key_),$rndkey1		# round[1]
2784	sub	%r10,%rax			# twisted $rounds
2785	mov	%rax,%r10			# backup twisted $rounds
2786
2787	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
2788	movdqu	($checksum_p),$checksum		# load checksum
2789
2790	test	\$1,$block_num			# is first block number odd?
2791	jnz	.Locb_enc_odd
2792
2793	bsf	$block_num,$i1
2794	add	\$1,$block_num
2795	shl	\$4,$i1
2796	movdqu	($L_p,$i1),$inout5		# borrow
2797	movdqu	($inp),$inout0
2798	lea	16($inp),$inp
2799
2800	call	__ocb_encrypt1
2801
2802	movdqa	$inout5,@offset[5]
2803	movups	$inout0,($out)
2804	lea	16($out),$out
2805	sub	\$1,$blocks
2806	jz	.Locb_enc_done
2807
2808.Locb_enc_odd:
2809	lea	1($block_num),$i1		# even-numbered blocks
2810	lea	3($block_num),$i3
2811	lea	5($block_num),$i5
2812	lea	6($block_num),$block_num
2813	bsf	$i1,$i1				# ntz(block)
2814	bsf	$i3,$i3
2815	bsf	$i5,$i5
2816	shl	\$4,$i1				# ntz(block) -> table offset
2817	shl	\$4,$i3
2818	shl	\$4,$i5
2819
2820	sub	\$6,$blocks
2821	jc	.Locb_enc_short
2822	jmp	.Locb_enc_grandloop
2823
2824.align	32
2825.Locb_enc_grandloop:
2826	movdqu	`16*0`($inp),$inout0		# load input
2827	movdqu	`16*1`($inp),$inout1
2828	movdqu	`16*2`($inp),$inout2
2829	movdqu	`16*3`($inp),$inout3
2830	movdqu	`16*4`($inp),$inout4
2831	movdqu	`16*5`($inp),$inout5
2832	lea	`16*6`($inp),$inp
2833
2834	call	__ocb_encrypt6
2835
2836	movups	$inout0,`16*0`($out)		# store output
2837	movups	$inout1,`16*1`($out)
2838	movups	$inout2,`16*2`($out)
2839	movups	$inout3,`16*3`($out)
2840	movups	$inout4,`16*4`($out)
2841	movups	$inout5,`16*5`($out)
2842	lea	`16*6`($out),$out
2843	sub	\$6,$blocks
2844	jnc	.Locb_enc_grandloop
2845
2846.Locb_enc_short:
2847	add	\$6,$blocks
2848	jz	.Locb_enc_done
2849
2850	movdqu	`16*0`($inp),$inout0
2851	cmp	\$2,$blocks
2852	jb	.Locb_enc_one
2853	movdqu	`16*1`($inp),$inout1
2854	je	.Locb_enc_two
2855
2856	movdqu	`16*2`($inp),$inout2
2857	cmp	\$4,$blocks
2858	jb	.Locb_enc_three
2859	movdqu	`16*3`($inp),$inout3
2860	je	.Locb_enc_four
2861
2862	movdqu	`16*4`($inp),$inout4
2863	pxor	$inout5,$inout5
2864
2865	call	__ocb_encrypt6
2866
2867	movdqa	@offset[4],@offset[5]
2868	movups	$inout0,`16*0`($out)
2869	movups	$inout1,`16*1`($out)
2870	movups	$inout2,`16*2`($out)
2871	movups	$inout3,`16*3`($out)
2872	movups	$inout4,`16*4`($out)
2873
2874	jmp	.Locb_enc_done
2875
2876.align	16
2877.Locb_enc_one:
2878	movdqa	@offset[0],$inout5		# borrow
2879
2880	call	__ocb_encrypt1
2881
2882	movdqa	$inout5,@offset[5]
2883	movups	$inout0,`16*0`($out)
2884	jmp	.Locb_enc_done
2885
2886.align	16
2887.Locb_enc_two:
2888	pxor	$inout2,$inout2
2889	pxor	$inout3,$inout3
2890
2891	call	__ocb_encrypt4
2892
2893	movdqa	@offset[1],@offset[5]
2894	movups	$inout0,`16*0`($out)
2895	movups	$inout1,`16*1`($out)
2896
2897	jmp	.Locb_enc_done
2898
2899.align	16
2900.Locb_enc_three:
2901	pxor	$inout3,$inout3
2902
2903	call	__ocb_encrypt4
2904
2905	movdqa	@offset[2],@offset[5]
2906	movups	$inout0,`16*0`($out)
2907	movups	$inout1,`16*1`($out)
2908	movups	$inout2,`16*2`($out)
2909
2910	jmp	.Locb_enc_done
2911
2912.align	16
2913.Locb_enc_four:
2914	call	__ocb_encrypt4
2915
2916	movdqa	@offset[3],@offset[5]
2917	movups	$inout0,`16*0`($out)
2918	movups	$inout1,`16*1`($out)
2919	movups	$inout2,`16*2`($out)
2920	movups	$inout3,`16*3`($out)
2921
2922.Locb_enc_done:
2923	pxor	$rndkey0,@offset[5]		# "remove" round[last]
2924	movdqu	$checksum,($checksum_p)		# store checksum
2925	movdqu	@offset[5],($offset_p)		# store last offset_i
2926
2927	xorps	%xmm0,%xmm0			# clear register bank
2928	pxor	%xmm1,%xmm1
2929	pxor	%xmm2,%xmm2
2930	pxor	%xmm3,%xmm3
2931	pxor	%xmm4,%xmm4
2932	pxor	%xmm5,%xmm5
2933___
2934$code.=<<___ if (!$win64);
2935	pxor	%xmm6,%xmm6
2936	pxor	%xmm7,%xmm7
2937	pxor	%xmm8,%xmm8
2938	pxor	%xmm9,%xmm9
2939	pxor	%xmm10,%xmm10
2940	pxor	%xmm11,%xmm11
2941	pxor	%xmm12,%xmm12
2942	pxor	%xmm13,%xmm13
2943	pxor	%xmm14,%xmm14
2944	pxor	%xmm15,%xmm15
2945	lea	0x28(%rsp),%rax
2946___
2947$code.=<<___ if ($win64);
2948	movaps	0x00(%rsp),%xmm6
2949	movaps	%xmm0,0x00(%rsp)		# clear stack
2950	movaps	0x10(%rsp),%xmm7
2951	movaps	%xmm0,0x10(%rsp)
2952	movaps	0x20(%rsp),%xmm8
2953	movaps	%xmm0,0x20(%rsp)
2954	movaps	0x30(%rsp),%xmm9
2955	movaps	%xmm0,0x30(%rsp)
2956	movaps	0x40(%rsp),%xmm10
2957	movaps	%xmm0,0x40(%rsp)
2958	movaps	0x50(%rsp),%xmm11
2959	movaps	%xmm0,0x50(%rsp)
2960	movaps	0x60(%rsp),%xmm12
2961	movaps	%xmm0,0x60(%rsp)
2962	movaps	0x70(%rsp),%xmm13
2963	movaps	%xmm0,0x70(%rsp)
2964	movaps	0x80(%rsp),%xmm14
2965	movaps	%xmm0,0x80(%rsp)
2966	movaps	0x90(%rsp),%xmm15
2967	movaps	%xmm0,0x90(%rsp)
2968	lea	0xa0+0x28(%rsp),%rax
2969.Locb_enc_pop:
2970___
2971$code.=<<___;
2972	mov	-40(%rax),%r14
2973	mov	-32(%rax),%r13
2974	mov	-24(%rax),%r12
2975	mov	-16(%rax),%rbp
2976	mov	-8(%rax),%rbx
2977	lea	(%rax),%rsp
2978.Locb_enc_epilogue:
2979	ret
2980.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt
2981
2982.type	__ocb_encrypt6,\@abi-omnipotent
2983.align	32
2984__ocb_encrypt6:
2985	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
2986	 movdqu		($L_p,$i1),@offset[1]
2987	 movdqa		@offset[0],@offset[2]
2988	 movdqu		($L_p,$i3),@offset[3]
2989	 movdqa		@offset[0],@offset[4]
2990	 pxor		@offset[5],@offset[0]
2991	 movdqu		($L_p,$i5),@offset[5]
2992	 pxor		@offset[0],@offset[1]
2993	pxor		$inout0,$checksum	# accumulate checksum
2994	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
2995	 pxor		@offset[1],@offset[2]
2996	pxor		$inout1,$checksum
2997	pxor		@offset[1],$inout1
2998	 pxor		@offset[2],@offset[3]
2999	pxor		$inout2,$checksum
3000	pxor		@offset[2],$inout2
3001	 pxor		@offset[3],@offset[4]
3002	pxor		$inout3,$checksum
3003	pxor		@offset[3],$inout3
3004	 pxor		@offset[4],@offset[5]
3005	pxor		$inout4,$checksum
3006	pxor		@offset[4],$inout4
3007	pxor		$inout5,$checksum
3008	pxor		@offset[5],$inout5
3009	$movkey		32($key_),$rndkey0
3010
3011	lea		1($block_num),$i1	# even-numbered blocks
3012	lea		3($block_num),$i3
3013	lea		5($block_num),$i5
3014	add		\$6,$block_num
3015	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3016	bsf		$i1,$i1			# ntz(block)
3017	bsf		$i3,$i3
3018	bsf		$i5,$i5
3019
3020	aesenc		$rndkey1,$inout0
3021	aesenc		$rndkey1,$inout1
3022	aesenc		$rndkey1,$inout2
3023	aesenc		$rndkey1,$inout3
3024	 pxor		$rndkey0l,@offset[1]
3025	 pxor		$rndkey0l,@offset[2]
3026	aesenc		$rndkey1,$inout4
3027	 pxor		$rndkey0l,@offset[3]
3028	 pxor		$rndkey0l,@offset[4]
3029	aesenc		$rndkey1,$inout5
3030	$movkey		48($key_),$rndkey1
3031	 pxor		$rndkey0l,@offset[5]
3032
3033	aesenc		$rndkey0,$inout0
3034	aesenc		$rndkey0,$inout1
3035	aesenc		$rndkey0,$inout2
3036	aesenc		$rndkey0,$inout3
3037	aesenc		$rndkey0,$inout4
3038	aesenc		$rndkey0,$inout5
3039	$movkey		64($key_),$rndkey0
3040	shl		\$4,$i1			# ntz(block) -> table offset
3041	shl		\$4,$i3
3042	jmp		.Locb_enc_loop6
3043
3044.align	32
3045.Locb_enc_loop6:
3046	aesenc		$rndkey1,$inout0
3047	aesenc		$rndkey1,$inout1
3048	aesenc		$rndkey1,$inout2
3049	aesenc		$rndkey1,$inout3
3050	aesenc		$rndkey1,$inout4
3051	aesenc		$rndkey1,$inout5
3052	$movkey		($key,%rax),$rndkey1
3053	add		\$32,%rax
3054
3055	aesenc		$rndkey0,$inout0
3056	aesenc		$rndkey0,$inout1
3057	aesenc		$rndkey0,$inout2
3058	aesenc		$rndkey0,$inout3
3059	aesenc		$rndkey0,$inout4
3060	aesenc		$rndkey0,$inout5
3061	$movkey		-16($key,%rax),$rndkey0
3062	jnz		.Locb_enc_loop6
3063
3064	aesenc		$rndkey1,$inout0
3065	aesenc		$rndkey1,$inout1
3066	aesenc		$rndkey1,$inout2
3067	aesenc		$rndkey1,$inout3
3068	aesenc		$rndkey1,$inout4
3069	aesenc		$rndkey1,$inout5
3070	$movkey		16($key_),$rndkey1
3071	shl		\$4,$i5
3072
3073	aesenclast	@offset[0],$inout0
3074	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3075	mov		%r10,%rax		# restore twisted rounds
3076	aesenclast	@offset[1],$inout1
3077	aesenclast	@offset[2],$inout2
3078	aesenclast	@offset[3],$inout3
3079	aesenclast	@offset[4],$inout4
3080	aesenclast	@offset[5],$inout5
3081	ret
3082.size	__ocb_encrypt6,.-__ocb_encrypt6
3083
3084.type	__ocb_encrypt4,\@abi-omnipotent
3085.align	32
3086__ocb_encrypt4:
3087	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3088	 movdqu		($L_p,$i1),@offset[1]
3089	 movdqa		@offset[0],@offset[2]
3090	 movdqu		($L_p,$i3),@offset[3]
3091	 pxor		@offset[5],@offset[0]
3092	 pxor		@offset[0],@offset[1]
3093	pxor		$inout0,$checksum	# accumulate checksum
3094	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3095	 pxor		@offset[1],@offset[2]
3096	pxor		$inout1,$checksum
3097	pxor		@offset[1],$inout1
3098	 pxor		@offset[2],@offset[3]
3099	pxor		$inout2,$checksum
3100	pxor		@offset[2],$inout2
3101	pxor		$inout3,$checksum
3102	pxor		@offset[3],$inout3
3103	$movkey		32($key_),$rndkey0
3104
3105	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3106	 pxor		$rndkey0l,@offset[1]
3107	 pxor		$rndkey0l,@offset[2]
3108	 pxor		$rndkey0l,@offset[3]
3109
3110	aesenc		$rndkey1,$inout0
3111	aesenc		$rndkey1,$inout1
3112	aesenc		$rndkey1,$inout2
3113	aesenc		$rndkey1,$inout3
3114	$movkey		48($key_),$rndkey1
3115
3116	aesenc		$rndkey0,$inout0
3117	aesenc		$rndkey0,$inout1
3118	aesenc		$rndkey0,$inout2
3119	aesenc		$rndkey0,$inout3
3120	$movkey		64($key_),$rndkey0
3121	jmp		.Locb_enc_loop4
3122
3123.align	32
3124.Locb_enc_loop4:
3125	aesenc		$rndkey1,$inout0
3126	aesenc		$rndkey1,$inout1
3127	aesenc		$rndkey1,$inout2
3128	aesenc		$rndkey1,$inout3
3129	$movkey		($key,%rax),$rndkey1
3130	add		\$32,%rax
3131
3132	aesenc		$rndkey0,$inout0
3133	aesenc		$rndkey0,$inout1
3134	aesenc		$rndkey0,$inout2
3135	aesenc		$rndkey0,$inout3
3136	$movkey		-16($key,%rax),$rndkey0
3137	jnz		.Locb_enc_loop4
3138
3139	aesenc		$rndkey1,$inout0
3140	aesenc		$rndkey1,$inout1
3141	aesenc		$rndkey1,$inout2
3142	aesenc		$rndkey1,$inout3
3143	$movkey		16($key_),$rndkey1
3144	mov		%r10,%rax		# restore twisted rounds
3145
3146	aesenclast	@offset[0],$inout0
3147	aesenclast	@offset[1],$inout1
3148	aesenclast	@offset[2],$inout2
3149	aesenclast	@offset[3],$inout3
3150	ret
3151.size	__ocb_encrypt4,.-__ocb_encrypt4
3152
3153.type	__ocb_encrypt1,\@abi-omnipotent
3154.align	32
3155__ocb_encrypt1:
3156	 pxor		@offset[5],$inout5	# offset_i
3157	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3158	pxor		$inout0,$checksum	# accumulate checksum
3159	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3160	$movkey		32($key_),$rndkey0
3161
3162	aesenc		$rndkey1,$inout0
3163	$movkey		48($key_),$rndkey1
3164	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3165
3166	aesenc		$rndkey0,$inout0
3167	$movkey		64($key_),$rndkey0
3168	jmp		.Locb_enc_loop1
3169
3170.align	32
3171.Locb_enc_loop1:
3172	aesenc		$rndkey1,$inout0
3173	$movkey		($key,%rax),$rndkey1
3174	add		\$32,%rax
3175
3176	aesenc		$rndkey0,$inout0
3177	$movkey		-16($key,%rax),$rndkey0
3178	jnz		.Locb_enc_loop1
3179
3180	aesenc		$rndkey1,$inout0
3181	$movkey		16($key_),$rndkey1	# redundant in tail
3182	mov		%r10,%rax		# restore twisted rounds
3183
3184	aesenclast	$inout5,$inout0
3185	ret
3186.size	__ocb_encrypt1,.-__ocb_encrypt1
3187
3188.globl	aesni_ocb_decrypt
3189.type	aesni_ocb_decrypt,\@function,6
3190.align	32
3191aesni_ocb_decrypt:
3192	lea	(%rsp),%rax
3193	push	%rbx
3194	push	%rbp
3195	push	%r12
3196	push	%r13
3197	push	%r14
3198___
3199$code.=<<___ if ($win64);
3200	lea	-0xa0(%rsp),%rsp
3201	movaps	%xmm6,0x00(%rsp)		# offload everything
3202	movaps	%xmm7,0x10(%rsp)
3203	movaps	%xmm8,0x20(%rsp)
3204	movaps	%xmm9,0x30(%rsp)
3205	movaps	%xmm10,0x40(%rsp)
3206	movaps	%xmm11,0x50(%rsp)
3207	movaps	%xmm12,0x60(%rsp)
3208	movaps	%xmm13,0x70(%rsp)
3209	movaps	%xmm14,0x80(%rsp)
3210	movaps	%xmm15,0x90(%rsp)
3211.Locb_dec_body:
3212___
3213$code.=<<___;
3214	mov	$seventh_arg(%rax),$L_p		# 7th argument
3215	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
3216
3217	mov	240($key),$rnds_
3218	mov	$key,$key_
3219	shl	\$4,$rnds_
3220	$movkey	($key),$rndkey0l		# round[0]
3221	$movkey	16($key,$rnds_),$rndkey1	# round[last]
3222
3223	movdqu	($offset_p),@offset[5]		# load last offset_i
3224	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
3225	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
3226
3227	mov	\$16+32,$rounds
3228	lea	32($key_,$rnds_),$key
3229	$movkey	16($key_),$rndkey1		# round[1]
3230	sub	%r10,%rax			# twisted $rounds
3231	mov	%rax,%r10			# backup twisted $rounds
3232
3233	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
3234	movdqu	($checksum_p),$checksum		# load checksum
3235
3236	test	\$1,$block_num			# is first block number odd?
3237	jnz	.Locb_dec_odd
3238
3239	bsf	$block_num,$i1
3240	add	\$1,$block_num
3241	shl	\$4,$i1
3242	movdqu	($L_p,$i1),$inout5		# borrow
3243	movdqu	($inp),$inout0
3244	lea	16($inp),$inp
3245
3246	call	__ocb_decrypt1
3247
3248	movdqa	$inout5,@offset[5]
3249	movups	$inout0,($out)
3250	xorps	$inout0,$checksum		# accumulate checksum
3251	lea	16($out),$out
3252	sub	\$1,$blocks
3253	jz	.Locb_dec_done
3254
3255.Locb_dec_odd:
3256	lea	1($block_num),$i1		# even-numbered blocks
3257	lea	3($block_num),$i3
3258	lea	5($block_num),$i5
3259	lea	6($block_num),$block_num
3260	bsf	$i1,$i1				# ntz(block)
3261	bsf	$i3,$i3
3262	bsf	$i5,$i5
3263	shl	\$4,$i1				# ntz(block) -> table offset
3264	shl	\$4,$i3
3265	shl	\$4,$i5
3266
3267	sub	\$6,$blocks
3268	jc	.Locb_dec_short
3269	jmp	.Locb_dec_grandloop
3270
3271.align	32
3272.Locb_dec_grandloop:
3273	movdqu	`16*0`($inp),$inout0		# load input
3274	movdqu	`16*1`($inp),$inout1
3275	movdqu	`16*2`($inp),$inout2
3276	movdqu	`16*3`($inp),$inout3
3277	movdqu	`16*4`($inp),$inout4
3278	movdqu	`16*5`($inp),$inout5
3279	lea	`16*6`($inp),$inp
3280
3281	call	__ocb_decrypt6
3282
3283	movups	$inout0,`16*0`($out)		# store output
3284	pxor	$inout0,$checksum		# accumulate checksum
3285	movups	$inout1,`16*1`($out)
3286	pxor	$inout1,$checksum
3287	movups	$inout2,`16*2`($out)
3288	pxor	$inout2,$checksum
3289	movups	$inout3,`16*3`($out)
3290	pxor	$inout3,$checksum
3291	movups	$inout4,`16*4`($out)
3292	pxor	$inout4,$checksum
3293	movups	$inout5,`16*5`($out)
3294	pxor	$inout5,$checksum
3295	lea	`16*6`($out),$out
3296	sub	\$6,$blocks
3297	jnc	.Locb_dec_grandloop
3298
3299.Locb_dec_short:
3300	add	\$6,$blocks
3301	jz	.Locb_dec_done
3302
3303	movdqu	`16*0`($inp),$inout0
3304	cmp	\$2,$blocks
3305	jb	.Locb_dec_one
3306	movdqu	`16*1`($inp),$inout1
3307	je	.Locb_dec_two
3308
3309	movdqu	`16*2`($inp),$inout2
3310	cmp	\$4,$blocks
3311	jb	.Locb_dec_three
3312	movdqu	`16*3`($inp),$inout3
3313	je	.Locb_dec_four
3314
3315	movdqu	`16*4`($inp),$inout4
3316	pxor	$inout5,$inout5
3317
3318	call	__ocb_decrypt6
3319
3320	movdqa	@offset[4],@offset[5]
3321	movups	$inout0,`16*0`($out)		# store output
3322	pxor	$inout0,$checksum		# accumulate checksum
3323	movups	$inout1,`16*1`($out)
3324	pxor	$inout1,$checksum
3325	movups	$inout2,`16*2`($out)
3326	pxor	$inout2,$checksum
3327	movups	$inout3,`16*3`($out)
3328	pxor	$inout3,$checksum
3329	movups	$inout4,`16*4`($out)
3330	pxor	$inout4,$checksum
3331
3332	jmp	.Locb_dec_done
3333
3334.align	16
3335.Locb_dec_one:
3336	movdqa	@offset[0],$inout5		# borrow
3337
3338	call	__ocb_decrypt1
3339
3340	movdqa	$inout5,@offset[5]
3341	movups	$inout0,`16*0`($out)		# store output
3342	xorps	$inout0,$checksum		# accumulate checksum
3343	jmp	.Locb_dec_done
3344
3345.align	16
3346.Locb_dec_two:
3347	pxor	$inout2,$inout2
3348	pxor	$inout3,$inout3
3349
3350	call	__ocb_decrypt4
3351
3352	movdqa	@offset[1],@offset[5]
3353	movups	$inout0,`16*0`($out)		# store output
3354	xorps	$inout0,$checksum		# accumulate checksum
3355	movups	$inout1,`16*1`($out)
3356	xorps	$inout1,$checksum
3357
3358	jmp	.Locb_dec_done
3359
3360.align	16
3361.Locb_dec_three:
3362	pxor	$inout3,$inout3
3363
3364	call	__ocb_decrypt4
3365
3366	movdqa	@offset[2],@offset[5]
3367	movups	$inout0,`16*0`($out)		# store output
3368	xorps	$inout0,$checksum		# accumulate checksum
3369	movups	$inout1,`16*1`($out)
3370	xorps	$inout1,$checksum
3371	movups	$inout2,`16*2`($out)
3372	xorps	$inout2,$checksum
3373
3374	jmp	.Locb_dec_done
3375
3376.align	16
3377.Locb_dec_four:
3378	call	__ocb_decrypt4
3379
3380	movdqa	@offset[3],@offset[5]
3381	movups	$inout0,`16*0`($out)		# store output
3382	pxor	$inout0,$checksum		# accumulate checksum
3383	movups	$inout1,`16*1`($out)
3384	pxor	$inout1,$checksum
3385	movups	$inout2,`16*2`($out)
3386	pxor	$inout2,$checksum
3387	movups	$inout3,`16*3`($out)
3388	pxor	$inout3,$checksum
3389
3390.Locb_dec_done:
3391	pxor	$rndkey0,@offset[5]		# "remove" round[last]
3392	movdqu	$checksum,($checksum_p)		# store checksum
3393	movdqu	@offset[5],($offset_p)		# store last offset_i
3394
3395	xorps	%xmm0,%xmm0			# clear register bank
3396	pxor	%xmm1,%xmm1
3397	pxor	%xmm2,%xmm2
3398	pxor	%xmm3,%xmm3
3399	pxor	%xmm4,%xmm4
3400	pxor	%xmm5,%xmm5
3401___
3402$code.=<<___ if (!$win64);
3403	pxor	%xmm6,%xmm6
3404	pxor	%xmm7,%xmm7
3405	pxor	%xmm8,%xmm8
3406	pxor	%xmm9,%xmm9
3407	pxor	%xmm10,%xmm10
3408	pxor	%xmm11,%xmm11
3409	pxor	%xmm12,%xmm12
3410	pxor	%xmm13,%xmm13
3411	pxor	%xmm14,%xmm14
3412	pxor	%xmm15,%xmm15
3413	lea	0x28(%rsp),%rax
3414___
3415$code.=<<___ if ($win64);
3416	movaps	0x00(%rsp),%xmm6
3417	movaps	%xmm0,0x00(%rsp)		# clear stack
3418	movaps	0x10(%rsp),%xmm7
3419	movaps	%xmm0,0x10(%rsp)
3420	movaps	0x20(%rsp),%xmm8
3421	movaps	%xmm0,0x20(%rsp)
3422	movaps	0x30(%rsp),%xmm9
3423	movaps	%xmm0,0x30(%rsp)
3424	movaps	0x40(%rsp),%xmm10
3425	movaps	%xmm0,0x40(%rsp)
3426	movaps	0x50(%rsp),%xmm11
3427	movaps	%xmm0,0x50(%rsp)
3428	movaps	0x60(%rsp),%xmm12
3429	movaps	%xmm0,0x60(%rsp)
3430	movaps	0x70(%rsp),%xmm13
3431	movaps	%xmm0,0x70(%rsp)
3432	movaps	0x80(%rsp),%xmm14
3433	movaps	%xmm0,0x80(%rsp)
3434	movaps	0x90(%rsp),%xmm15
3435	movaps	%xmm0,0x90(%rsp)
3436	lea	0xa0+0x28(%rsp),%rax
3437.Locb_dec_pop:
3438___
3439$code.=<<___;
3440	mov	-40(%rax),%r14
3441	mov	-32(%rax),%r13
3442	mov	-24(%rax),%r12
3443	mov	-16(%rax),%rbp
3444	mov	-8(%rax),%rbx
3445	lea	(%rax),%rsp
3446.Locb_dec_epilogue:
3447	ret
3448.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt
3449
3450.type	__ocb_decrypt6,\@abi-omnipotent
3451.align	32
3452__ocb_decrypt6:
3453	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3454	 movdqu		($L_p,$i1),@offset[1]
3455	 movdqa		@offset[0],@offset[2]
3456	 movdqu		($L_p,$i3),@offset[3]
3457	 movdqa		@offset[0],@offset[4]
3458	 pxor		@offset[5],@offset[0]
3459	 movdqu		($L_p,$i5),@offset[5]
3460	 pxor		@offset[0],@offset[1]
3461	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3462	 pxor		@offset[1],@offset[2]
3463	pxor		@offset[1],$inout1
3464	 pxor		@offset[2],@offset[3]
3465	pxor		@offset[2],$inout2
3466	 pxor		@offset[3],@offset[4]
3467	pxor		@offset[3],$inout3
3468	 pxor		@offset[4],@offset[5]
3469	pxor		@offset[4],$inout4
3470	pxor		@offset[5],$inout5
3471	$movkey		32($key_),$rndkey0
3472
3473	lea		1($block_num),$i1	# even-numbered blocks
3474	lea		3($block_num),$i3
3475	lea		5($block_num),$i5
3476	add		\$6,$block_num
3477	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3478	bsf		$i1,$i1			# ntz(block)
3479	bsf		$i3,$i3
3480	bsf		$i5,$i5
3481
3482	aesdec		$rndkey1,$inout0
3483	aesdec		$rndkey1,$inout1
3484	aesdec		$rndkey1,$inout2
3485	aesdec		$rndkey1,$inout3
3486	 pxor		$rndkey0l,@offset[1]
3487	 pxor		$rndkey0l,@offset[2]
3488	aesdec		$rndkey1,$inout4
3489	 pxor		$rndkey0l,@offset[3]
3490	 pxor		$rndkey0l,@offset[4]
3491	aesdec		$rndkey1,$inout5
3492	$movkey		48($key_),$rndkey1
3493	 pxor		$rndkey0l,@offset[5]
3494
3495	aesdec		$rndkey0,$inout0
3496	aesdec		$rndkey0,$inout1
3497	aesdec		$rndkey0,$inout2
3498	aesdec		$rndkey0,$inout3
3499	aesdec		$rndkey0,$inout4
3500	aesdec		$rndkey0,$inout5
3501	$movkey		64($key_),$rndkey0
3502	shl		\$4,$i1			# ntz(block) -> table offset
3503	shl		\$4,$i3
3504	jmp		.Locb_dec_loop6
3505
3506.align	32
3507.Locb_dec_loop6:
3508	aesdec		$rndkey1,$inout0
3509	aesdec		$rndkey1,$inout1
3510	aesdec		$rndkey1,$inout2
3511	aesdec		$rndkey1,$inout3
3512	aesdec		$rndkey1,$inout4
3513	aesdec		$rndkey1,$inout5
3514	$movkey		($key,%rax),$rndkey1
3515	add		\$32,%rax
3516
3517	aesdec		$rndkey0,$inout0
3518	aesdec		$rndkey0,$inout1
3519	aesdec		$rndkey0,$inout2
3520	aesdec		$rndkey0,$inout3
3521	aesdec		$rndkey0,$inout4
3522	aesdec		$rndkey0,$inout5
3523	$movkey		-16($key,%rax),$rndkey0
3524	jnz		.Locb_dec_loop6
3525
3526	aesdec		$rndkey1,$inout0
3527	aesdec		$rndkey1,$inout1
3528	aesdec		$rndkey1,$inout2
3529	aesdec		$rndkey1,$inout3
3530	aesdec		$rndkey1,$inout4
3531	aesdec		$rndkey1,$inout5
3532	$movkey		16($key_),$rndkey1
3533	shl		\$4,$i5
3534
3535	aesdeclast	@offset[0],$inout0
3536	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3537	mov		%r10,%rax		# restore twisted rounds
3538	aesdeclast	@offset[1],$inout1
3539	aesdeclast	@offset[2],$inout2
3540	aesdeclast	@offset[3],$inout3
3541	aesdeclast	@offset[4],$inout4
3542	aesdeclast	@offset[5],$inout5
3543	ret
3544.size	__ocb_decrypt6,.-__ocb_decrypt6
3545
3546.type	__ocb_decrypt4,\@abi-omnipotent
3547.align	32
3548__ocb_decrypt4:
3549	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3550	 movdqu		($L_p,$i1),@offset[1]
3551	 movdqa		@offset[0],@offset[2]
3552	 movdqu		($L_p,$i3),@offset[3]
3553	 pxor		@offset[5],@offset[0]
3554	 pxor		@offset[0],@offset[1]
3555	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3556	 pxor		@offset[1],@offset[2]
3557	pxor		@offset[1],$inout1
3558	 pxor		@offset[2],@offset[3]
3559	pxor		@offset[2],$inout2
3560	pxor		@offset[3],$inout3
3561	$movkey		32($key_),$rndkey0
3562
3563	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3564	 pxor		$rndkey0l,@offset[1]
3565	 pxor		$rndkey0l,@offset[2]
3566	 pxor		$rndkey0l,@offset[3]
3567
3568	aesdec		$rndkey1,$inout0
3569	aesdec		$rndkey1,$inout1
3570	aesdec		$rndkey1,$inout2
3571	aesdec		$rndkey1,$inout3
3572	$movkey		48($key_),$rndkey1
3573
3574	aesdec		$rndkey0,$inout0
3575	aesdec		$rndkey0,$inout1
3576	aesdec		$rndkey0,$inout2
3577	aesdec		$rndkey0,$inout3
3578	$movkey		64($key_),$rndkey0
3579	jmp		.Locb_dec_loop4
3580
3581.align	32
3582.Locb_dec_loop4:
3583	aesdec		$rndkey1,$inout0
3584	aesdec		$rndkey1,$inout1
3585	aesdec		$rndkey1,$inout2
3586	aesdec		$rndkey1,$inout3
3587	$movkey		($key,%rax),$rndkey1
3588	add		\$32,%rax
3589
3590	aesdec		$rndkey0,$inout0
3591	aesdec		$rndkey0,$inout1
3592	aesdec		$rndkey0,$inout2
3593	aesdec		$rndkey0,$inout3
3594	$movkey		-16($key,%rax),$rndkey0
3595	jnz		.Locb_dec_loop4
3596
3597	aesdec		$rndkey1,$inout0
3598	aesdec		$rndkey1,$inout1
3599	aesdec		$rndkey1,$inout2
3600	aesdec		$rndkey1,$inout3
3601	$movkey		16($key_),$rndkey1
3602	mov		%r10,%rax		# restore twisted rounds
3603
3604	aesdeclast	@offset[0],$inout0
3605	aesdeclast	@offset[1],$inout1
3606	aesdeclast	@offset[2],$inout2
3607	aesdeclast	@offset[3],$inout3
3608	ret
3609.size	__ocb_decrypt4,.-__ocb_decrypt4
3610
3611.type	__ocb_decrypt1,\@abi-omnipotent
3612.align	32
3613__ocb_decrypt1:
3614	 pxor		@offset[5],$inout5	# offset_i
3615	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3616	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3617	$movkey		32($key_),$rndkey0
3618
3619	aesdec		$rndkey1,$inout0
3620	$movkey		48($key_),$rndkey1
3621	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3622
3623	aesdec		$rndkey0,$inout0
3624	$movkey		64($key_),$rndkey0
3625	jmp		.Locb_dec_loop1
3626
3627.align	32
3628.Locb_dec_loop1:
3629	aesdec		$rndkey1,$inout0
3630	$movkey		($key,%rax),$rndkey1
3631	add		\$32,%rax
3632
3633	aesdec		$rndkey0,$inout0
3634	$movkey		-16($key,%rax),$rndkey0
3635	jnz		.Locb_dec_loop1
3636
3637	aesdec		$rndkey1,$inout0
3638	$movkey		16($key_),$rndkey1	# redundant in tail
3639	mov		%r10,%rax		# restore twisted rounds
3640
3641	aesdeclast	$inout5,$inout0
3642	ret
3643.size	__ocb_decrypt1,.-__ocb_decrypt1
3644___
3645} }}
3646
3647########################################################################
3648# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3649#			    size_t length, const AES_KEY *key,
3650#			    unsigned char *ivp,const int enc);
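#
# Reference model of the CBC chaining implemented below (illustration
# only, never called); $cipher stands for an assumed coderef that
# encrypts (for $enc!=0) or decrypts (for $enc==0) one 16-byte block.
#
sub _cbc_ref {
	my ($cipher,$iv,$enc,@blocks) = @_;
	my @out;
	if ($enc) {
		for my $p (@blocks) {	# C_i = E(P_i ^ C_{i-1}), C_0 = IV
			$iv = $cipher->($p ^ $iv);
			push @out, $iv;
		}
	} else {
		for my $c (@blocks) {	# P_i = D(C_i) ^ C_{i-1}
			push @out, $cipher->($c) ^ $iv;
			$iv = $c;
		}
	}
	return ($iv,@out);		# updated IV is the last ciphertext block
}
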
3651{
3652my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
3653my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3654
3655$code.=<<___;
3656.globl	${PREFIX}_cbc_encrypt
3657.type	${PREFIX}_cbc_encrypt,\@function,6
3658.align	16
3659${PREFIX}_cbc_encrypt:
3660	test	$len,$len		# check length
3661	jz	.Lcbc_ret
3662
3663	mov	240($key),$rnds_	# key->rounds
3664	mov	$key,$key_		# backup $key
3665	test	%r9d,%r9d		# 6th argument
3666	jz	.Lcbc_decrypt
3667#--------------------------- CBC ENCRYPT ------------------------------#
3668	movups	($ivp),$inout0		# load iv as initial state
3669	mov	$rnds_,$rounds
3670	cmp	\$16,$len
3671	jb	.Lcbc_enc_tail
3672	sub	\$16,$len
3673	jmp	.Lcbc_enc_loop
3674.align	16
3675.Lcbc_enc_loop:
3676	movups	($inp),$inout1		# load input
3677	lea	16($inp),$inp
3678	#xorps	$inout1,$inout0
3679___
3680	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3681$code.=<<___;
3682	mov	$rnds_,$rounds		# restore $rounds
3683	mov	$key_,$key		# restore $key
3684	movups	$inout0,0($out)		# store output
3685	lea	16($out),$out
3686	sub	\$16,$len
3687	jnc	.Lcbc_enc_loop
3688	add	\$16,$len
3689	jnz	.Lcbc_enc_tail
3690	 pxor	$rndkey0,$rndkey0	# clear register bank
3691	 pxor	$rndkey1,$rndkey1
3692	movups	$inout0,($ivp)
3693	 pxor	$inout0,$inout0
3694	 pxor	$inout1,$inout1
3695	jmp	.Lcbc_ret
3696
3697.Lcbc_enc_tail:
3698	mov	$len,%rcx	# zaps $key
3699	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
3700	.long	0x9066A4F3	# rep movsb
3701	mov	\$16,%ecx	# zero tail
3702	sub	$len,%rcx
3703	xor	%eax,%eax
3704	.long	0x9066AAF3	# rep stosb
3705	lea	-16(%rdi),%rdi	# rewind $out by 1 block
3706	mov	$rnds_,$rounds	# restore $rounds
3707	mov	%rdi,%rsi	# $inp and $out are the same
3708	mov	$key_,$key	# restore $key
3709	xor	$len,$len	# len=16
3710	jmp	.Lcbc_enc_loop	# one more spin
3711#--------------------------- CBC DECRYPT ------------------------------#
3712.align	16
3713.Lcbc_decrypt:
3714	cmp	\$16,$len
3715	jne	.Lcbc_decrypt_bulk
3716
3717	# handle single block without allocating stack frame,
3718	# useful in ciphertext stealing mode
3719	movdqu	($inp),$inout0		# load input
3720	movdqu	($ivp),$inout1		# load iv
3721	movdqa	$inout0,$inout2		# future iv
3722___
3723	&aesni_generate1("dec",$key,$rnds_);
3724$code.=<<___;
3725	 pxor	$rndkey0,$rndkey0	# clear register bank
3726	 pxor	$rndkey1,$rndkey1
3727	movdqu	$inout2,($ivp)		# store iv
3728	xorps	$inout1,$inout0		# ^=iv
3729	 pxor	$inout1,$inout1
3730	movups	$inout0,($out)		# store output
3731	 pxor	$inout0,$inout0
3732	jmp	.Lcbc_ret
3733.align	16
3734.Lcbc_decrypt_bulk:
3735	lea	(%rsp),%r11		# frame pointer
3736	push	%rbp
3737	sub	\$$frame_size,%rsp
3738	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
3739___
3740$code.=<<___ if ($win64);
3741	movaps	%xmm6,0x10(%rsp)
3742	movaps	%xmm7,0x20(%rsp)
3743	movaps	%xmm8,0x30(%rsp)
3744	movaps	%xmm9,0x40(%rsp)
3745	movaps	%xmm10,0x50(%rsp)
3746	movaps	%xmm11,0x60(%rsp)
3747	movaps	%xmm12,0x70(%rsp)
3748	movaps	%xmm13,0x80(%rsp)
3749	movaps	%xmm14,0x90(%rsp)
3750	movaps	%xmm15,0xa0(%rsp)
3751.Lcbc_decrypt_body:
3752___
3753
3754my $inp_=$key_="%rbp";			# reassign $key_
3755
3756$code.=<<___;
3757	mov	$key,$key_		# [re-]backup $key [after reassignment]
3758	movups	($ivp),$iv
3759	mov	$rnds_,$rounds
3760	cmp	\$0x50,$len
3761	jbe	.Lcbc_dec_tail
3762
3763	$movkey	($key),$rndkey0
3764	movdqu	0x00($inp),$inout0	# load input
3765	movdqu	0x10($inp),$inout1
3766	movdqa	$inout0,$in0
3767	movdqu	0x20($inp),$inout2
3768	movdqa	$inout1,$in1
3769	movdqu	0x30($inp),$inout3
3770	movdqa	$inout2,$in2
3771	movdqu	0x40($inp),$inout4
3772	movdqa	$inout3,$in3
3773	movdqu	0x50($inp),$inout5
3774	movdqa	$inout4,$in4
3775	leaq	OPENSSL_ia32cap_P(%rip),%r9
3776	mov	4(%r9),%r9d
3777	cmp	\$0x70,$len
3778	jbe	.Lcbc_dec_six_or_seven
3779
3780	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
3781	sub	\$0x50,$len		# $len is biased by -5*16
3782	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
3783	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
3784	sub	\$0x20,$len		# $len is biased by -7*16
3785	lea	0x70($key),$key		# size optimization
3786	jmp	.Lcbc_dec_loop8_enter
3787.align	16
3788.Lcbc_dec_loop8:
3789	movups	$inout7,($out)
3790	lea	0x10($out),$out
3791.Lcbc_dec_loop8_enter:
3792	movdqu		0x60($inp),$inout6
3793	pxor		$rndkey0,$inout0
3794	movdqu		0x70($inp),$inout7
3795	pxor		$rndkey0,$inout1
3796	$movkey		0x10-0x70($key),$rndkey1
3797	pxor		$rndkey0,$inout2
3798	mov		\$-1,$inp_
3799	cmp		\$0x70,$len	# is there at least 0x60 bytes ahead?
3800	pxor		$rndkey0,$inout3
3801	pxor		$rndkey0,$inout4
3802	pxor		$rndkey0,$inout5
3803	pxor		$rndkey0,$inout6
3804
3805	aesdec		$rndkey1,$inout0
3806	pxor		$rndkey0,$inout7
3807	$movkey		0x20-0x70($key),$rndkey0
3808	aesdec		$rndkey1,$inout1
3809	aesdec		$rndkey1,$inout2
3810	aesdec		$rndkey1,$inout3
3811	aesdec		$rndkey1,$inout4
3812	aesdec		$rndkey1,$inout5
3813	aesdec		$rndkey1,$inout6
3814	adc		\$0,$inp_
3815	and		\$128,$inp_
3816	aesdec		$rndkey1,$inout7
3817	add		$inp,$inp_
3818	$movkey		0x30-0x70($key),$rndkey1
3819___
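# The mov/cmp/adc/and dance on $inp_ above selects the preload pointer
# for the next iteration: $inp_ ends up as $inp+0x80 (the next batch)
# when at least 0x60 more input bytes follow the current eight blocks,
# and as $inp itself otherwise, so the look-ahead loads issued at
# .Lcbc_dec_done never read past the end of the input.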
3820for($i=1;$i<12;$i++) {
3821my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3822$code.=<<___	if ($i==7);
3823	cmp		\$11,$rounds
3824___
3825$code.=<<___;
3826	aesdec		$rndkeyx,$inout0
3827	aesdec		$rndkeyx,$inout1
3828	aesdec		$rndkeyx,$inout2
3829	aesdec		$rndkeyx,$inout3
3830	aesdec		$rndkeyx,$inout4
3831	aesdec		$rndkeyx,$inout5
3832	aesdec		$rndkeyx,$inout6
3833	aesdec		$rndkeyx,$inout7
3834	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
3835___
3836$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
3837	nop
3838___
3839$code.=<<___	if ($i==7);
3840	jb		.Lcbc_dec_done
3841___
3842$code.=<<___	if ($i==9);
3843	je		.Lcbc_dec_done
3844___
3845$code.=<<___	if ($i==11);
3846	jmp		.Lcbc_dec_done
3847___
3848}
3849$code.=<<___;
3850.align	16
3851.Lcbc_dec_done:
3852	aesdec		$rndkey1,$inout0
3853	aesdec		$rndkey1,$inout1
3854	pxor		$rndkey0,$iv
3855	pxor		$rndkey0,$in0
3856	aesdec		$rndkey1,$inout2
3857	aesdec		$rndkey1,$inout3
3858	pxor		$rndkey0,$in1
3859	pxor		$rndkey0,$in2
3860	aesdec		$rndkey1,$inout4
3861	aesdec		$rndkey1,$inout5
3862	pxor		$rndkey0,$in3
3863	pxor		$rndkey0,$in4
3864	aesdec		$rndkey1,$inout6
3865	aesdec		$rndkey1,$inout7
3866	movdqu		0x50($inp),$rndkey1
3867
3868	aesdeclast	$iv,$inout0
3869	movdqu		0x60($inp),$iv		# borrow $iv
3870	pxor		$rndkey0,$rndkey1
3871	aesdeclast	$in0,$inout1
3872	pxor		$rndkey0,$iv
3873	movdqu		0x70($inp),$rndkey0	# next IV
3874	aesdeclast	$in1,$inout2
3875	lea		0x80($inp),$inp
3876	movdqu		0x00($inp_),$in0
3877	aesdeclast	$in2,$inout3
3878	aesdeclast	$in3,$inout4
3879	movdqu		0x10($inp_),$in1
3880	movdqu		0x20($inp_),$in2
3881	aesdeclast	$in4,$inout5
3882	aesdeclast	$rndkey1,$inout6
3883	movdqu		0x30($inp_),$in3
3884	movdqu		0x40($inp_),$in4
3885	aesdeclast	$iv,$inout7
3886	movdqa		$rndkey0,$iv		# return $iv
3887	movdqu		0x50($inp_),$rndkey1
3888	$movkey		-0x70($key),$rndkey0
3889
3890	movups		$inout0,($out)		# store output
3891	movdqa		$in0,$inout0
3892	movups		$inout1,0x10($out)
3893	movdqa		$in1,$inout1
3894	movups		$inout2,0x20($out)
3895	movdqa		$in2,$inout2
3896	movups		$inout3,0x30($out)
3897	movdqa		$in3,$inout3
3898	movups		$inout4,0x40($out)
3899	movdqa		$in4,$inout4
3900	movups		$inout5,0x50($out)
3901	movdqa		$rndkey1,$inout5
3902	movups		$inout6,0x60($out)
3903	lea		0x70($out),$out
3904
3905	sub	\$0x80,$len
3906	ja	.Lcbc_dec_loop8
3907
3908	movaps	$inout7,$inout0
3909	lea	-0x70($key),$key
3910	add	\$0x70,$len
3911	jle	.Lcbc_dec_clear_tail_collected
3912	movups	$inout7,($out)
3913	lea	0x10($out),$out
3914	cmp	\$0x50,$len
3915	jbe	.Lcbc_dec_tail
3916
3917	movaps	$in0,$inout0
3918.Lcbc_dec_six_or_seven:
3919	cmp	\$0x60,$len
3920	ja	.Lcbc_dec_seven
3921
3922	movaps	$inout5,$inout6
3923	call	_aesni_decrypt6
3924	pxor	$iv,$inout0		# ^= IV
3925	movaps	$inout6,$iv
3926	pxor	$in0,$inout1
3927	movdqu	$inout0,($out)
3928	pxor	$in1,$inout2
3929	movdqu	$inout1,0x10($out)
3930	 pxor	$inout1,$inout1		# clear register bank
3931	pxor	$in2,$inout3
3932	movdqu	$inout2,0x20($out)
3933	 pxor	$inout2,$inout2
3934	pxor	$in3,$inout4
3935	movdqu	$inout3,0x30($out)
3936	 pxor	$inout3,$inout3
3937	pxor	$in4,$inout5
3938	movdqu	$inout4,0x40($out)
3939	 pxor	$inout4,$inout4
3940	lea	0x50($out),$out
3941	movdqa	$inout5,$inout0
3942	 pxor	$inout5,$inout5
3943	jmp	.Lcbc_dec_tail_collected
3944
3945.align	16
3946.Lcbc_dec_seven:
3947	movups	0x60($inp),$inout6
3948	xorps	$inout7,$inout7
3949	call	_aesni_decrypt8
3950	movups	0x50($inp),$inout7
3951	pxor	$iv,$inout0		# ^= IV
3952	movups	0x60($inp),$iv
3953	pxor	$in0,$inout1
3954	movdqu	$inout0,($out)
3955	pxor	$in1,$inout2
3956	movdqu	$inout1,0x10($out)
3957	 pxor	$inout1,$inout1		# clear register bank
3958	pxor	$in2,$inout3
3959	movdqu	$inout2,0x20($out)
3960	 pxor	$inout2,$inout2
3961	pxor	$in3,$inout4
3962	movdqu	$inout3,0x30($out)
3963	 pxor	$inout3,$inout3
3964	pxor	$in4,$inout5
3965	movdqu	$inout4,0x40($out)
3966	 pxor	$inout4,$inout4
3967	pxor	$inout7,$inout6
3968	movdqu	$inout5,0x50($out)
3969	 pxor	$inout5,$inout5
3970	lea	0x60($out),$out
3971	movdqa	$inout6,$inout0
3972	 pxor	$inout6,$inout6
3973	 pxor	$inout7,$inout7
3974	jmp	.Lcbc_dec_tail_collected
3975
3976.align	16
3977.Lcbc_dec_loop6:
3978	movups	$inout5,($out)
3979	lea	0x10($out),$out
3980	movdqu	0x00($inp),$inout0	# load input
3981	movdqu	0x10($inp),$inout1
3982	movdqa	$inout0,$in0
3983	movdqu	0x20($inp),$inout2
3984	movdqa	$inout1,$in1
3985	movdqu	0x30($inp),$inout3
3986	movdqa	$inout2,$in2
3987	movdqu	0x40($inp),$inout4
3988	movdqa	$inout3,$in3
3989	movdqu	0x50($inp),$inout5
3990	movdqa	$inout4,$in4
3991.Lcbc_dec_loop6_enter:
3992	lea	0x60($inp),$inp
3993	movdqa	$inout5,$inout6
3994
3995	call	_aesni_decrypt6
3996
3997	pxor	$iv,$inout0		# ^= IV
3998	movdqa	$inout6,$iv
3999	pxor	$in0,$inout1
4000	movdqu	$inout0,($out)
4001	pxor	$in1,$inout2
4002	movdqu	$inout1,0x10($out)
4003	pxor	$in2,$inout3
4004	movdqu	$inout2,0x20($out)
4005	pxor	$in3,$inout4
4006	mov	$key_,$key
4007	movdqu	$inout3,0x30($out)
4008	pxor	$in4,$inout5
4009	mov	$rnds_,$rounds
4010	movdqu	$inout4,0x40($out)
4011	lea	0x50($out),$out
4012	sub	\$0x60,$len
4013	ja	.Lcbc_dec_loop6
4014
4015	movdqa	$inout5,$inout0
4016	add	\$0x50,$len
4017	jle	.Lcbc_dec_clear_tail_collected
4018	movups	$inout5,($out)
4019	lea	0x10($out),$out
4020
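	# tail processing: at most five blocks' worth of data remains;
	# small counts get dedicated 1/2/3/4-block paths, while five
	# blocks reuse _aesni_decrypt6 with a zeroed sixth block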
4021.Lcbc_dec_tail:
4022	movups	($inp),$inout0
4023	sub	\$0x10,$len
4024	jbe	.Lcbc_dec_one		# $len is 1*16 or less
4025
4026	movups	0x10($inp),$inout1
4027	movaps	$inout0,$in0
4028	sub	\$0x10,$len
4029	jbe	.Lcbc_dec_two		# $len is 2*16 or less
4030
4031	movups	0x20($inp),$inout2
4032	movaps	$inout1,$in1
4033	sub	\$0x10,$len
4034	jbe	.Lcbc_dec_three		# $len is 3*16 or less
4035
4036	movups	0x30($inp),$inout3
4037	movaps	$inout2,$in2
4038	sub	\$0x10,$len
4039	jbe	.Lcbc_dec_four		# $len is 4*16 or less
4040
4041	movups	0x40($inp),$inout4	# $len is 5*16 or less
4042	movaps	$inout3,$in3
4043	movaps	$inout4,$in4
4044	xorps	$inout5,$inout5
4045	call	_aesni_decrypt6
4046	pxor	$iv,$inout0
4047	movaps	$in4,$iv
4048	pxor	$in0,$inout1
4049	movdqu	$inout0,($out)
4050	pxor	$in1,$inout2
4051	movdqu	$inout1,0x10($out)
4052	 pxor	$inout1,$inout1		# clear register bank
4053	pxor	$in2,$inout3
4054	movdqu	$inout2,0x20($out)
4055	 pxor	$inout2,$inout2
4056	pxor	$in3,$inout4
4057	movdqu	$inout3,0x30($out)
4058	 pxor	$inout3,$inout3
4059	lea	0x40($out),$out
4060	movdqa	$inout4,$inout0
4061	 pxor	$inout4,$inout4
4062	 pxor	$inout5,$inout5
4063	sub	\$0x10,$len
4064	jmp	.Lcbc_dec_tail_collected
4065
4066.align	16
4067.Lcbc_dec_one:
4068	movaps	$inout0,$in0
4069___
4070	&aesni_generate1("dec",$key,$rounds);
4071$code.=<<___;
4072	xorps	$iv,$inout0
4073	movaps	$in0,$iv
4074	jmp	.Lcbc_dec_tail_collected
4075.align	16
4076.Lcbc_dec_two:
4077	movaps	$inout1,$in1
4078	call	_aesni_decrypt2
4079	pxor	$iv,$inout0
4080	movaps	$in1,$iv
4081	pxor	$in0,$inout1
4082	movdqu	$inout0,($out)
4083	movdqa	$inout1,$inout0
4084	 pxor	$inout1,$inout1		# clear register bank
4085	lea	0x10($out),$out
4086	jmp	.Lcbc_dec_tail_collected
4087.align	16
4088.Lcbc_dec_three:
4089	movaps	$inout2,$in2
4090	call	_aesni_decrypt3
4091	pxor	$iv,$inout0
4092	movaps	$in2,$iv
4093	pxor	$in0,$inout1
4094	movdqu	$inout0,($out)
4095	pxor	$in1,$inout2
4096	movdqu	$inout1,0x10($out)
4097	 pxor	$inout1,$inout1		# clear register bank
4098	movdqa	$inout2,$inout0
4099	 pxor	$inout2,$inout2
4100	lea	0x20($out),$out
4101	jmp	.Lcbc_dec_tail_collected
4102.align	16
4103.Lcbc_dec_four:
4104	movaps	$inout3,$in3
4105	call	_aesni_decrypt4
4106	pxor	$iv,$inout0
4107	movaps	$in3,$iv
4108	pxor	$in0,$inout1
4109	movdqu	$inout0,($out)
4110	pxor	$in1,$inout2
4111	movdqu	$inout1,0x10($out)
4112	 pxor	$inout1,$inout1		# clear register bank
4113	pxor	$in2,$inout3
4114	movdqu	$inout2,0x20($out)
4115	 pxor	$inout2,$inout2
4116	movdqa	$inout3,$inout0
4117	 pxor	$inout3,$inout3
4118	lea	0x30($out),$out
4119	jmp	.Lcbc_dec_tail_collected
4120
4121.align	16
4122.Lcbc_dec_clear_tail_collected:
4123	pxor	$inout1,$inout1		# clear register bank
4124	pxor	$inout2,$inout2
4125	pxor	$inout3,$inout3
4126___
4127$code.=<<___ if (!$win64);
4128	pxor	$inout4,$inout4		# %xmm6..9
4129	pxor	$inout5,$inout5
4130	pxor	$inout6,$inout6
4131	pxor	$inout7,$inout7
4132___
4133$code.=<<___;
4134.Lcbc_dec_tail_collected:
4135	movups	$iv,($ivp)
4136	and	\$15,$len
4137	jnz	.Lcbc_dec_tail_partial
4138	movups	$inout0,($out)
4139	pxor	$inout0,$inout0
4140	jmp	.Lcbc_dec_ret
4141.align	16
4142.Lcbc_dec_tail_partial:
4143	movaps	$inout0,(%rsp)
4144	pxor	$inout0,$inout0
4145	mov	\$16,%rcx
4146	mov	$out,%rdi
4147	sub	$len,%rcx
4148	lea	(%rsp),%rsi
4149	.long	0x9066A4F3		# rep movsb
4150	movdqa	$inout0,(%rsp)
4151
4152.Lcbc_dec_ret:
4153	xorps	$rndkey0,$rndkey0	# %xmm0
4154	pxor	$rndkey1,$rndkey1
4155___
4156$code.=<<___ if ($win64);
4157	movaps	0x10(%rsp),%xmm6
4158	movaps	%xmm0,0x10(%rsp)	# clear stack
4159	movaps	0x20(%rsp),%xmm7
4160	movaps	%xmm0,0x20(%rsp)
4161	movaps	0x30(%rsp),%xmm8
4162	movaps	%xmm0,0x30(%rsp)
4163	movaps	0x40(%rsp),%xmm9
4164	movaps	%xmm0,0x40(%rsp)
4165	movaps	0x50(%rsp),%xmm10
4166	movaps	%xmm0,0x50(%rsp)
4167	movaps	0x60(%rsp),%xmm11
4168	movaps	%xmm0,0x60(%rsp)
4169	movaps	0x70(%rsp),%xmm12
4170	movaps	%xmm0,0x70(%rsp)
4171	movaps	0x80(%rsp),%xmm13
4172	movaps	%xmm0,0x80(%rsp)
4173	movaps	0x90(%rsp),%xmm14
4174	movaps	%xmm0,0x90(%rsp)
4175	movaps	0xa0(%rsp),%xmm15
4176	movaps	%xmm0,0xa0(%rsp)
4177___
4178$code.=<<___;
4179	mov	-8(%r11),%rbp
4180	lea	(%r11),%rsp
4181.Lcbc_ret:
4182	ret
4183.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4184___
4185}
4186# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4187#				int bits, AES_KEY *key)
4188#
4189# input:	$inp	user-supplied key
4190#		$bits	$inp length in bits
4191#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 denoting failure (see C counterpart)
4193#		*$key	key schedule
4194#
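# The decrypt schedule is the encrypt schedule in reverse order with
# InvMixColumns (aesimc) applied to every round key except the first
# and the last, i.e. the "equivalent inverse cipher" layout expected by
# aesdec/aesdeclast. The code below therefore runs the encrypt
# expansion, swaps the two end keys, and then walks towards the middle
# swapping and transforming pairs of round keys in place.
#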
4195{ my ($inp,$bits,$key) = @_4args;
4196  $bits =~ s/%r/%e/;
4197
4198$code.=<<___;
4199.globl	${PREFIX}_set_decrypt_key
4200.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
4201.align	16
4202${PREFIX}_set_decrypt_key:
4203	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4204	call	__aesni_set_encrypt_key
4205	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
4206	test	%eax,%eax
4207	jnz	.Ldec_key_ret
4208	lea	16($key,$bits),$inp	# points at the end of key schedule
4209
4210	$movkey	($key),%xmm0		# just swap
4211	$movkey	($inp),%xmm1
4212	$movkey	%xmm0,($inp)
4213	$movkey	%xmm1,($key)
4214	lea	16($key),$key
4215	lea	-16($inp),$inp
4216
4217.Ldec_key_inverse:
4218	$movkey	($key),%xmm0		# swap and inverse
4219	$movkey	($inp),%xmm1
4220	aesimc	%xmm0,%xmm0
4221	aesimc	%xmm1,%xmm1
4222	lea	16($key),$key
4223	lea	-16($inp),$inp
4224	$movkey	%xmm0,16($inp)
4225	$movkey	%xmm1,-16($key)
4226	cmp	$key,$inp
4227	ja	.Ldec_key_inverse
4228
4229	$movkey	($key),%xmm0		# inverse middle
4230	aesimc	%xmm0,%xmm0
4231	pxor	%xmm1,%xmm1
4232	$movkey	%xmm0,($inp)
4233	pxor	%xmm0,%xmm0
4234.Ldec_key_ret:
4235	add	\$8,%rsp
4236	ret
4237.LSEH_end_set_decrypt_key:
4238.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4239___
4240
4241# This is based on submission by
4242#
4243#	Huang Ying <ying.huang@intel.com>
4244#	Vinodh Gopal <vinodh.gopal@intel.com>
4245#	Kahraman Akdemir
4246#
# Aggressively optimized with respect to aeskeygenassist's critical
# path; the whole working set is contained in %xmm0-5 to meet the Win64
# ABI requirement.
4249#
4250# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4251#				int bits, AES_KEY * const key);
4252#
4253# input:	$inp	user-supplied key
4254#		$bits	$inp length in bits
4255#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 denoting failure (see C counterpart)
4257#		$bits	rounds-1 (used in aesni_set_decrypt_key)
4258#		*$key	key schedule
4259#		$key	pointer to key schedule (used in
4260#			aesni_set_decrypt_key)
4261#
# Subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that
# the number of volatile registers is smaller on Windows.
4265#
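# Both expansion flavours below implement the FIPS-197 recurrence: for
# each new round key the last word of the previous one is rotated,
# pushed through the S-box and xor-ed with a round constant (done by
# aeskeygenassist, or by a pshufb/aesenclast pair on the _alt paths),
# and the result is folded into a running xor of the previous round
# key's words, computed with the shufps/pslldq shift-and-xor chains.
#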
4266$code.=<<___;
4267.globl	${PREFIX}_set_encrypt_key
4268.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
4269.align	16
4270${PREFIX}_set_encrypt_key:
4271__aesni_set_encrypt_key:
4272	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4273	mov	\$-1,%rax
4274	test	$inp,$inp
4275	jz	.Lenc_key_ret
4276	test	$key,$key
4277	jz	.Lenc_key_ret
4278
4279	movups	($inp),%xmm0		# pull first 128 bits of *userKey
4280	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
4281	leaq	OPENSSL_ia32cap_P(%rip),%r10
4282	movl	4(%r10),%r10d
4283	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
4284	lea	16($key),%rax		# %rax is used as modifiable copy of $key
4285	cmp	\$256,$bits
4286	je	.L14rounds
4287	cmp	\$192,$bits
4288	je	.L12rounds
4289	cmp	\$128,$bits
4290	jne	.Lbad_keybits
4291
4292.L10rounds:
4293	mov	\$9,$bits			# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4295	je	.L10rounds_alt
4296
4297	$movkey	%xmm0,($key)			# round 0
4298	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
4299	call		.Lkey_expansion_128_cold
4300	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
4301	call		.Lkey_expansion_128
4302	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
4303	call		.Lkey_expansion_128
4304	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
4305	call		.Lkey_expansion_128
4306	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
4307	call		.Lkey_expansion_128
4308	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
4309	call		.Lkey_expansion_128
4310	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
4311	call		.Lkey_expansion_128
4312	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
4313	call		.Lkey_expansion_128
4314	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
4315	call		.Lkey_expansion_128
4316	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
4317	call		.Lkey_expansion_128
4318	$movkey	%xmm0,(%rax)
4319	mov	$bits,80(%rax)	# 240(%rdx)
4320	xor	%eax,%eax
4321	jmp	.Lenc_key_ret
4322
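	# .L10rounds_alt avoids aeskeygenassist: pshufb with .Lkey_rotate
	# broadcasts the rotated last word into all four lanes, so the
	# ShiftRows step of aesenclast has no effect and the instruction
	# reduces to SubBytes plus xor with the round constant, which is
	# kept in a register and doubled with pslld each iteration (the
	# 0x1b/0x36 constants for the last two rounds come from
	# .Lkey_rcon1b)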
4323.align	16
4324.L10rounds_alt:
4325	movdqa	.Lkey_rotate(%rip),%xmm5
4326	mov	\$8,%r10d
4327	movdqa	.Lkey_rcon1(%rip),%xmm4
4328	movdqa	%xmm0,%xmm2
4329	movdqu	%xmm0,($key)
4330	jmp	.Loop_key128
4331
4332.align	16
4333.Loop_key128:
4334	pshufb		%xmm5,%xmm0
4335	aesenclast	%xmm4,%xmm0
4336	pslld		\$1,%xmm4
4337	lea		16(%rax),%rax
4338
4339	movdqa		%xmm2,%xmm3
4340	pslldq		\$4,%xmm2
4341	pxor		%xmm2,%xmm3
4342	pslldq		\$4,%xmm2
4343	pxor		%xmm2,%xmm3
4344	pslldq		\$4,%xmm2
4345	pxor		%xmm3,%xmm2
4346
4347	pxor		%xmm2,%xmm0
4348	movdqu		%xmm0,-16(%rax)
4349	movdqa		%xmm0,%xmm2
4350
4351	dec	%r10d
4352	jnz	.Loop_key128
4353
4354	movdqa		.Lkey_rcon1b(%rip),%xmm4
4355
4356	pshufb		%xmm5,%xmm0
4357	aesenclast	%xmm4,%xmm0
4358	pslld		\$1,%xmm4
4359
4360	movdqa		%xmm2,%xmm3
4361	pslldq		\$4,%xmm2
4362	pxor		%xmm2,%xmm3
4363	pslldq		\$4,%xmm2
4364	pxor		%xmm2,%xmm3
4365	pslldq		\$4,%xmm2
4366	pxor		%xmm3,%xmm2
4367
4368	pxor		%xmm2,%xmm0
4369	movdqu		%xmm0,(%rax)
4370
4371	movdqa		%xmm0,%xmm2
4372	pshufb		%xmm5,%xmm0
4373	aesenclast	%xmm4,%xmm0
4374
4375	movdqa		%xmm2,%xmm3
4376	pslldq		\$4,%xmm2
4377	pxor		%xmm2,%xmm3
4378	pslldq		\$4,%xmm2
4379	pxor		%xmm2,%xmm3
4380	pslldq		\$4,%xmm2
4381	pxor		%xmm3,%xmm2
4382
4383	pxor		%xmm2,%xmm0
4384	movdqu		%xmm0,16(%rax)
4385
4386	mov	$bits,96(%rax)	# 240($key)
4387	xor	%eax,%eax
4388	jmp	.Lenc_key_ret
4389
4390.align	16
4391.L12rounds:
4392	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
	mov	\$11,$bits			# 12 rounds for 192-bit key
4394	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4395	je	.L12rounds_alt
4396
4397	$movkey	%xmm0,($key)			# round 0
4398	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
4399	call		.Lkey_expansion_192a_cold
4400	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
4401	call		.Lkey_expansion_192b
4402	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
4403	call		.Lkey_expansion_192a
4404	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
4405	call		.Lkey_expansion_192b
4406	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
4407	call		.Lkey_expansion_192a
4408	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
4409	call		.Lkey_expansion_192b
4410	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
4411	call		.Lkey_expansion_192a
4412	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
4413	call		.Lkey_expansion_192b
4414	$movkey	%xmm0,(%rax)
4415	mov	$bits,48(%rax)	# 240(%rdx)
4416	xor	%rax, %rax
4417	jmp	.Lenc_key_ret
4418
4419.align	16
4420.L12rounds_alt:
4421	movdqa	.Lkey_rotate192(%rip),%xmm5
4422	movdqa	.Lkey_rcon1(%rip),%xmm4
4423	mov	\$8,%r10d
4424	movdqu	%xmm0,($key)
4425	jmp	.Loop_key192
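	# each pass of .Loop_key192 below adds 24 more bytes of key
	# schedule; the rotate/S-box/Rcon core is the same pshufb and
	# aesenclast trick as in the 128-bit path, applied to the upper
	# word of the 64-bit key tail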
4426
4427.align	16
4428.Loop_key192:
4429	movq		%xmm2,0(%rax)
4430	movdqa		%xmm2,%xmm1
4431	pshufb		%xmm5,%xmm2
4432	aesenclast	%xmm4,%xmm2
4433	pslld		\$1, %xmm4
4434	lea		24(%rax),%rax
4435
4436	movdqa		%xmm0,%xmm3
4437	pslldq		\$4,%xmm0
4438	pxor		%xmm0,%xmm3
4439	pslldq		\$4,%xmm0
4440	pxor		%xmm0,%xmm3
4441	pslldq		\$4,%xmm0
4442	pxor		%xmm3,%xmm0
4443
4444	pshufd		\$0xff,%xmm0,%xmm3
4445	pxor		%xmm1,%xmm3
4446	pslldq		\$4,%xmm1
4447	pxor		%xmm1,%xmm3
4448
4449	pxor		%xmm2,%xmm0
4450	pxor		%xmm3,%xmm2
4451	movdqu		%xmm0,-16(%rax)
4452
4453	dec	%r10d
4454	jnz	.Loop_key192
4455
4456	mov	$bits,32(%rax)	# 240($key)
4457	xor	%eax,%eax
4458	jmp	.Lenc_key_ret
4459
4460.align	16
4461.L14rounds:
	movups	16($inp),%xmm2			# remaining half of *userKey
	mov	\$13,$bits			# 14 rounds for 256-bit key
4464	lea	16(%rax),%rax
4465	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4466	je	.L14rounds_alt
4467
4468	$movkey	%xmm0,($key)			# round 0
4469	$movkey	%xmm2,16($key)			# round 1
4470	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
4471	call		.Lkey_expansion_256a_cold
4472	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
4473	call		.Lkey_expansion_256b
4474	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
4475	call		.Lkey_expansion_256a
4476	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
4477	call		.Lkey_expansion_256b
4478	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
4479	call		.Lkey_expansion_256a
4480	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
4481	call		.Lkey_expansion_256b
4482	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
4483	call		.Lkey_expansion_256a
4484	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
4485	call		.Lkey_expansion_256b
4486	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
4487	call		.Lkey_expansion_256a
4488	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
4489	call		.Lkey_expansion_256b
4490	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
4491	call		.Lkey_expansion_256a
4492	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
4493	call		.Lkey_expansion_256b
4494	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
4495	call		.Lkey_expansion_256a
4496	$movkey	%xmm0,(%rax)
4497	mov	$bits,16(%rax)	# 240(%rdx)
4498	xor	%rax,%rax
4499	jmp	.Lenc_key_ret
4500
4501.align	16
4502.L14rounds_alt:
4503	movdqa	.Lkey_rotate(%rip),%xmm5
4504	movdqa	.Lkey_rcon1(%rip),%xmm4
4505	mov	\$7,%r10d
4506	movdqu	%xmm0,0($key)
4507	movdqa	%xmm2,%xmm1
4508	movdqu	%xmm2,16($key)
4509	jmp	.Loop_key256
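	# .Loop_key256 produces two round keys per pass: the even one
	# uses the rotate/S-box/Rcon core, while the odd one needs only
	# the S-box step, obtained with aesenclast and an all-zero round
	# key (ShiftRows is again a no-op because all four lanes are equal)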
4510
4511.align	16
4512.Loop_key256:
4513	pshufb		%xmm5,%xmm2
4514	aesenclast	%xmm4,%xmm2
4515
4516	movdqa		%xmm0,%xmm3
4517	pslldq		\$4,%xmm0
4518	pxor		%xmm0,%xmm3
4519	pslldq		\$4,%xmm0
4520	pxor		%xmm0,%xmm3
4521	pslldq		\$4,%xmm0
4522	pxor		%xmm3,%xmm0
4523	pslld		\$1,%xmm4
4524
4525	pxor		%xmm2,%xmm0
4526	movdqu		%xmm0,(%rax)
4527
4528	dec	%r10d
4529	jz	.Ldone_key256
4530
4531	pshufd		\$0xff,%xmm0,%xmm2
4532	pxor		%xmm3,%xmm3
4533	aesenclast	%xmm3,%xmm2
4534
4535	movdqa		%xmm1,%xmm3
4536	pslldq		\$4,%xmm1
4537	pxor		%xmm1,%xmm3
4538	pslldq		\$4,%xmm1
4539	pxor		%xmm1,%xmm3
4540	pslldq		\$4,%xmm1
4541	pxor		%xmm3,%xmm1
4542
4543	pxor		%xmm1,%xmm2
4544	movdqu		%xmm2,16(%rax)
4545	lea		32(%rax),%rax
4546	movdqa		%xmm2,%xmm1
4547
4548	jmp	.Loop_key256
4549
4550.Ldone_key256:
4551	mov	$bits,16(%rax)	# 240($key)
4552	xor	%eax,%eax
4553	jmp	.Lenc_key_ret
4554
4555.align	16
4556.Lbad_keybits:
4557	mov	\$-2,%rax
4558.Lenc_key_ret:
4559	pxor	%xmm0,%xmm0
4560	pxor	%xmm1,%xmm1
4561	pxor	%xmm2,%xmm2
4562	pxor	%xmm3,%xmm3
4563	pxor	%xmm4,%xmm4
4564	pxor	%xmm5,%xmm5
4565	add	\$8,%rsp
4566	ret
4567.LSEH_end_set_encrypt_key:
4568
4569.align	16
4570.Lkey_expansion_128:
4571	$movkey	%xmm0,(%rax)
4572	lea	16(%rax),%rax
4573.Lkey_expansion_128_cold:
4574	shufps	\$0b00010000,%xmm0,%xmm4
4575	xorps	%xmm4, %xmm0
4576	shufps	\$0b10001100,%xmm0,%xmm4
4577	xorps	%xmm4, %xmm0
4578	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4579	xorps	%xmm1,%xmm0
4580	ret
4581
4582.align 16
4583.Lkey_expansion_192a:
4584	$movkey	%xmm0,(%rax)
4585	lea	16(%rax),%rax
4586.Lkey_expansion_192a_cold:
4587	movaps	%xmm2, %xmm5
4588.Lkey_expansion_192b_warm:
4589	shufps	\$0b00010000,%xmm0,%xmm4
4590	movdqa	%xmm2,%xmm3
4591	xorps	%xmm4,%xmm0
4592	shufps	\$0b10001100,%xmm0,%xmm4
4593	pslldq	\$4,%xmm3
4594	xorps	%xmm4,%xmm0
4595	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
4596	pxor	%xmm3,%xmm2
4597	pxor	%xmm1,%xmm0
4598	pshufd	\$0b11111111,%xmm0,%xmm3
4599	pxor	%xmm3,%xmm2
4600	ret
4601
4602.align 16
4603.Lkey_expansion_192b:
4604	movaps	%xmm0,%xmm3
4605	shufps	\$0b01000100,%xmm0,%xmm5
4606	$movkey	%xmm5,(%rax)
4607	shufps	\$0b01001110,%xmm2,%xmm3
4608	$movkey	%xmm3,16(%rax)
4609	lea	32(%rax),%rax
4610	jmp	.Lkey_expansion_192b_warm
4611
4612.align	16
4613.Lkey_expansion_256a:
4614	$movkey	%xmm2,(%rax)
4615	lea	16(%rax),%rax
4616.Lkey_expansion_256a_cold:
4617	shufps	\$0b00010000,%xmm0,%xmm4
4618	xorps	%xmm4,%xmm0
4619	shufps	\$0b10001100,%xmm0,%xmm4
4620	xorps	%xmm4,%xmm0
4621	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4622	xorps	%xmm1,%xmm0
4623	ret
4624
4625.align 16
4626.Lkey_expansion_256b:
4627	$movkey	%xmm0,(%rax)
4628	lea	16(%rax),%rax
4629
4630	shufps	\$0b00010000,%xmm2,%xmm4
4631	xorps	%xmm4,%xmm2
4632	shufps	\$0b10001100,%xmm2,%xmm4
4633	xorps	%xmm4,%xmm2
4634	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
4635	xorps	%xmm1,%xmm2
4636	ret
4637.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4638.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4639___
4640}
4641
4642$code.=<<___;
4643.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0	# byte-reversal shuffle mask
.Lincrement32:
	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
.Lxts_magic:
	.long	0x87,0,1,0		# GF(2^128) reduction constant for the XTS tweak
.Lincrement1:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1	# the constant 1 in big-endian byte order
.Lkey_rotate:
	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	# pshufb mask: broadcast the rotated last word
.Lkey_rotate192:
	.long	0x04070605,0x04070605,0x04070605,0x04070605	# ditto for the second word (192-bit schedule)
.Lkey_rcon1:
	.long	1,1,1,1			# round constant 0x01, broadcast
.Lkey_rcon1b:
	.long	0x1b,0x1b,0x1b,0x1b	# round constant 0x1b, broadcast
4662
4663.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4664.align	64
4665___
4666
4667# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4668#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
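# The handlers below are invoked when an exception unwinds through one
# of the AES-NI routines. They compare the faulting RIP against the
# prologue/epilogue labels recorded in HandlerData, restore the
# non-volatile XMM (and, where applicable, general-purpose) registers
# from the routine's stack frame into the CONTEXT record, recover the
# original stack pointer and then hand off to RtlVirtualUnwind.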
4669if ($win64) {
4670$rec="%rcx";
4671$frame="%rdx";
4672$context="%r8";
4673$disp="%r9";
4674
4675$code.=<<___;
4676.extern	__imp_RtlVirtualUnwind
4677___
4678$code.=<<___ if ($PREFIX eq "aesni");
4679.type	ecb_ccm64_se_handler,\@abi-omnipotent
4680.align	16
4681ecb_ccm64_se_handler:
4682	push	%rsi
4683	push	%rdi
4684	push	%rbx
4685	push	%rbp
4686	push	%r12
4687	push	%r13
4688	push	%r14
4689	push	%r15
4690	pushfq
4691	sub	\$64,%rsp
4692
4693	mov	120($context),%rax	# pull context->Rax
4694	mov	248($context),%rbx	# pull context->Rip
4695
4696	mov	8($disp),%rsi		# disp->ImageBase
4697	mov	56($disp),%r11		# disp->HandlerData
4698
4699	mov	0(%r11),%r10d		# HandlerData[0]
4700	lea	(%rsi,%r10),%r10	# prologue label
4701	cmp	%r10,%rbx		# context->Rip<prologue label
4702	jb	.Lcommon_seh_tail
4703
4704	mov	152($context),%rax	# pull context->Rsp
4705
4706	mov	4(%r11),%r10d		# HandlerData[1]
4707	lea	(%rsi,%r10),%r10	# epilogue label
4708	cmp	%r10,%rbx		# context->Rip>=epilogue label
4709	jae	.Lcommon_seh_tail
4710
4711	lea	0(%rax),%rsi		# %xmm save area
4712	lea	512($context),%rdi	# &context.Xmm6
4713	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
4714	.long	0xa548f3fc		# cld; rep movsq
4715	lea	0x58(%rax),%rax		# adjust stack pointer
4716
4717	jmp	.Lcommon_seh_tail
4718.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4719
4720.type	ctr_xts_se_handler,\@abi-omnipotent
4721.align	16
4722ctr_xts_se_handler:
4723	push	%rsi
4724	push	%rdi
4725	push	%rbx
4726	push	%rbp
4727	push	%r12
4728	push	%r13
4729	push	%r14
4730	push	%r15
4731	pushfq
4732	sub	\$64,%rsp
4733
4734	mov	120($context),%rax	# pull context->Rax
4735	mov	248($context),%rbx	# pull context->Rip
4736
4737	mov	8($disp),%rsi		# disp->ImageBase
4738	mov	56($disp),%r11		# disp->HandlerData
4739
4740	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
4742	cmp	%r10,%rbx		# context->Rip<prologue label
4743	jb	.Lcommon_seh_tail
4744
4745	mov	152($context),%rax	# pull context->Rsp
4746
4747	mov	4(%r11),%r10d		# HandlerData[1]
4748	lea	(%rsi,%r10),%r10	# epilogue label
4749	cmp	%r10,%rbx		# context->Rip>=epilogue label
4750	jae	.Lcommon_seh_tail
4751
4752	mov	208($context),%rax	# pull context->R11
4753
4754	lea	-0xa8(%rax),%rsi	# %xmm save area
4755	lea	512($context),%rdi	# & context.Xmm6
4756	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4757	.long	0xa548f3fc		# cld; rep movsq
4758
4759	mov	-8(%rax),%rbp		# restore saved %rbp
4760	mov	%rbp,160($context)	# restore context->Rbp
4761	jmp	.Lcommon_seh_tail
4762.size	ctr_xts_se_handler,.-ctr_xts_se_handler
4763
4764.type	ocb_se_handler,\@abi-omnipotent
4765.align	16
4766ocb_se_handler:
4767	push	%rsi
4768	push	%rdi
4769	push	%rbx
4770	push	%rbp
4771	push	%r12
4772	push	%r13
4773	push	%r14
4774	push	%r15
4775	pushfq
4776	sub	\$64,%rsp
4777
4778	mov	120($context),%rax	# pull context->Rax
4779	mov	248($context),%rbx	# pull context->Rip
4780
4781	mov	8($disp),%rsi		# disp->ImageBase
4782	mov	56($disp),%r11		# disp->HandlerData
4783
4784	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
4786	cmp	%r10,%rbx		# context->Rip<prologue label
4787	jb	.Lcommon_seh_tail
4788
4789	mov	4(%r11),%r10d		# HandlerData[1]
4790	lea	(%rsi,%r10),%r10	# epilogue label
4791	cmp	%r10,%rbx		# context->Rip>=epilogue label
4792	jae	.Lcommon_seh_tail
4793
4794	mov	8(%r11),%r10d		# HandlerData[2]
4795	lea	(%rsi,%r10),%r10
4796	cmp	%r10,%rbx		# context->Rip>=pop label
4797	jae	.Locb_no_xmm
4798
4799	mov	152($context),%rax	# pull context->Rsp
4800
4801	lea	(%rax),%rsi		# %xmm save area
4802	lea	512($context),%rdi	# & context.Xmm6
4803	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4804	.long	0xa548f3fc		# cld; rep movsq
4805	lea	0xa0+0x28(%rax),%rax
4806
4807.Locb_no_xmm:
4808	mov	-8(%rax),%rbx
4809	mov	-16(%rax),%rbp
4810	mov	-24(%rax),%r12
4811	mov	-32(%rax),%r13
4812	mov	-40(%rax),%r14
4813
4814	mov	%rbx,144($context)	# restore context->Rbx
4815	mov	%rbp,160($context)	# restore context->Rbp
4816	mov	%r12,216($context)	# restore context->R12
4817	mov	%r13,224($context)	# restore context->R13
4818	mov	%r14,232($context)	# restore context->R14
4819
4820	jmp	.Lcommon_seh_tail
4821.size	ocb_se_handler,.-ocb_se_handler
4822___
4823$code.=<<___;
4824.type	cbc_se_handler,\@abi-omnipotent
4825.align	16
4826cbc_se_handler:
4827	push	%rsi
4828	push	%rdi
4829	push	%rbx
4830	push	%rbp
4831	push	%r12
4832	push	%r13
4833	push	%r14
4834	push	%r15
4835	pushfq
4836	sub	\$64,%rsp
4837
4838	mov	152($context),%rax	# pull context->Rsp
4839	mov	248($context),%rbx	# pull context->Rip
4840
4841	lea	.Lcbc_decrypt_bulk(%rip),%r10
4842	cmp	%r10,%rbx		# context->Rip<"prologue" label
4843	jb	.Lcommon_seh_tail
4844
4845	mov	120($context),%rax	# pull context->Rax
4846
4847	lea	.Lcbc_decrypt_body(%rip),%r10
4848	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
4849	jb	.Lcommon_seh_tail
4850
4851	mov	152($context),%rax	# pull context->Rsp
4852
4853	lea	.Lcbc_ret(%rip),%r10
4854	cmp	%r10,%rbx		# context->Rip>="epilogue" label
4855	jae	.Lcommon_seh_tail
4856
4857	lea	16(%rax),%rsi		# %xmm save area
4858	lea	512($context),%rdi	# &context.Xmm6
4859	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4860	.long	0xa548f3fc		# cld; rep movsq
4861
4862	mov	208($context),%rax	# pull context->R11
4863
4864	mov	-8(%rax),%rbp		# restore saved %rbp
4865	mov	%rbp,160($context)	# restore context->Rbp
4866
4867.Lcommon_seh_tail:
4868	mov	8(%rax),%rdi
4869	mov	16(%rax),%rsi
4870	mov	%rax,152($context)	# restore context->Rsp
4871	mov	%rsi,168($context)	# restore context->Rsi
4872	mov	%rdi,176($context)	# restore context->Rdi
4873
4874	mov	40($disp),%rdi		# disp->ContextRecord
4875	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)/8, qwords for rep movsq
4877	.long	0xa548f3fc		# cld; rep movsq
4878
4879	mov	$disp,%rsi
4880	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4881	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4882	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4883	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4884	mov	40(%rsi),%r10		# disp->ContextRecord
4885	lea	56(%rsi),%r11		# &disp->HandlerData
4886	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4887	mov	%r10,32(%rsp)		# arg5
4888	mov	%r11,40(%rsp)		# arg6
4889	mov	%r12,48(%rsp)		# arg7
4890	mov	%rcx,56(%rsp)		# arg8, (NULL)
4891	call	*__imp_RtlVirtualUnwind(%rip)
4892
4893	mov	\$1,%eax		# ExceptionContinueSearch
4894	add	\$64,%rsp
4895	popfq
4896	pop	%r15
4897	pop	%r14
4898	pop	%r13
4899	pop	%r12
4900	pop	%rbp
4901	pop	%rbx
4902	pop	%rdi
4903	pop	%rsi
4904	ret
4905.size	cbc_se_handler,.-cbc_se_handler
4906
4907.section	.pdata
4908.align	4
4909___
4910$code.=<<___ if ($PREFIX eq "aesni");
4911	.rva	.LSEH_begin_aesni_ecb_encrypt
4912	.rva	.LSEH_end_aesni_ecb_encrypt
4913	.rva	.LSEH_info_ecb
4914
4915	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
4916	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
4917	.rva	.LSEH_info_ccm64_enc
4918
4919	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
4920	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
4921	.rva	.LSEH_info_ccm64_dec
4922
4923	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
4924	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
4925	.rva	.LSEH_info_ctr32
4926
4927	.rva	.LSEH_begin_aesni_xts_encrypt
4928	.rva	.LSEH_end_aesni_xts_encrypt
4929	.rva	.LSEH_info_xts_enc
4930
4931	.rva	.LSEH_begin_aesni_xts_decrypt
4932	.rva	.LSEH_end_aesni_xts_decrypt
4933	.rva	.LSEH_info_xts_dec
4934
4935	.rva	.LSEH_begin_aesni_ocb_encrypt
4936	.rva	.LSEH_end_aesni_ocb_encrypt
4937	.rva	.LSEH_info_ocb_enc
4938
4939	.rva	.LSEH_begin_aesni_ocb_decrypt
4940	.rva	.LSEH_end_aesni_ocb_decrypt
4941	.rva	.LSEH_info_ocb_dec
4942___
4943$code.=<<___;
4944	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
4945	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
4946	.rva	.LSEH_info_cbc
4947
4948	.rva	${PREFIX}_set_decrypt_key
4949	.rva	.LSEH_end_set_decrypt_key
4950	.rva	.LSEH_info_key
4951
4952	.rva	${PREFIX}_set_encrypt_key
4953	.rva	.LSEH_end_set_encrypt_key
4954	.rva	.LSEH_info_key
4955.section	.xdata
4956.align	8
4957___
4958$code.=<<___ if ($PREFIX eq "aesni");
4959.LSEH_info_ecb:
4960	.byte	9,0,0,0
4961	.rva	ecb_ccm64_se_handler
4962	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
4963.LSEH_info_ccm64_enc:
4964	.byte	9,0,0,0
4965	.rva	ecb_ccm64_se_handler
4966	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
4967.LSEH_info_ccm64_dec:
4968	.byte	9,0,0,0
4969	.rva	ecb_ccm64_se_handler
4970	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
4971.LSEH_info_ctr32:
4972	.byte	9,0,0,0
4973	.rva	ctr_xts_se_handler
4974	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
4975.LSEH_info_xts_enc:
4976	.byte	9,0,0,0
4977	.rva	ctr_xts_se_handler
4978	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
4979.LSEH_info_xts_dec:
4980	.byte	9,0,0,0
4981	.rva	ctr_xts_se_handler
4982	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
4983.LSEH_info_ocb_enc:
4984	.byte	9,0,0,0
4985	.rva	ocb_se_handler
4986	.rva	.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
4987	.rva	.Locb_enc_pop
4988	.long	0
4989.LSEH_info_ocb_dec:
4990	.byte	9,0,0,0
4991	.rva	ocb_se_handler
4992	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
4993	.rva	.Locb_dec_pop
4994	.long	0
4995___
4996$code.=<<___;
4997.LSEH_info_cbc:
4998	.byte	9,0,0,0
4999	.rva	cbc_se_handler
5000.LSEH_info_key:
5001	.byte	0x01,0x04,0x01,0x00
5002	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
5003___
5004}
5005
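# rex() prepends a REX prefix byte when either xmm operand is
# %xmm8-%xmm15: 0x04 is REX.R for the ModR/M reg field (destination)
# and 0x01 is REX.B for the r/m field (source).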
5006sub rex {
5007  local *opcode=shift;
5008  my ($dst,$src)=@_;
5009  my $rex=0;
5010
5011    $rex|=0x04			if($dst>=8);
5012    $rex|=0x01			if($src>=8);
5013    push @opcode,$rex|0x40	if($rex);
5014}
5015
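# aesni() and movbe() below hand-assemble AES-NI and MOVBE instructions
# into .byte sequences so that the module can still be built with
# assemblers that do not recognize these mnemonics; the substitutions
# at the end of the file apply them to every matching line of the
# generated code.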
5016sub aesni {
5017  my $line=shift;
5018  my @opcode=(0x66);
5019
5020    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5021	rex(\@opcode,$4,$3);
5022	push @opcode,0x0f,0x3a,0xdf;
5023	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
5024	my $c=$2;
5025	push @opcode,$c=~/^0/?oct($c):$c;
5026	return ".byte\t".join(',',@opcode);
5027    }
5028    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5029	my %opcodelet = (
5030		"aesimc" => 0xdb,
5031		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
5032		"aesdec" => 0xde,	"aesdeclast" => 0xdf
5033	);
5034	return undef if (!defined($opcodelet{$1}));
5035	rex(\@opcode,$3,$2);
5036	push @opcode,0x0f,0x38,$opcodelet{$1};
5037	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
5038	return ".byte\t".join(',',@opcode);
5039    }
5040    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
5041	my %opcodelet = (
5042		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
5043		"aesdec" => 0xde,	"aesdeclast" => 0xdf
5044	);
5045	return undef if (!defined($opcodelet{$1}));
5046	my $off = $2;
5047	push @opcode,0x44 if ($3>=8);
5048	push @opcode,0x0f,0x38,$opcodelet{$1};
5049	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M
5050	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
5051	return ".byte\t".join(',',@opcode);
5052    }
5053    return $line;
5054}
5055
5056sub movbe {
5057	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
5058}
5059
5060$code =~ s/\`([^\`]*)\`/eval($1)/gem;
5061$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
5062#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
5063$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
5064
5065print $code;
5066
5067close STDOUT;
5068