1392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#!/usr/bin/env perl
2392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
3392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# ====================================================================
4392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# project. The module is, however, dual licensed under OpenSSL and
6392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CRYPTOGAMS licenses depending on where you obtain it. For further
7392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# details see http://www.openssl.org/~appro/cryptogams/.
8392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# ====================================================================
9392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
10392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# This module implements support for Intel AES-NI extension. In
11392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# OpenSSL context it's used with Intel engine, but can also be used as
12392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
13392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# details].
14392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
15392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Performance.
16392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
17392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Given aes(enc|dec) instructions' latency asymptotic performance for
18392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
19392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# processed with 128-bit key. And given their throughput asymptotic
20392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# performance for parallelizable modes is 1.25 cycles per byte. Being
21392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# asymptotic limit it's not something you commonly achieve in reality,
22392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
24392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# decryption.
25392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
26392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	16-byte     64-byte     256-byte    1-KB        8-KB
27392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
28392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
29392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
30392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
31392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
32392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
33392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
34392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
35392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
36392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
37392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# The results were collected with specially crafted speed.c benchmark
38392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# in order to compare them with results reported in "Intel Advanced
39392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Encryption Standard (AES) New Instruction Set" White Paper Revision
40392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 3.0 dated May 2010. All above results are consistently better. This
41392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# module also provides better performance for block sizes smaller than
42392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 128 bytes in points *not* represented in the above table.
43392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
44392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Looking at the results for 8-KB buffer.
45392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
46392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CFB and OFB results are far from the limit, because implementation
47392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
48392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# single-block aesni_encrypt, which is not the most optimal way to go.
49392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CBC encrypt result is unexpectedly high and there is no documented
50392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# explanation for it. Seemingly there is a small penalty for feeding
51392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# the result back to AES unit the way it's done in CBC mode. There is
52392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# nothing one can do and the result appears optimal. CCM result is
53392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# identical to CBC, because CBC-MAC is essentially CBC encrypt without
54392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
57392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# disjointly. Parallelizable modes practically achieve the theoretical
58392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# limit.
59392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
60392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Looking at how results vary with buffer size.
61392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
62392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Curves are practically saturated at 1-KB buffer size. In most cases
63392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
64392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CTR curve doesn't follow this pattern and is "slowest" changing one
65392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# with "256-byte" result being 87% of "8-KB." This is because overhead
66392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# in CTR mode is most computationally intensive. Small-block CCM
67392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# decrypt is slower than encrypt, because first CTR and last CBC-MAC
68392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# iterations can't be interleaved.
69392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
70392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Results for 192- and 256-bit keys.
71392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
72392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# EVP-free results were observed to scale perfectly with number of
73392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
74392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
75392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# are a tad smaller, because the above mentioned penalty biases all
76392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# results by same constant value. In similar way function call
77392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# overhead affects small-block performance, as well as OFB and CFB
78392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
81392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
82392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# January 2011
83392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
84392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# While Westmere processor features 6 cycles latency for aes[enc|dec]
85392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# instructions, which can be scheduled every second cycle, Sandy
86392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Bridge spends 8 cycles per instruction, but it can schedule them
87392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# every cycle. This means that code targeting Westmere would perform
88392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# suboptimally on Sandy Bridge. Therefore this update.
89392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
90392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# In addition, non-parallelizable CBC encrypt (as well as CCM) is
91392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# optimized. Relative improvement might appear modest, 8% on Westmere,
92392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# but in absolute terms it's 3.77 cycles per byte encrypted with
93392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
94392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# should be compared to asymptotic limits of 3.75 for Westmere and
95392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 5.00 for Sandy Bridge. Actually, the fact that they get this close
96392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# to asymptotic limits is quite amazing. Indeed, the limit is
97392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# calculated as latency times number of rounds, 10 for 128-bit key,
98392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# and divided by 16, the number of bytes in block, or in other words
99392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# it accounts *solely* for aesenc instructions. But there are extra
100392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# instructions, and numbers so close to the asymptotic limits mean
101392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# that it's as if it takes as little as *one* additional cycle to
102392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# execute all of them. How is it possible? It is possible thanks to
103392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# out-of-order execution logic, which manages to overlap post-
104392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# processing of previous block, things like saving the output, with
105392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# actual encryption of current block, as well as pre-processing of
106392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# current block, things like fetching input and xor-ing it with
107392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 0-round element of the key schedule, with actual encryption of
108392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# previous block. Keep this in mind...
109392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
110392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
111392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# performance is achieved by interleaving instructions working on
112392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# independent blocks. In which case asymptotic limit for such modes
113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# can be obtained by dividing above mentioned numbers by AES
114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# instructions' interleave factor. Westmere can execute at most 3
115392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
117392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# interleave factor" means that increase of interleave factor does
118392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# not improve performance. The formula has proven to reflect reality
119392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# pretty well on Westmere... Sandy Bridge on the other hand can
120392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# execute up to 8 AES instructions at a time, so how does varying
121392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# interleave factor affect the performance? Here is table for ECB
122392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# (numbers are cycles per byte processed with 128-bit key):
123392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
124392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# instruction interleave factor		3x	6x	8x
125392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# theoretical asymptotic limit		1.67	0.83	0.625
126392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# measured performance for 8KB block	1.05	0.86	0.84
127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
128392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# "as if" interleave factor		4.7x	5.8x	6.0x
129392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
130392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Further data for other parallelizable modes:
131392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
132392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CBC decrypt				1.16	0.93	0.93
133392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CTR					1.14	0.91	n/a
134392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
135392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Well, given 3x column it's probably inappropriate to call the limit
136392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# asymptotic, if it can be surpassed, isn't it? What happens there?
137392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
138392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
142392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# of additional cycles, but in 8x case number of instructions must be
143392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# a tad too high for out-of-order logic to cope with, and AES unit
144392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
147392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
148392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Higher interleave factors do have negative impact on Westmere
149392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# performance. While for ECB mode it's negligible ~1.5%, other
150392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# parallelizables perform ~5% worse, which is outweighed by ~25%
151392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# improvement on Sandy Bridge. To balance regression on Westmere
152392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# CTR mode was implemented with 6x aesenc interleave factor.
153392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
154392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# April 2011
155392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
156392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
157392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
158392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# in CTR mode AES instruction interleave factor was chosen to be 6x.
159392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line is "[flavour] [output]". A first argument containing a
# dot is taken to be the output file name, leaving flavour undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is selected by a nasm/masm/mingw64 flavour
# or by an output file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in
# ../../perlasm/ relative to the script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator, which is run under the same perl interpreter ($^X).
# Fail loudly if the pipe cannot be set up; otherwise the generated
# code would be silently discarded.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Instruction used to load round keys. Both arms of the selection name
# the same instruction, so $PREFIX currently has no effect on it.
if ($PREFIX eq "aesni")	{ $movkey="movups"; }
else			{ $movkey="movups"; }

# First four integer argument registers of the target calling convention.
if ($win64)	{ @_4args=("%rcx","%rdx","%r8","%r9");   }	# Win64 order
else		{ @_4args=("%rdi","%rsi","%rdx","%rcx"); }	# Unix order

# Accumulator for the generated assembly text.
$code=".text\n";

# General-purpose register names. $inp/$out/$len/$key follow the natural
# Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ..., $ivp is
# for cbc, ctr, ...
# NB: $rounds and $key are input to and changed by aesni_[en|de]cryptN !!!
($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
$rounds="%eax";

# Backup copies for $rounds and $key respectively.
($rnds_,$key_)=("%r10d","%r11");

# %xmm register layout: %xmm0-1 hold round keys, %xmm2-9 hold data blocks.
($rndkey0,$rndkey1)=map("%xmm$_",(0..1));
($inout0,$inout1,$inout2,$inout3,
 $inout4,$inout5,$inout6,$inout7)=map("%xmm$_",(2..9));

# Aliases for %xmm6-9, used in CBC decrypt, CTR, ...
($in2,$in1,$in0,$iv)=map("%xmm$_",(6..9));
204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
205392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Inline version of internal aesni_[en|de]crypt1.
206392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# cycles which take care of loop variables...
{ my $serial;	# number of expansions so far; keeps loop labels unique

# aesni_generate1(p, key, rounds [, inout [, ivec]])
#
# Appends to $code an inline single-block cipher sequence.
#   p      - instruction suffix; the emitted mnemonics are "aes<p>" and
#            "aes<p>last"
#   key    - register pointing at the key schedule (advanced by the
#            emitted code)
#   rounds - register used as loop counter (decremented by the emitted
#            code)
#   inout  - %xmm register holding the block; defaults to $inout0
#   ivec   - optional %xmm register; when given, it is xor-ed with the
#            0-round key and then into inout in place of the plain
#            0-round key xor
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;
my $label;

$inout=$inout0 unless (defined($inout));
$label=".Loop_${p}1_".(++$serial);

# fetch the first two round keys
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___

# 0-round key addition, with or without the extra vector folded in
if (defined($ivec)) {
$code.=<<___;
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
} else {
$code.=<<___;
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
}

# folded round loop followed by the final round
$code.=<<___;
$label:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	$label	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
236392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
237392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
{ my ($inp,$out,$key) = @_4args;

# Emits the symbol directives, the entry label and the two instructions
# both public single-block routines start with.
my $prologue = sub {
my ($sym)=@_;
$code.=<<___;
.globl	$sym
.type	$sym,\@abi-omnipotent
.align	16
$sym:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
};

# ${PREFIX}_encrypt: load block, run the inline "enc" sequence, store.
$prologue->("${PREFIX}_encrypt");
aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

___

# ${PREFIX}_decrypt: same shape, using the inline "dec" sequence.
$prologue->("${PREFIX}_decrypt");
aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# output
	ret
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}
268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# aes[enc|dec] latency was originally 6, it could be scheduled only
272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# every *2nd* cycle. Thus 3x interleave was the one providing optimal
273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# utilization, i.e. when subroutine's throughput is virtually same as
274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# of non-interleaved subroutine [for number of input blocks up to 3].
275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# This is why it makes no sense to implement 2x subroutine.
276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# aes[enc|dec] latency in next processor generation is 8, but the
277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# instructions can be scheduled every cycle. Optimal interleave for
278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# new processor is therefore 8x...
# aesni_generate3(dir)
#
# Appends to $code the private routine _aesni_<dir>rypt3, which processes
# $inout0..$inout2 with three-way interleaved aes<dir> instructions.
# dir is presumably "enc" or "dec", going by the _aesni_[en|de]cryptN
# naming. The emitted code takes in $key and $rounds, which are *not*
# preserved; $inout[0-2] is cipher/clear text...
sub aesni_generate3 {
my ($dir)=@_;
my $name="_aesni_${dir}rypt3";
my $loop=".L${dir}_loop3";
my $text;

# entry: halve the round counter (the loop below handles two rounds per
# iteration), apply the 0-round key and preload the next two round keys
$text=<<___;
.type	$name,\@abi-omnipotent
.align	16
$name:
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey		($key),$rndkey0

___

# main loop: two rounds per pass, alternating $rndkey1 and $rndkey0
$text.=<<___;
$loop:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	dec		$rounds
	aes${dir}	$rndkey1,$inout2
	$movkey		16($key),$rndkey1
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	lea		32($key),$key
	aes${dir}	$rndkey0,$inout2
	$movkey		($key),$rndkey0
	jnz		$loop

___

# tail: one more regular round, then the final round
$text.=<<___;
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	$name,.-$name
___

$code.=$text;
}
319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# 4x interleave is implemented to improve small block performance,
320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# most notably [and naturally] 4 block by ~30%. One can argue that one
321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# should have implemented 5x as well, but improvement would be <20%,
322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# so it's not worth it...
323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromsub aesni_generate4 {
324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $dir=shift;
325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# As already mentioned it takes in $key and $rounds, which are *not*
326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# preserved. $inout[0-3] is cipher/clear text...
#
# Appends to $code an internal routine named _aesni_${dir}rypt4. $dir is
# the sole argument ("enc" or "dec" at the call sites in this file) and
# is spliced into every aes${dir}/aes${dir}last mnemonic and into the
# loop label, so one template serves both directions.
#
# Shape of the emitted code:
#  - The key at ($key) is xor-ed into all four blocks, then $key is
#    advanced 32 bytes at a time while round keys are loaded alternately
#    into $rndkey1 (offset 16) and $rndkey0 (offset 0 after the lea).
#  - $rounds is halved up front (shr by 1); each pass of .L${dir}_loop4
#    applies two round keys ($rndkey1, then $rndkey0) to every block.
#  - The loop's jnz consumes the flags produced by "dec $rounds" several
#    instructions earlier. This presumably relies on the instructions in
#    between leaving the flags alone -- $movkey is defined outside this
#    section, so confirm before reordering anything inside the loop.
#  - After the loop one more aes${dir} round with $rndkey1 and a closing
#    aes${dir}last with $rndkey0 leave the results in $inout0-3.
327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	_aesni_${dir}rypt4,\@abi-omnipotent
329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom_aesni_${dir}rypt4:
331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key),$rndkey0
332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$1,$rounds
333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	16($key),$rndkey1
334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($key),$key
335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout0
336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout1
337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout2
338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout3
339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key),$rndkey0
340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L${dir}_loop4:
342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout0
343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout1
344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout2
346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout3
347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout0
349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout1
350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key),$key
351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout2
352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout3
353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz		.L${dir}_loop4
355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout0
357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout1
358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout2
359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout3
360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout0
361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout1
362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout2
363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout3
364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromsub aesni_generate6 {
369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $dir=shift;
370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# As already mentioned it takes in $key and $rounds, which are *not*
371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# preserved. $inout[0-5] is cipher/clear text...
#
# Appends to $code an internal routine named _aesni_${dir}rypt6 that
# processes six blocks ($inout0-5) at once. $dir is the sole argument
# ("enc" or "dec" at the call sites in this file) and is spliced into
# the aes${dir}/aes${dir}last mnemonics and the local labels.
#
# Shape of the emitted code:
#  - $rounds is halved (shr by 1); each full pass of .L${dir}_loop6
#    applies two round keys, $rndkey1 and then $rndkey0, to all six
#    blocks while $key advances by 32 bytes.
#  - The prologue interleaves the initial xor of the key at ($key) into
#    each block with the first aes${dir} round on $rndkey1, and already
#    performs the first "dec $rounds" and the reload of $rndkey0.
#  - Because that first $rndkey1 half is done in the prologue, control
#    jumps to .L${dir}_loop6_enter, which sits in the middle of the loop
#    body just before the $rndkey1 reload and the $rndkey0 half.
#  - The loop's jnz consumes the flags of the earlier "dec $rounds",
#    presumably relying on the instructions in between leaving the flags
#    alone ($movkey is defined outside this section -- confirm before
#    reordering).
#  - The tail applies $rndkey1 once more and finishes with aes${dir}last
#    on $rndkey0; results are left in $inout0-5.
372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	_aesni_${dir}rypt6,\@abi-omnipotent
374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom_aesni_${dir}rypt6:
376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr		\$1,$rounds
378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key),$key
380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps		$rndkey0,$inout0
381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout1
382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout0
383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout2
384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout1
385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout3
386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout2
387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout4
388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout3
389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout5
390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout4
392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout5
394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp		.L${dir}_loop6_enter
395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L${dir}_loop6:
397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout0
398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout1
399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout2
401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout3
402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout4
403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout5
404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L${dir}_loop6_enter:				# happens to be 16-byte aligned
405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout0
407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout1
408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key),$key
409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout2
410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout3
411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout4
412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout5
413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz		.L${dir}_loop6
415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout0
417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout1
418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout2
419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout3
420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout4
421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout5
422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout0
423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout1
424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout2
425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout3
426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout4
427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout5
428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromsub aesni_generate8 {
433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $dir=shift;
434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# As already mentioned it takes in $key and $rounds, which are *not*
435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# preserved. $inout[0-7] is cipher/clear text...
#
# Appends to $code an internal routine named _aesni_${dir}rypt8 that
# processes eight blocks ($inout0-7) at once. $dir is the sole argument
# ("enc" or "dec" at the call sites in this file) and is spliced into
# the aes${dir}/aes${dir}last mnemonics and the local labels.
#
# Shape of the emitted code:
#  - $rounds is halved (shr by 1); each full pass of .L${dir}_loop8
#    applies two round keys, $rndkey1 and then $rndkey0, to all eight
#    blocks while $key advances by 32 bytes.
#  - The prologue interleaves the initial xor of the key at ($key) into
#    each block with the first aes${dir} round on $rndkey1, performs the
#    first "dec $rounds", and reloads both $rndkey0 and $rndkey1.
#  - Here the in-loop reload of $rndkey1 is the last instruction of the
#    first half, i.e. it sits *before* .L${dir}_loop8_enter. That is why
#    the prologue loads 16($key) into $rndkey1 itself right before the
#    jmp: entering at the label would otherwise skip that load.
#  - The loop's jnz consumes the flags of the earlier "dec $rounds",
#    presumably relying on the instructions in between leaving the flags
#    alone ($movkey is defined outside this section -- confirm before
#    reordering).
#  - The tail applies $rndkey1 once more and finishes with aes${dir}last
#    on $rndkey0; results are left in $inout0-7.
436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	_aesni_${dir}rypt8,\@abi-omnipotent
438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom_aesni_${dir}rypt8:
440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr		\$1,$rounds
442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key),$key
444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps		$rndkey0,$inout0
445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps		$rndkey0,$inout1
446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout0
447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout2
448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout1
449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout3
450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout2
451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout4
452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout3
453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout5
454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout4
456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout6
457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout5
458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout7
459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout6
461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout7
462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp		.L${dir}_loop8_enter
464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L${dir}_loop8:
466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout0
467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout1
468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout2
470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout3
471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout4
472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout5
473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout6
474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout7
475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L${dir}_loop8_enter:				# happens to be 16-byte aligned
477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout0
478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout1
479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key),$key
480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout2
481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout3
482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout4
483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout5
484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout6
485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey0,$inout7
486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz		.L${dir}_loop8
488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout0
490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout1
491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout2
492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout3
493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout4
494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout5
495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout6
496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}	$rndkey1,$inout7
497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout0
498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout1
499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout2
500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout3
501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout4
502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout5
503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout6
504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aes${dir}last	$rndkey0,$inout7
505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
# Emit the 3x/4x/6x/8x interleaved helper routines into $code, in this
# order. The "dec" variants are generated unconditionally; the "enc"
# variants only when $PREFIX is "aesni". The _aesni_encrypt3/4/6/8
# helpers are called from aesni_ecb_encrypt, which is itself emitted only
# inside the $PREFIX eq "aesni" section that follows.
509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom&aesni_generate3("enc") if ($PREFIX eq "aesni");
510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom&aesni_generate3("dec");
511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom&aesni_generate4("enc") if ($PREFIX eq "aesni");
512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom&aesni_generate4("dec");
513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom&aesni_generate6("enc") if ($PREFIX eq "aesni");
514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom&aesni_generate6("dec");
515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom&aesni_generate8("enc") if ($PREFIX eq "aesni");
516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom&aesni_generate8("dec");
517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromif ($PREFIX eq "aesni") {
519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom########################################################################
520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# void aesni_ecb_encrypt (const void *in, void *out,
521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#			  size_t length, const AES_KEY *key,
522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#			  int enc);
523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	aesni_ecb_encrypt
525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	aesni_ecb_encrypt,\@function,5
526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromaesni_ecb_encrypt:
528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$-16,$len
529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lecb_ret
530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key),$rounds	# key->rounds
532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key),$rndkey0
533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key,$key_		# backup $key
534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rounds,$rnds_		# backup $rounds
535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	test	%r8d,%r8d		# 5th argument
536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lecb_decrypt
537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#--------------------------- ECB ENCRYPT ------------------------------#
538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x80,$len
539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_enc_tail
540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($inp),$inout0
542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp),$inout1
543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp),$inout2
544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp),$inout3
545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp),$inout4
546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp),$inout5
547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp),$inout6
548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp),$inout7
549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp),$inp
550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80,$len
551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_enc_loop8_enter
552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16
553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_loop8:
554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key		# restore $key
556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($inp),$inout0
557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds		# restore $rounds
558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp),$inout1
560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp),$inout2
562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp),$inout3
564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp),$inout4
566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp),$inout5
568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout6,0x60($out)
569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp),$inout6
570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout7,0x70($out)
571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out),$out
572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp),$inout7
573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp),$inp
574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_loop8_enter:
575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt8
577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80,$len
579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lecb_enc_loop8
580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key		# restore $key
583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds		# restore $rounds
585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout6,0x60($out)
590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout7,0x70($out)
591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out),$out
592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$0x80,$len
593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lecb_ret
594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_tail:
596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x20,$len
598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_enc_one
599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x10($inp),$inout1
600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lecb_enc_two
601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x20($inp),$inout2
602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x40,$len
603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_enc_three
604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x30($inp),$inout3
605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lecb_enc_four
606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x40($inp),$inout4
607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x60,$len
608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_enc_five
609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x50($inp),$inout5
610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lecb_enc_six
611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp),$inout6
612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt8
613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout6,0x60($out)
620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_one:
623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("enc",$key,$rounds);
625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_two:
630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout2,$inout2
631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt3
632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_three:
637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt3
638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_four:
644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt4
645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_five:
652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout5,$inout5
653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt6
654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_enc_six:
662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt6
663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
664392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
666392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#--------------------------- ECB DECRYPT ------------------------------#
671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_decrypt:
673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x80,$len
674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_dec_tail
675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
676392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($inp),$inout0
677392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp),$inout1
678392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp),$inout2
679392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp),$inout3
680392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp),$inout4
681392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp),$inout5
682392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp),$inout6
683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp),$inout7
684392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp),$inp
685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80,$len
686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_dec_loop8_enter
687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16
688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_loop8:
689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key		# restore $key
691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($inp),$inout0
692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds		# restore $rounds
693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x10($inp),$inout1
695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp),$inout2
697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp),$inout3
699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp),$inout4
701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp),$inout5
703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout6,0x60($out)
704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp),$inout6
705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout7,0x70($out)
706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out),$out
707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp),$inout7
708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp),$inp
709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_loop8_enter:
710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt8
712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key_),$rndkey0
714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80,$len
715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lecb_dec_loop8
716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key		# restore $key
719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds		# restore $rounds
721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout6,0x60($out)
726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout7,0x70($out)
727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($out),$out
728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$0x80,$len
729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lecb_ret
730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_tail:
732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x20,$len
734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_dec_one
735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x10($inp),$inout1
736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lecb_dec_two
737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x20($inp),$inout2
738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x40,$len
739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_dec_three
740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x30($inp),$inout3
741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lecb_dec_four
742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x40($inp),$inout4
743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x60,$len
744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lecb_dec_five
745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x50($inp),$inout5
746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lecb_dec_six
747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x60($inp),$inout6
748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key),$rndkey0
749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt8
750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout6,0x60($out)
757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_one:
760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("dec",$key,$rounds);
762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_two:
767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout2,$inout2
768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt3
769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_three:
774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt3
775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_four:
781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt4
782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_five:
789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout5,$inout5
790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt6
791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lecb_ret
797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_dec_six:
799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt6
800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lecb_ret:
808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom######################################################################
814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#                         size_t blocks, const AES_KEY *key,
816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#                         const char *ivec,char *cmac);
817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Handles only complete blocks, operates on 64-bit counter and
819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# does not update *ivec! Nor does it finalize CMAC value
820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# (see engine/eng_aesni.c for details)
821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
# Registers private to the two CCM64 routines emitted below.
# $cmac carries the pointer through which the running CMAC block is read and
# written back; $increment and $bswap_mask hold the constants that both
# routines load from .Lincrement64 and .Lbswap_mask.
823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $cmac="%r9";	# 6th argument
824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $increment="%xmm6";
826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $bswap_mask="%xmm7";
827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Emit aesni_ccm64_encrypt_blocks.  For every input block the generated code
# encrypts the current counter block and the running CMAC side by side,
# XORs the encrypted counter into the input to form the output, and after
# the last block writes the updated CMAC back through $cmac.
# NOTE(review): nothing here tests for a zero block count before the first
# "dec $len"; callers presumably always pass at least one block -- confirm.
828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	aesni_ccm64_encrypt_blocks
830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	aesni_ccm64_encrypt_blocks,\@function,6
831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromaesni_ccm64_encrypt_blocks:
833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64-only prologue: open 0x58 bytes of stack and spill %xmm6-%xmm9 there
# (presumably because that ABI treats them as callee-saved and the body
# overwrites them -- only the %xmm6/%xmm7 assignments are visible nearby).
834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0x58(%rsp),%rsp
836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6,(%rsp)
837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7,0x10(%rsp)
838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8,0x20(%rsp)
839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9,0x30(%rsp)
840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lccm64_enc_body:
841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Setup: fetch key->rounds (offset 240) and halve it, because the round loop
# below consumes two round keys per pass; load the IV, the running CMAC and
# the two constants; keep untouched copies of the key pointer and the halved
# round count in $key_/$rnds_; seed $inout0 with the IV exactly as loaded,
# then byte-swap $iv (the loop advances it with paddq and swaps a copy back
# for each subsequent block).
#
# .Lccm64_enc_outer runs once per block: round key 0 is XORed into the
# counter, input^round-key-0 is folded into the CMAC, and both values then go
# through the rounds together.  .Lccm64_enc2_loop applies two round keys per
# pass; its jnz consumes the flags left by "dec $rounds", which the
# instructions in between do not modify.  After the final round the output is
# input ^ E(counter), the next counter is prepared from the incremented $iv,
# and the closing jnz acts on the flags from "dec $len" in the same way.
842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key),$rounds		# key->rounds
844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($ivp),$iv
845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lincrement64(%rip),$increment
846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lbswap_mask(%rip),$bswap_mask
847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$1,$rounds
849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0($key),$key_
850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($cmac),$inout1
851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	$iv,$inout0
852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rounds,$rnds_
853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufb	$bswap_mask,$iv
854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lccm64_enc_outer
855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lccm64_enc_outer:
857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key_),$rndkey0
858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds
859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$in0			# load inp
860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
861392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout0		# counter
862392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	16($key_),$rndkey1
863392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in0,$rndkey0
864392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($key_),$key
865392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout1		# cmac^=inp
866392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key),$rndkey0
867392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
868392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lccm64_enc2_loop:
869392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey1,$inout0
870392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$rounds
871392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey1,$inout1
872392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	16($key),$rndkey1
873392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey0,$inout0
874392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($key),$key
875392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey0,$inout1
876392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	0($key),$rndkey0
877392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lccm64_enc2_loop
878392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey1,$inout0
879392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey1,$inout1
880392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	$increment,$iv
881392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout0
882392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout1
883392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
884392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$len
885392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($inp),$inp
886392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout0,$in0			# inp ^= E(iv)
887392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	$iv,$inout0
888392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in0,($out)			# save output
889392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($out),$out
890392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufb	$bswap_mask,$inout0
891392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lccm64_enc_outer
892392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
893392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,($cmac)
894392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64-only epilogue: reload %xmm6-%xmm9 from the slots written by the
# prologue and release the 0x58 bytes of stack.
895392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
896392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	(%rsp),%xmm6
897392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x10(%rsp),%xmm7
898392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x20(%rsp),%xmm8
899392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x30(%rsp),%xmm9
900392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x58(%rsp),%rsp
901392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lccm64_enc_ret:
902392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Exit shared by all targets, plus the symbol size directive.
903392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
904392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
905392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
906392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
907392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom######################################################################
# Emit aesni_ccm64_decrypt_blocks.  Here the CMAC is accumulated over the
# output blocks (see the "cmac^=out" step), so the schedule is skewed by one
# block compared with the encrypt routine: the first counter block is
# encrypted on its own, every later pass encrypts the next counter together
# with the CMAC update for the block just written, and the CMAC update for
# the last block happens after the loop at .Lccm64_dec_break.
# NOTE(review): as in the encrypt routine, no zero block count check is
# visible before "sub $1,$len"; confirm callers never pass 0.
908392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
909392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	aesni_ccm64_decrypt_blocks
910392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	aesni_ccm64_decrypt_blocks,\@function,6
911392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
912392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromaesni_ccm64_decrypt_blocks:
913392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64-only prologue: open 0x58 bytes of stack and spill %xmm6-%xmm9 there
# (presumably because that ABI treats them as callee-saved -- confirm).
914392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
915392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0x58(%rsp),%rsp
916392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6,(%rsp)
917392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7,0x10(%rsp)
918392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8,0x20(%rsp)
919392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9,0x30(%rsp)
920392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lccm64_dec_body:
921392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Setup: load key->rounds (offset 240), the IV, the running CMAC and the two
# constants; seed $inout0 with the IV as loaded; back up the key pointer and
# the round count; byte-swap $iv for later paddq stepping.  Unlike the
# encrypt routine, $rnds_ receives the round count before any halving; the
# main loop halves $rounds itself after restoring it on each pass.
922392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
923392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key),$rounds		# key->rounds
924392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($ivp),$iv
925392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($cmac),$inout1
926392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lincrement64(%rip),$increment
927392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lbswap_mask(%rip),$bswap_mask
928392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
929392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$iv,$inout0
930392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rounds,$rnds_
931392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key,$key_
932392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufb	$bswap_mask,$iv
933392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# aesni_generate1 is defined outside this excerpt; presumably it emits a
# single-block encryption of $inout0 (the first counter block) that advances
# $key and counts $rounds down -- confirm against its definition.
934392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("enc",$key,$rounds);
# First input block is loaded and $iv stepped before entering the loop.
# .Lccm64_dec_outer: XOR the encrypted counter into the input, store the
# result, prepare the next counter from $iv, then leave via
# .Lccm64_dec_break once "sub $1,$len" reaches zero.  Otherwise round key 0
# is XORed into both the fresh counter and the block just produced, that
# block is folded into the CMAC, and .Lccm64_dec2_loop runs counter and CMAC
# through the rounds two round keys per pass (its jnz uses the flags from
# "dec $rounds").  The next input block is fetched while the final rounds
# are issued, and control returns to .Lccm64_dec_outer.
935392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
936392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$in0			# load inp
937392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	$increment,$iv
938392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($inp),$inp
939392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lccm64_dec_outer
940392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
941392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lccm64_dec_outer:
942392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout0,$in0			# inp ^= E(iv)
943392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	$iv,$inout0
944392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds
945392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in0,($out)			# save output
946392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($out),$out
947392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufb	$bswap_mask,$inout0
948392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
949392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$1,$len
950392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lccm64_dec_break
951392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
952392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key_),$rndkey0
953392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$1,$rounds
954392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	16($key_),$rndkey1
955392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$in0
956392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($key_),$key
957392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout0
958392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in0,$inout1			# cmac^=out
959392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key),$rndkey0
960392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
961392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lccm64_dec2_loop:
962392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey1,$inout0
963392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec	$rounds
964392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey1,$inout1
965392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	16($key),$rndkey1
966392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey0,$inout0
967392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($key),$key
968392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey0,$inout1
969392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	0($key),$rndkey0
970392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lccm64_dec2_loop
971392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$in0			# load inp
972392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	$increment,$iv
973392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey1,$inout0
974392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc	$rndkey1,$inout1
975392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($inp),$inp
976392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout0
977392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout1
978392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lccm64_dec_outer
979392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
980392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
981392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lccm64_dec_break:
982392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#xorps	$in0,$inout1			# cmac^=out
983392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Final CMAC update for the last output block.  On arrival $rounds has just
# been restored from $rnds_ and $key_ still holds the original key pointer.
# Presumably the two extra arguments make aesni_generate1 encrypt $inout1
# with $in0 mixed in, which would replace the commented-out xorps above --
# confirm against the helper's definition.
984392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Write the finished CMAC block back through $cmac.
985392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
986392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,($cmac)
987392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64-only epilogue: reload %xmm6-%xmm9 from the slots written by the
# prologue and release the 0x58 bytes of stack.
988392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
989392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	(%rsp),%xmm6
990392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x10(%rsp),%xmm7
991392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x20(%rsp),%xmm8
992392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x30(%rsp),%xmm9
993392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x58(%rsp),%rsp
994392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lccm64_dec_ret:
995392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Exit shared by all targets, plus the symbol size directive.
996392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
997392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
998392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
999392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1000392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
1001392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom######################################################################
1002392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1003392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#                         size_t blocks, const AES_KEY *key,
1004392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#                         const char *ivec);
1005392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
1006392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Handles only complete blocks, operates on 32-bit counter and
1007392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# does not update *ivec! (see engine/eng_aesni.c for details)
1008392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
1009392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
1010392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $reserved = $win64?0:-0x28;
1011392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
1012392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
1013392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $bswap_mask="%xmm15";
1014392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1015392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1016392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	aesni_ctr32_encrypt_blocks
1017392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	aesni_ctr32_encrypt_blocks,\@function,5
1018392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1019392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromaesni_ctr32_encrypt_blocks:
1020392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1021392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1022392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0xc8(%rsp),%rsp
1023392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6,0x20(%rsp)
1024392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7,0x30(%rsp)
1025392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8,0x40(%rsp)
1026392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9,0x50(%rsp)
1027392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm10,0x60(%rsp)
1028392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm11,0x70(%rsp)
1029392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm12,0x80(%rsp)
1030392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm13,0x90(%rsp)
1031392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm14,0xa0(%rsp)
1032392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm15,0xb0(%rsp)
1033392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_body:
1034392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1035392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1036392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$1,$len
1037392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lctr32_one_shortcut
1038392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1039392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	($ivp),$ivec
1040392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lbswap_mask(%rip),$bswap_mask
1041392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$rounds,$rounds
1042392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pextrd	\$3,$ivec,$rnds_		# pull 32-bit counter
1043392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pinsrd	\$3,$rounds,$ivec		# wipe 32-bit counter
1044392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1045392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key),$rounds		# key->rounds
1046392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	bswap	$rnds_
1047392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$iv0,$iv0			# vector of 3 32-bit counters
1048392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$iv1,$iv1			# vector of 3 32-bit counters
1049392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pinsrd	\$0,$rnds_,$iv0
1050392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	3($rnds_),$key_
1051392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pinsrd	\$0,$key_,$iv1
1052392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	inc	$rnds_
1053392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pinsrd	\$1,$rnds_,$iv0
1054392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	inc	$key_
1055392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pinsrd	\$1,$key_,$iv1
1056392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	inc	$rnds_
1057392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pinsrd	\$2,$rnds_,$iv0
1058392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	inc	$key_
1059392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pinsrd	\$2,$key_,$iv1
1060392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	$iv0,$reserved(%rsp)
1061392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufb	$bswap_mask,$iv0
1062392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	$iv1,`$reserved+0x10`(%rsp)
1063392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufb	$bswap_mask,$iv1
1064392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1065392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$`3<<6`,$iv0,$inout0		# place counter to upper dword
1066392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$`2<<6`,$iv0,$inout1
1067392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$`1<<6`,$iv0,$inout2
1068392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$6,$len
1069392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lctr32_tail
1070392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$1,$rounds
1071392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key,$key_			# backup $key
1072392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rounds,$rnds_			# backup $rounds
1073392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$6,$len
1074392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lctr32_loop6
1075392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1076392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1077392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_loop6:
1078392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$`3<<6`,$iv1,$inout3
1079392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout0			# merge counter-less ivec
1080392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 $movkey	($key_),$rndkey0
1081392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$`2<<6`,$iv1,$inout4
1082392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout1
1083392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 $movkey	16($key_),$rndkey1
1084392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$`1<<6`,$iv1,$inout5
1085392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout2
1086392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout3
1087392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 xorps		$rndkey0,$inout0
1088392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout4
1089392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout5
1090392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1091392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# inline _aesni_encrypt6 and interleave last rounds
1092392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# with own code...
1093392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1094392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout1
1095392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout0
1096392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key_),$key
1097392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout2
1098392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout1
1099392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa		.Lincrement32(%rip),$iv1
1100392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout3
1101392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout2
1102392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa		$reserved(%rsp),$iv0
1103392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout4
1104392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout3
1105392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout5
1106392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
1107392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
1108392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout4
1109392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout5
1110392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp		.Lctr32_enc_loop6_enter
1111392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1112392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_enc_loop6:
1113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout0
1114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout1
1115392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
1116392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout2
1117392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout3
1118392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout4
1119392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout5
1120392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_enc_loop6_enter:
1121392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
1122392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout0
1123392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout1
1124392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key),$key
1125392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout2
1126392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout3
1127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout4
1128392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout5
1129392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
1130392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz		.Lctr32_enc_loop6
1131392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1132392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout0
1133392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 paddd		$iv1,$iv0		# increment counter vector
1134392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout1
1135392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 paddd		`$reserved+0x10`(%rsp),$iv1
1136392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout2
1137392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa		$iv0,$reserved(%rsp)	# save counter vector
1138392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout3
1139392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa		$iv1,`$reserved+0x10`(%rsp)
1140392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout4
1141392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb		$bswap_mask,$iv0	# byte swap
1142392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout5
1143392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufb		$bswap_mask,$iv1
1144392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1145392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout0
1146392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movups		($inp),$in0		# load input
1147392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout1
1148392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movups		0x10($inp),$in1
1149392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout2
1150392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movups		0x20($inp),$in2
1151392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout3
1152392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movups		0x30($inp),$in3
1153392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout4
1154392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movups		0x40($inp),$rndkey1
1155392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenclast	$rndkey0,$inout5
1156392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movups		0x50($inp),$rndkey0
1157392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 lea	0x60($inp),$inp
1158392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1159392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout0,$in0			# xor
1160392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufd	\$`3<<6`,$iv0,$inout0
1161392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout1,$in1
1162392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufd	\$`2<<6`,$iv0,$inout1
1163392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in0,($out)			# store output
1164392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout2,$in2
1165392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufd	\$`1<<6`,$iv0,$inout2
1166392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in1,0x10($out)
1167392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout3,$in3
1168392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in2,0x20($out)
1169392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout4,$rndkey1
1170392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in3,0x30($out)
1171392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout5,$rndkey0
1172392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$rndkey1,0x40($out)
1173392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$rndkey0,0x50($out)
1174392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x60($out),$out
1175392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds
1176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$6,$len
1177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lctr32_loop6
1178392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$6,$len
1180392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lctr32_done
1181392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key			# restore $key
1182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($rounds,$rounds),$rounds	# restore original value
1183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_tail:
1185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout0
1186392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$in0
1187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$2,$len
1188392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lctr32_one
1189392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout1
1191392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x10($inp),$in1
1192392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lctr32_two
1193392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1194392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$`3<<6`,$iv1,$inout3
1195392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout2
1196392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x20($inp),$in2
1197392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$4,$len
1198392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lctr32_three
1199392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1200392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$`2<<6`,$iv1,$inout4
1201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout3
1202392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x30($inp),$in3
1203392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lctr32_four
1204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1205392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	por	$ivec,$inout4
1206392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout5,$inout5
1207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt6
1209392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1210392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x40($inp),$rndkey1
1211392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout0,$in0
1212392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout1,$in1
1213392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in0,($out)
1214392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout2,$in2
1215392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in1,0x10($out)
1216392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout3,$in3
1217392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in2,0x20($out)
1218392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout4,$rndkey1
1219392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in3,0x30($out)
1220392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$rndkey1,0x40($out)
1221392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lctr32_done
1222392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1223392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1224392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_one_shortcut:
1225392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($ivp),$inout0
1226392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$in0
1227392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key),$rounds		# key->rounds
1228392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_one:
1229392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1230392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("enc",$key,$rounds);
1231392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1232392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout0,$in0
1233392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in0,($out)
1234392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lctr32_done
1235392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1236392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1237392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_two:
1238392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout2,$inout2
1239392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt3
1240392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout0,$in0
1241392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout1,$in1
1242392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in0,($out)
1243392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in1,0x10($out)
1244392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lctr32_done
1245392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1246392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1247392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_three:
1248392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt3
1249392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout0,$in0
1250392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout1,$in1
1251392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in0,($out)
1252392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout2,$in2
1253392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in1,0x10($out)
1254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in2,0x20($out)
1255392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lctr32_done
1256392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1257392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1258392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_four:
1259392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt4
1260392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout0,$in0
1261392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout1,$in1
1262392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in0,($out)
1263392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout2,$in2
1264392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in1,0x10($out)
1265392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout3,$in3
1266392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in2,0x20($out)
1267392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$in3,0x30($out)
1268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_done:
1270392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x20(%rsp),%xmm6
1273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x30(%rsp),%xmm7
1274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x40(%rsp),%xmm8
1275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x50(%rsp),%xmm9
1276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x60(%rsp),%xmm10
1277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x70(%rsp),%xmm11
1278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x80(%rsp),%xmm12
1279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x90(%rsp),%xmm13
1280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xa0(%rsp),%xmm14
1281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xb0(%rsp),%xmm15
1282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0xc8(%rsp),%rsp
1283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lctr32_ret:
1284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
1287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
1290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom######################################################################
1292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	const AES_KEY *key1, const AES_KEY *key2,
1294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	const unsigned char iv[16]);
1295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
1296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
1297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy @tweak=map("%xmm$_",(10..15));
1298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1300392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $frame_size = 0x68 + ($win64?160:0);
1301392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1302392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	aesni_xts_encrypt
1304392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	aesni_xts_encrypt,\@function,6
1305392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1306392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromaesni_xts_encrypt:
1307392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-$frame_size(%rsp),%rsp
1308392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1309392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1310392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6,0x60(%rsp)
1311392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7,0x70(%rsp)
1312392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8,0x80(%rsp)
1313392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9,0x90(%rsp)
1314392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm10,0xa0(%rsp)
1315392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm11,0xb0(%rsp)
1316392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm12,0xc0(%rsp)
1317392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm13,0xd0(%rsp)
1318392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm14,0xe0(%rsp)
1319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm15,0xf0(%rsp)
1320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_body:
1321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($ivp),@tweak[5]		# load clear-text tweak
1324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240(%r8),$rounds		# key2->rounds
1325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key),$rnds_		# key1->rounds
1326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# generate the tweak
1328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key,$key_			# backup $key
1331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds			# backup $rounds
1332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$len,$len_			# backup $len
1333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$-16,$len
1334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lxts_magic(%rip),$twmask
1336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
1338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    for ($i=0;$i<4;$i++) {
1340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___;
1341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[$i]
1344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
1347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    }
1350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$16*6,$len
1352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jc	.Lxts_enc_short
1353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$1,$rounds
1355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$1,$rounds
1356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rounds,$rnds_
1357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_grandloop
1358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_grandloop:
1361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[4]
1363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
1364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*0`($inp),$inout0		# load input
1365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*1`($inp),$inout1
1367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*2`($inp),$inout2
1370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[0],$inout0		# input^=tweak
1371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*3`($inp),$inout3
1372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[1],$inout1
1373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*4`($inp),$inout4
1374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[2],$inout2
1375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*5`($inp),$inout5
1376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	`16*6`($inp),$inp
1377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[3],$inout3
1378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key_),$rndkey0
1379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[4],$inout4
1380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[5],$inout5
1381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# inline _aesni_encrypt6 and interleave first and last rounds
1383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# with own code...
1384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key_),$rndkey1
1385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout0
1386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout1
1387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
1388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout0
1389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key_),$key
1390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout2
1391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[1],`16*1`(%rsp)
1392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout1
1393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout3
1394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[2],`16*2`(%rsp)
1395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout2
1396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout4
1397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[3],`16*3`(%rsp)
1398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout3
1399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout5
1400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
1401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
1402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[4],`16*4`(%rsp)
1403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout4
1404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[5],`16*5`(%rsp)
1405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout5
1406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp
1408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp		.Lxts_enc_loop6_enter
1409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_loop6:
1412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout0
1413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout1
1414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
1415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout2
1416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout3
1417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout4
1418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey1,$inout5
1419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_loop6_enter:
1420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
1421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout0
1422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout1
1423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key),$key
1424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout2
1425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout3
1426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout4
1427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesenc		$rndkey0,$inout5
1428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
1429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz		.Lxts_enc_loop6
1430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout0
1435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout1
1437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
1438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout2
1439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout3
1441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout4
1442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout5
1443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 $movkey	16($key),$rndkey1
1444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[0]
1448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey0,$inout0
1450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey0,$inout1
1452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
1453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey0,$inout2
1454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey0,$inout3
1456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey0,$inout4
1457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey0,$inout5
1458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 $movkey	32($key),$rndkey0
1459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[1]
1463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout0
1465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout1
1467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
1468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout2
1469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout3
1471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout4
1472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenc		$rndkey1,$inout5
1473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[2]
1477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenclast	$rndkey0,$inout0
1479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenclast	$rndkey0,$inout1
1481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
1482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenclast	$rndkey0,$inout2
1483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenclast	$rndkey0,$inout3
1485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenclast	$rndkey0,$inout4
1486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesenclast	$rndkey0,$inout5
1487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[3]
1491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
1493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 xorps	`16*1`(%rsp),$inout1
1495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
1496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	`16*2`(%rsp),$inout2
1499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,`16*0`($out)		# write output
1500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	`16*3`(%rsp),$inout3
1501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,`16*1`($out)
1502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	`16*4`(%rsp),$inout4
1503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,`16*2`($out)
1504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	`16*5`(%rsp),$inout5
1505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,`16*3`($out)
1506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds			# restore $rounds
1507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,`16*4`($out)
1508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,`16*5`($out)
1509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	`16*6`($out),$out
1510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$16*6,$len
1511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lxts_enc_grandloop
1512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	3($rounds,$rounds),$rounds	# restore original value
1514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key			# restore $key
1515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rounds,$rnds_			# backup $rounds
1516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_short:
	# Tail dispatch for fewer than six whole blocks. The add below undoes the
	# 16*6 bias that the grand loop's exit test subtracted from len; len is
	# presumably a multiple of 16 here (masking happens outside this chunk).
1518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$16*6,$len
1519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lxts_enc_done
1520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# len below 0x20 selects the one-block path, equal to 0x20 the two-block path
1521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x20,$len
1522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lxts_enc_one
1523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lxts_enc_two
1524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# len below 0x40 selects the three-block path, equal to 0x40 the four-block path
1525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x40,$len
1526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lxts_enc_three
1527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lxts_enc_four
1528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# Fall-through: five blocks (16*5 bytes are loaded and stored below).
	# The current tweak[5] is kept as tweak[4] and tweak[5] is stepped once
	# more; the carry mask twtmp is not refreshed on this path.
1529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[4]
1531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
1532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqu	($inp),$inout0
1533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqu	16*1($inp),$inout1
1535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# load the remaining blocks and xor each input with its tweak
1537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16*2($inp),$inout2
1538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[0],$inout0
1539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16*3($inp),$inout3
1540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[1],$inout1
1541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16*4($inp),$inout4
1542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*5($inp),$inp
1543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[2],$inout2
1544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[3],$inout3
1545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[4],$inout4
1546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# Six-lane helper defined outside this chunk; only inout0..inout4 are
	# loaded before the call and stored after it on this path.
1547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt6
1548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# xor the outputs with the same tweaks and store them; tweak[0] receives
	# the stepped tweak[5], which is what .Lxts_enc_done uses when stealing.
1549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[0]
1551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
1553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout0,($out)
1554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[3],$inout3
1555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout1,16*1($out)
1556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[4],$inout4
1557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout2,16*2($out)
1558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout3,16*3($out)
1559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout4,16*4($out)
1560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*5($out),$out
1561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
1562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_one:
	# One block left: xor the input with tweak[0], encrypt it, xor with
	# tweak[0] again and store the result.
1565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
1566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*1($inp),$inp
1567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
	# Code generator defined elsewhere in this file; presumably emits the
	# single-block encryption of $inout0 under ($key,$rounds) - confirm there.
1569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("enc",$key,$rounds);
1570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
	# tweak[0] takes over tweak[1], which .Lxts_enc_done uses when stealing
1572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[1],@tweak[0]
1573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
1574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*1($out),$out
1575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
1576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_two:
	# Two blocks left: xor the inputs with tweak[0] and tweak[1], encrypt,
	# xor with the same tweaks again and store.
1579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
1580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16($inp),$inout1
1581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($inp),$inp
1582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# Three-lane helper defined outside this chunk; inout2 is neither loaded
	# before the call nor stored after it on this path.
1585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt3
1586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
	# tweak[0] takes over tweak[2], which .Lxts_enc_done uses when stealing
1588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[2],@tweak[0]
1589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
1591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,16*1($out)
1592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*2($out),$out
1593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
1594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_three:
	# Three blocks left: xor the inputs with tweak[0..2], encrypt all three
	# lanes, xor with the same tweaks again and store.
1597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
1598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16*1($inp),$inout1
1599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16*2($inp),$inout2
1600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*3($inp),$inp
1601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
1604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# three-lane helper defined outside this chunk
1605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt3
1606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
	# tweak[0] takes over tweak[3], which .Lxts_enc_done uses when stealing
1608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[3],@tweak[0]
1609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
1611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
1612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,16*1($out)
1613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,16*2($out)
1614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*3($out),$out
1615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
1616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_four:
	# Four blocks left: xor the inputs with tweak[0..3], encrypt all four
	# lanes, xor with the same tweaks again and store.
1619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
1620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16*1($inp),$inout1
1621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16*2($inp),$inout2
1622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16*3($inp),$inout3
1624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*4($inp),$inp
1625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
1627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[3],$inout3
1628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# four-lane helper defined outside this chunk
1629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_encrypt4
1630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
	# tweak[0] takes over tweak[5] (the value stepped past tweak[3]), which
	# .Lxts_enc_done uses when stealing
1632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[0]
1633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
1635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
1636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[3],$inout3
1637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,16*1($out)
1638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,16*2($out)
1639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,16*3($out)
1640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*4($out),$out
1641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_enc_done
1642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_done:
	# All whole blocks are written. len_ is a backup of the byte count taken
	# outside this chunk (presumably the caller's length); its low four bits
	# give the size of a trailing partial block. Zero means nothing is left.
1645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$15,$len_
1646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lxts_enc_ret
1647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$len_,$len
1648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_steal:
	# Ciphertext stealing, one byte per pass: the byte at -16(out), which
	# lies in the last full block written, is copied to 0(out), and the next
	# input byte replaces it at -16(out). Runs len times.
1650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movzb	($inp),%eax			# borrow $rounds ...
1651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movzb	-16($out),%ecx			# ... and $key
1652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($inp),$inp
1653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%al,-16($out)
1654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%cl,0($out)
1655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($out),$out
1656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$1,$len
1657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lxts_enc_steal
1658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	$len_,$out			# rewind $out
1660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key			# restore $key
1661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds			# restore $rounds
1662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# re-encrypt the patched block at -16(out) in place, using tweak[0]
1663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	-16($out),$inout0
1664392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
	# Code generator defined elsewhere in this file; presumably emits the
	# single-block encryption of $inout0 under ($key,$rounds) - confirm there.
1666392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("enc",$key,$rounds);
1667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,-16($out)
1670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_ret:
1672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64 only: reload xmm6..xmm15 from frame offsets 0x60..0xf0 (these registers
# are nonvolatile in the Microsoft x64 calling convention).
1673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x60(%rsp),%xmm6
1675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x70(%rsp),%xmm7
1676392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x80(%rsp),%xmm8
1677392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x90(%rsp),%xmm9
1678392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xa0(%rsp),%xmm10
1679392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xb0(%rsp),%xmm11
1680392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xc0(%rsp),%xmm12
1681392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xd0(%rsp),%xmm13
1682392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xe0(%rsp),%xmm14
1683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xf0(%rsp),%xmm15
1684392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Release the local frame and return.
1685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	$frame_size(%rsp),%rsp
1687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_enc_epilogue:
1688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
1689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	aesni_xts_encrypt,.-aesni_xts_encrypt
1690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# aesni_xts_decrypt entry and setup (declared as a six-argument function by the
# .type directive). This part reserves $frame_size bytes of stack, spills
# xmm6..xmm15 on Win64, turns the clear-text tweak loaded from ($ivp) into the
# working tweak using $key2, and trims $len to whole blocks for the bulk path.
1692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	aesni_xts_decrypt
1694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	aesni_xts_decrypt,\@function,6
1695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromaesni_xts_decrypt:
1697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-$frame_size(%rsp),%rsp
1698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Win64 only: spill xmm6..xmm15 to frame offsets 0x60..0xf0. Note that the
# .Lxts_dec_body label is part of this conditional text, so it is emitted only
# in the Win64 build.
1699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
1700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6,0x60(%rsp)
1701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7,0x70(%rsp)
1702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8,0x80(%rsp)
1703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9,0x90(%rsp)
1704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm10,0xa0(%rsp)
1705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm11,0xb0(%rsp)
1706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm12,0xc0(%rsp)
1707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm13,0xd0(%rsp)
1708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm14,0xe0(%rsp)
1709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm15,0xf0(%rsp)
1710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_body:
1711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($ivp),@tweak[5]		# load clear-text tweak
1714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key2),$rounds		# key2->rounds
1715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key),$rnds_		# key1->rounds
1716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# generate the tweak
	# (the "enc" variant is requested, with $key2 and @tweak[5] as the data
	# register; the generator itself is defined elsewhere in this file)
1718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
	# When len is not a multiple of 16, one whole block is held back from
	# the bulk path (presumably for the ciphertext-stealing tail, which is
	# handled past the end of this chunk).
1720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	%eax,%eax			# if ($len%16) len-=16;
1721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	test	\$15,$len
1722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	setnz	%al
1723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$4,%rax
1724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	%rax,$len
1725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key,$key_			# backup $key
1727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds			# rounds = key1->rounds (rnds_ keeps the copy)
1728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$len,$len_			# backup $len
1729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$-16,$len
1730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
	# twmask comes from a constant defined outside this chunk; twtmp is
	# primed from the sign bits of the tweak's 32-bit lanes.
1731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	.Lxts_magic(%rip),$twmask
1732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
1734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    for ($i=0;$i<4;$i++) {
    # Emit the tweak step four times. Each copy saves the current @tweak[5]
    # into @tweak[$i], then steps @tweak[5]: both 64-bit lanes are doubled and
    # the masked carry/residue word is xored back in. $twtmp is refreshed at
    # the end of every copy so the next step can use it.
1736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    $code.=<<___;
1737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[$i]
1740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
1743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom    }
1746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$16*6,$len
1748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jc	.Lxts_dec_short
1749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$1,$rounds
1751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$1,$rounds
1752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rounds,$rnds_
1753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_grandloop
1754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_grandloop:
1757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[4]
1759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
1760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*0`($inp),$inout0		# load input
1761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*1`($inp),$inout1
1763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*2`($inp),$inout2
1766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[0],$inout0		# input^=tweak
1767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*3`($inp),$inout3
1768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[1],$inout1
1769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*4`($inp),$inout4
1770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[2],$inout2
1771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	`16*5`($inp),$inout5
1772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	`16*6`($inp),$inp
1773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[3],$inout3
1774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key_),$rndkey0
1775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[4],$inout4
1776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[5],$inout5
1777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# inline _aesni_decrypt6 and interleave first and last rounds
1779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	# with own code...
1780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key_),$rndkey1
1781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout0
1782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout1
1783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
1784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout0
1785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key_),$key
1786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout2
1787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[1],`16*1`(%rsp)
1788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout1
1789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout3
1790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[2],`16*2`(%rsp)
1791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout2
1792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout4
1793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[3],`16*3`(%rsp)
1794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout3
1795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout5
1796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
1797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
1798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[4],`16*4`(%rsp)
1799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout4
1800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqa	@tweak[5],`16*5`(%rsp)
1801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout5
1802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp
1804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp		.Lxts_dec_loop6_enter
1805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_loop6:
1808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout0
1809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout1
1810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
1811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout2
1812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout3
1813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout4
1814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout5
1815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_loop6_enter:
1816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
1817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey0,$inout0
1818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey0,$inout1
1819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key),$key
1820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey0,$inout2
1821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey0,$inout3
1822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey0,$inout4
1823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey0,$inout5
1824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
1825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz		.Lxts_dec_loop6
1826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout0
1831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout1
1833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
1834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout2
1835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout3
1837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout4
1838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout5
1839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 $movkey	16($key),$rndkey1
1840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[0]
1844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey0,$inout0
1846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey0,$inout1
1848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
1849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey0,$inout2
1850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey0,$inout3
1852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey0,$inout4
1853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey0,$inout5
1854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 $movkey	32($key),$rndkey0
1855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[1]
1859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout0
1861392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1862392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout1
1863392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
1864392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout2
1865392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1866392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout3
1867392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout4
1868392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdec		$rndkey1,$inout5
1869392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1870392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1871392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1872392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[2]
1873392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1874392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdeclast	$rndkey0,$inout0
1875392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1876392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdeclast	$rndkey0,$inout1
1877392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
1878392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdeclast	$rndkey0,$inout2
1879392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1880392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdeclast	$rndkey0,$inout3
1881392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdeclast	$rndkey0,$inout4
1882392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 aesdeclast	$rndkey0,$inout5
1883392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1884392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1885392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twtmp,$twtmp
1886392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[3]
1887392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
1888392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
1889392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1890392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 xorps	`16*1`(%rsp),$inout1
1891392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
1892392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1893392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1894392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	`16*2`(%rsp),$inout2
1895392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,`16*0`($out)		# write output
1896392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	`16*3`(%rsp),$inout3
1897392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,`16*1`($out)
1898392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	`16*4`(%rsp),$inout4
1899392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,`16*2`($out)
1900392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	`16*5`(%rsp),$inout5
1901392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,`16*3`($out)
1902392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds			# restore $rounds
1903392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,`16*4`($out)
1904392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,`16*5`($out)
1905392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	`16*6`($out),$out
1906392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$16*6,$len
1907392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lxts_dec_grandloop
1908392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1909392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	3($rounds,$rounds),$rounds	# restore original value
1910392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key			# restore $key
1911392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rounds,$rnds_			# backup $rounds
1912392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1913392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_short:
1914392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$16*6,$len
1915392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lxts_dec_done
1916392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1917392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x20,$len
1918392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lxts_dec_one
1919392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lxts_dec_two
1920392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1921392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x40,$len
1922392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lxts_dec_three
1923392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.Lxts_dec_four
1924392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1925392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
1926392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[4]
1927392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
1928392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqu	($inp),$inout0
1929392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
1930392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movdqu	16*1($inp),$inout1
1931392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
1932392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1933392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16*2($inp),$inout2
1934392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[0],$inout0
1935392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16*3($inp),$inout3
1936392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[1],$inout1
1937392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	16*4($inp),$inout4
1938392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*5($inp),$inp
1939392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[2],$inout2
1940392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[3],$inout3
1941392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[4],$inout4
1942392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1943392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt6
1944392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1945392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1946392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1947392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
1948392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout0,($out)
1949392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[3],$inout3
1950392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout1,16*1($out)
1951392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[4],$inout4
1952392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout2,16*2($out)
1953392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pxor		$twtmp,$twtmp
1954392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout3,16*3($out)
1955392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pcmpgtd	@tweak[5],$twtmp
1956392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	$inout4,16*4($out)
1957392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*5($out),$out
1958392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
1959392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$15,$len_
1960392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lxts_dec_ret
1961392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1962392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[0]
1963392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
1964392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,@tweak[1]		# isolate carry and residue
1965392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	@tweak[5],@tweak[1]
1966392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done2
1967392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1968392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1969392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_one:
1970392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
1971392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*1($inp),$inp
1972392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1973392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
1974392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("dec",$key,$rounds);
1975392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
1976392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1977392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[1],@tweak[0]
1978392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
1979392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[2],@tweak[1]
1980392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*1($out),$out
1981392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
1982392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1983392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
1984392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_two:
1985392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
1986392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16($inp),$inout1
1987392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32($inp),$inp
1988392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1989392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1990392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1991392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt3
1992392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
1993392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
1994392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[2],@tweak[0]
1995392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
1996392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[3],@tweak[1]
1997392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
1998392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,16*1($out)
1999392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*2($out),$out
2000392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
2001392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2002392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2003392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_three:
2004392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
2005392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16*1($inp),$inout1
2006392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16*2($inp),$inout2
2007392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*3($inp),$inp
2008392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
2009392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
2010392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
2011392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2012392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt3
2013392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2014392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
2015392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[3],@tweak[0]
2016392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
2017392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[1]
2018392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
2019392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2020392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,16*1($out)
2021392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,16*2($out)
2022392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*3($out),$out
2023392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
2024392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2025392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2026392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_four:
2027392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0x13,$twtmp,$twres
2028392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[4]
2029392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2030392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movups	($inp),$inout0
2031392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pand	$twmask,$twres			# isolate carry and residue
2032392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	 movups	16*1($inp),$inout1
2033392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	$twres,@tweak[5]
2034392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2035392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16*2($inp),$inout2
2036392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
2037392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16*3($inp),$inout3
2038392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*4($inp),$inp
2039392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
2040392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
2041392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[3],$inout3
2042392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2043392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt4
2044392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2045392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
2046392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[4],@tweak[0]
2047392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout1
2048392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	@tweak[5],@tweak[1]
2049392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[2],$inout2
2050392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2051392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[3],$inout3
2052392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,16*1($out)
2053392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,16*2($out)
2054392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,16*3($out)
2055392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16*4($out),$out
2056392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lxts_dec_done
2057392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2058392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2059392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_done:
2060392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$15,$len_
2061392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lxts_dec_ret
2062392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_done2:
2063392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$len_,$len
2064392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key			# restore $key
2065392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds			# restore $rounds
2066392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2067392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
2068392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout0
2069392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2070392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("dec",$key,$rounds);
2071392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2072392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[1],$inout0
2073392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2074392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2075392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_steal:
2076392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movzb	16($inp),%eax			# borrow $rounds ...
2077392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movzb	($out),%ecx			# ... and $key
2078392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($inp),$inp
2079392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%al,($out)
2080392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%cl,16($out)
2081392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($out),$out
2082392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$1,$len
2083392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lxts_dec_steal
2084392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2085392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	$len_,$out			# rewind $out
2086392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key			# restore $key
2087392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds			# restore $rounds
2088392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2089392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($out),$inout0
2090392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
2091392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2092392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("dec",$key,$rounds);
2093392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2094392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	@tweak[0],$inout0
2095392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2096392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2097392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_ret:
2098392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2099392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
2100392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x60(%rsp),%xmm6
2101392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x70(%rsp),%xmm7
2102392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x80(%rsp),%xmm8
2103392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x90(%rsp),%xmm9
2104392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xa0(%rsp),%xmm10
2105392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xb0(%rsp),%xmm11
2106392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xc0(%rsp),%xmm12
2107392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xd0(%rsp),%xmm13
2108392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xe0(%rsp),%xmm14
2109392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0xf0(%rsp),%xmm15
2110392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2111392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2112392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	$frame_size(%rsp),%rsp
2113392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_dec_epilogue:
2114392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2115392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	aesni_xts_decrypt,.-aesni_xts_decrypt
2116392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2117392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom} }}
2118392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2119392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom########################################################################
2120392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2121392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#			    size_t length, const AES_KEY *key,
2122392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#			    unsigned char *ivp,const int enc);
2123392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{
2124392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrommy $reserved = $win64?0x40:-0x18;	# used in decrypt
2125392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2126392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	${PREFIX}_cbc_encrypt
2127392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	${PREFIX}_cbc_encrypt,\@function,6
2128392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2129392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom${PREFIX}_cbc_encrypt:
2130392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	test	$len,$len		# check length
2131392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lcbc_ret
2132392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2133392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	240($key),$rnds_	# key->rounds
2134392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key,$key_		# backup $key
2135392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	test	%r9d,%r9d		# 6th argument
2136392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lcbc_decrypt
2137392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#--------------------------- CBC ENCRYPT ------------------------------#
2138392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($ivp),$inout0		# load iv as initial state
2139392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds
2140392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$16,$len
2141392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcbc_enc_tail
2142392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$16,$len
2143392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_enc_loop
2144392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2145392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_enc_loop:
2146392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout1		# load input
2147392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($inp),$inp
2148392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	#xorps	$inout1,$inout0
2149392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2150392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
2151392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2152392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds		# restore $rounds
2153392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key		# restore $key
2154392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,0($out)		# store output
2155392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($out),$out
2156392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$16,$len
2157392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnc	.Lcbc_enc_loop
2158392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$16,$len
2159392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lcbc_enc_tail
2160392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($ivp)
2161392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_ret
2162392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2163392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_enc_tail:
2164392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$len,%rcx	# zaps $key
2165392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
2166392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0x9066A4F3	# rep movsb
2167392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$16,%ecx	# zero tail
2168392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	$len,%rcx
2169392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	%eax,%eax
2170392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0x9066AAF3	# rep stosb
2171392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-16(%rdi),%rdi	# rewind $out by 1 block
2172392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds	# restore $rounds
2173392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdi,%rsi	# $inp and $out are the same
2174392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key	# restore $key
2175392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	$len,$len	# len=0, i.e. one 16-byte block left (loop counter is biased by -16)
2176392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_enc_loop	# one more spin
2177392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#--------------------------- CBC DECRYPT ------------------------------#
2178392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2179392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_decrypt:
2180392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2181392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
2182392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-0x58(%rsp),%rsp
2183392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm6,(%rsp)
2184392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm7,0x10(%rsp)
2185392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm8,0x20(%rsp)
2186392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm9,0x30(%rsp)
2187392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_decrypt_body:
2188392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2189392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2190392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($ivp),$iv
2191392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds
2192392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x70,$len
2193392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jbe	.Lcbc_dec_tail
2194392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shr	\$1,$rnds_
2195392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x70,$len
2196392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds
2197392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$iv,$reserved(%rsp)
2198392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_loop8_enter
2199392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2200392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_loop8:
2201392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$rndkey0,$reserved(%rsp)	# save IV
2202392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout7,($out)
2203392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($out),$out
2204392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_loop8_enter:
2205392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
2206392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0			# load input
2207392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x10($inp),$inout1
2208392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
2209392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2210392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea		32($key),$key
2211392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x20($inp),$inout2
2212392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps		$rndkey0,$inout0
2213392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x30($inp),$inout3
2214392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps		$rndkey0,$inout1
2215392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x40($inp),$inout4
2216392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout0
2217392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout2
2218392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x50($inp),$inout5
2219392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout1
2220392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout3
2221392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x60($inp),$inout6
2222392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout2
2223392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout4
2224392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqu	0x70($inp),$inout7
2225392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout3
2226392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout5
2227392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	dec		$rounds
2228392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout4
2229392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout6
2230392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout5
2231392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor		$rndkey0,$inout7
2232392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		($key),$rndkey0
2233392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout6
2234392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesdec		$rndkey1,$inout7
2235392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey		16($key),$rndkey1
2236392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2237392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Ldec_loop8_enter
2238392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2239392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$rndkey1		# re-load input
2240392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x10($inp),$rndkey0
2241392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$reserved(%rsp),$inout0	# ^= IV
2242392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout1
2243392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x20($inp),$rndkey1
2244392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout2
2245392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x30($inp),$rndkey0
2246392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout3
2247392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x40($inp),$rndkey1
2248392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout4
2249392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x50($inp),$rndkey0
2250392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout5
2251392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x60($inp),$rndkey1
2252392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout6
2253392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x70($inp),$rndkey0	# IV
2254392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout7
2255392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2256392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
2257392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
2258392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
2259392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$rnds_,$rounds		# restore $rounds
2260392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
2261392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$key_,$key		# restore $key
2262392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
2263392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x80($inp),$inp
2264392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout6,0x60($out)
2265392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x70($out),$out
2266392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x80,$len
2267392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ja	.Lcbc_dec_loop8
2268392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2269392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout7,$inout0
2270392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$rndkey0,$iv
2271392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$0x70,$len
2272392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jle	.Lcbc_dec_tail_collected
2273392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2274392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	1($rnds_,$rnds_),$rounds
2275392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($out),$out
2276392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_tail:
2277392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$inout0
2278392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout0,$in0
2279392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x10,$len
2280392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jbe	.Lcbc_dec_one
2281392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2282392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x10($inp),$inout1
2283392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout1,$in1
2284392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x20,$len
2285392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jbe	.Lcbc_dec_two
2286392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2287392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x20($inp),$inout2
2288392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout2,$in2
2289392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x30,$len
2290392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jbe	.Lcbc_dec_three
2291392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2292392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x30($inp),$inout3
2293392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x40,$len
2294392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jbe	.Lcbc_dec_four
2295392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2296392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x40($inp),$inout4
2297392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x50,$len
2298392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jbe	.Lcbc_dec_five
2299392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2300392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x50($inp),$inout5
2301392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$0x60,$len
2302392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jbe	.Lcbc_dec_six
2303392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2304392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x60($inp),$inout6
2305392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$iv,$reserved(%rsp)	# save IV
2306392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt8
2307392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),$rndkey1
2308392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x10($inp),$rndkey0
2309392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$reserved(%rsp),$inout0	# ^= IV
2310392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout1
2311392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x20($inp),$rndkey1
2312392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout2
2313392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x30($inp),$rndkey0
2314392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout3
2315392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x40($inp),$rndkey1
2316392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout4
2317392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x50($inp),$rndkey0
2318392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout5
2319392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x60($inp),$iv		# IV
2320392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout6
2321392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2322392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
2323392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
2324392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
2325392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
2326392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout5,0x50($out)
2327392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x60($out),$out
2328392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout6,$inout0
2329392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x70,$len
2330392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_tail_collected
2331392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2332392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_one:
2333392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2334392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	&aesni_generate1("dec",$key,$rounds);
2335392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2336392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$iv,$inout0
2337392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$in0,$iv
2338392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x10,$len
2339392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_tail_collected
2340392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2341392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_two:
2342392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout2,$inout2
2343392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt3
2344392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$iv,$inout0
2345392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in0,$inout1
2346392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2347392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$in1,$iv
2348392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout1,$inout0
2349392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x10($out),$out
2350392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x20,$len
2351392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_tail_collected
2352392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2353392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_three:
2354392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt3
2355392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$iv,$inout0
2356392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in0,$inout1
2357392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2358392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in1,$inout2
2359392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
2360392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$in2,$iv
2361392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout2,$inout0
2362392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20($out),$out
2363392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x30,$len
2364392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_tail_collected
2365392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2366392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_four:
2367392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt4
2368392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$iv,$inout0
2369392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x30($inp),$iv
2370392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in0,$inout1
2371392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2372392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in1,$inout2
2373392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
2374392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in2,$inout3
2375392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
2376392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout3,$inout0
2377392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x30($out),$out
2378392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x40,$len
2379392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_tail_collected
2380392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2381392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_five:
2382392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$inout5,$inout5
2383392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt6
2384392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x10($inp),$rndkey1
2385392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x20($inp),$rndkey0
2386392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$iv,$inout0
2387392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in0,$inout1
2388392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout2
2389392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x30($inp),$rndkey1
2390392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout3
2391392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x40($inp),$iv
2392392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout4
2393392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2394392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
2395392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
2396392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
2397392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x40($out),$out
2398392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout4,$inout0
2399392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x50,$len
2400392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_tail_collected
2401392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2402392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_six:
2403392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	_aesni_decrypt6
2404392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x10($inp),$rndkey1
2405392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x20($inp),$rndkey0
2406392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$iv,$inout0
2407392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$in0,$inout1
2408392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout2
2409392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x30($inp),$rndkey1
2410392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout3
2411392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x40($inp),$rndkey0
2412392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey1,$inout4
2413392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	0x50($inp),$iv
2414392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	$rndkey0,$inout5
2415392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2416392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout1,0x10($out)
2417392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout2,0x20($out)
2418392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout3,0x30($out)
2419392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout4,0x40($out)
2420392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x50($out),$out
2421392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout5,$inout0
2422392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$0x60,$len
2423392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_tail_collected
2424392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2425392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_tail_collected:
2426392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	and	\$15,$len
2427392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$iv,($ivp)
2428392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Lcbc_dec_tail_partial
2429392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	$inout0,($out)
2430392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcbc_dec_ret
2431392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2432392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_tail_partial:
2433392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	$inout0,$reserved(%rsp)
2434392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$16,%rcx
2435392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$out,%rdi
2436392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	$len,%rcx
2437392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	$reserved(%rsp),%rsi
2438392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0x9066A4F3	# rep movsb
2439392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2440392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_dec_ret:
2441392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2442392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($win64);
2443392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	(%rsp),%xmm6
2444392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x10(%rsp),%xmm7
2445392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x20(%rsp),%xmm8
2446392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	0x30(%rsp),%xmm9
2447392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x58(%rsp),%rsp
2448392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2449392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2450392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcbc_ret:
2451392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2452392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
2453392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2454392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
2455392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
2456392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#				int bits, AES_KEY *key)
2457392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom{ my ($inp,$bits,$key) = @_4args;
2458392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom  $bits =~ s/%r/%e/;
2459392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# ${PREFIX}_set_decrypt_key: emit the routine that builds a decryption key
# schedule in place.
#
# The generated code first calls __aesni_set_encrypt_key, then, if that
# call left %eax at zero, rewrites the schedule at $key as follows:
#   * the first and the last round keys are exchanged as they are;
#   * the round keys in between are walked pairwise from both ends, each
#     one is passed through aesimc and stored into the opposite slot,
#     until the two pointers meet;
#   * the round key left at ($key) when the loop exits is passed through
#     aesimc and written to ($inp).
#
# $bits holds rounds-1 once the call returns (see the inline comment on
# the shl), so shifting it left by 4 yields the byte offset that, added
# to $key plus 16, addresses the last round key.  $inp is reused as that
# end-of-schedule pointer after the call.
#
# Return value: a non-zero %eax from __aesni_set_encrypt_key skips the
# conversion and jumps straight to .Ldec_key_ret.  No instruction below
# names %eax as a destination, so the callee's status appears to be what
# the caller receives -- NOTE(review): this relies on $inp/$bits/$key
# (from @_4args, defined outside this chunk) not mapping to %rax; confirm.
#
# The leading .byte sequence encodes "sub rsp,8" (per its inline comment)
# and is balanced by the "add 8 to %rsp" at .Ldec_key_ret.
# NOTE(review): $movkey is defined outside this chunk -- presumably the
# 128-bit move mnemonic used for key-schedule accesses; confirm there.
2460392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2461392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	${PREFIX}_set_decrypt_key
2462392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
2463392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2464392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom${PREFIX}_set_decrypt_key:
2465392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
2466392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	__aesni_set_encrypt_key
2467392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
2468392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	test	%eax,%eax
2469392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jnz	.Ldec_key_ret
2470392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($key,$bits),$inp	# points at the end of key schedule
2471392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2472392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key),%xmm0		# just swap
2473392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($inp),%xmm1
2474392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,($inp)
2475392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm1,($key)
2476392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($key),$key
2477392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-16($inp),$inp
2478392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2479392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Ldec_key_inverse:
2480392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key),%xmm0		# swap and inverse
2481392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($inp),%xmm1
2482392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesimc	%xmm0,%xmm0
2483392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesimc	%xmm1,%xmm1
2484392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($key),$key
2485392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	-16($inp),$inp
2486392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,16($inp)
2487392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm1,-16($key)
2488392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	$key,$inp
2489392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ja	.Ldec_key_inverse
2490392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2491392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	($key),%xmm0		# inverse middle
2492392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aesimc	%xmm0,%xmm0
2493392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,($inp)
2494392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Ldec_key_ret:
2495392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$8,%rsp
2496392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2497392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_end_set_decrypt_key:
2498392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
2499392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2500392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2501392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# This is based on submission by
2502392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
2503392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	Huang Ying <ying.huang@intel.com>
2504392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	Vinodh Gopal <vinodh.gopal@intel.com>
2505392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#	Kahraman Akdemir
2506392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
2507392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# Aggressively optimized in respect to aeskeygenassist's critical path
2508392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# and is contained in %xmm0-5 to meet Win64 ABI requirement.
2509392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#
2510392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2511392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.globl	${PREFIX}_set_encrypt_key
2512392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
2513392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2514392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom${PREFIX}_set_encrypt_key:
2515392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom__aesni_set_encrypt_key:
2516392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
2517392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$-1,%rax
2518392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	test	$inp,$inp
2519392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lenc_key_ret
2520392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	test	$key,$key
2521392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jz	.Lenc_key_ret
2522392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2523392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	($inp),%xmm0		# pull first 128 bits of *userKey
2524392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
2525392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16($key),%rax
2526392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$256,$bits
2527392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.L14rounds
2528392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$192,$bits
2529392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	je	.L12rounds
2530392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	\$128,$bits
2531392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jne	.Lbad_keybits
2532392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2533392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L10rounds:
2534392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$9,$bits			# 10 rounds for 128-bit key
2535392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,($key)			# round 0
2536392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
2537392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128_cold
2538392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
2539392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128
2540392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
2541392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128
2542392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
2543392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128
2544392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
2545392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128
2546392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
2547392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128
2548392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
2549392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128
2550392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
2551392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128
2552392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
2553392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128
2554392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
2555392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_128
2556392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,(%rax)
2557392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$bits,80(%rax)	# 240(%rdx)
2558392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	%eax,%eax
2559392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lenc_key_ret
2560392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2561392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2562392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L12rounds:
2563392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
2564392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$11,$bits			# 12 rounds for 192
2565392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,($key)			# round 0
2566392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
2567392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_192a_cold
2568392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
2569392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_192b
2570392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
2571392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_192a
2572392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
2573392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_192b
2574392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
2575392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_192a
2576392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
2577392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_192b
2578392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
2579392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_192a
2580392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
2581392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_192b
2582392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,(%rax)
2583392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$bits,48(%rax)	# 240(%rdx)
2584392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	%rax, %rax
2585392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lenc_key_ret
2586392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2587392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2588392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.L14rounds:
2589392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movups	16($inp),%xmm2			# remaning half of *userKey
2590392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$13,$bits			# 14 rounds for 256
2591392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16(%rax),%rax
2592392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,($key)			# round 0
2593392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm2,16($key)			# round 1
2594392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
2595392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256a_cold
2596392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
2597392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256b
2598392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
2599392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256a
2600392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
2601392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256b
2602392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
2603392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256a
2604392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
2605392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256b
2606392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
2607392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256a
2608392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
2609392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256b
2610392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
2611392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256a
2612392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
2613392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256b
2614392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
2615392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256a
2616392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
2617392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256b
2618392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
2619392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call		.Lkey_expansion_256a
2620392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,(%rax)
2621392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$bits,16(%rax)	# 240(%rdx)
2622392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	%rax,%rax
2623392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lenc_key_ret
2624392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2625392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2626392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lbad_keybits:
2627392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$-2,%rax
2628392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lenc_key_ret:
2629392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$8,%rsp
2630392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2631392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_end_set_encrypt_key:
2632392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2633392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2634392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_expansion_128:
2635392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,(%rax)
2636392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16(%rax),%rax
2637392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_expansion_128_cold:
2638392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b00010000,%xmm0,%xmm4
2639392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm4, %xmm0
2640392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b10001100,%xmm0,%xmm4
2641392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm4, %xmm0
2642392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
2643392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm1,%xmm0
2644392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2645392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2646392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16
2647392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_expansion_192a:
2648392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,(%rax)
2649392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16(%rax),%rax
2650392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_expansion_192a_cold:
2651392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm2, %xmm5
2652392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_expansion_192b_warm:
2653392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b00010000,%xmm0,%xmm4
2654392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movdqa	%xmm2,%xmm3
2655392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm4,%xmm0
2656392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b10001100,%xmm0,%xmm4
2657392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pslldq	\$4,%xmm3
2658392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm4,%xmm0
2659392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
2660392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm3,%xmm2
2661392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm1,%xmm0
2662392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pshufd	\$0b11111111,%xmm0,%xmm3
2663392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pxor	%xmm3,%xmm2
2664392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2665392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2666392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16
2667392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_expansion_192b:
2668392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	movaps	%xmm0,%xmm3
2669392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b01000100,%xmm0,%xmm5
2670392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm5,(%rax)
2671392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b01001110,%xmm2,%xmm3
2672392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm3,16(%rax)
2673392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	32(%rax),%rax
2674392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lkey_expansion_192b_warm
2675392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2676392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2677392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_expansion_256a:
2678392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm2,(%rax)
2679392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16(%rax),%rax
2680392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_expansion_256a_cold:
2681392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b00010000,%xmm0,%xmm4
2682392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm4,%xmm0
2683392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b10001100,%xmm0,%xmm4
2684392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm4,%xmm0
2685392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
2686392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm1,%xmm0
2687392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2688392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2689392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align 16
2690392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lkey_expansion_256b:
2691392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	$movkey	%xmm0,(%rax)
2692392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	16(%rax),%rax
2693392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2694392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b00010000,%xmm2,%xmm4
2695392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm4,%xmm2
2696392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b10001100,%xmm2,%xmm4
2697392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm4,%xmm2
2698392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
2699392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xorps	%xmm1,%xmm2
2700392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2701392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
2702392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
2703392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2704392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
2705392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Append a 64-byte-aligned table of constants to the generated assembly:
# .Lbswap_mask (the byte indices 15..0), .Lincrement32, .Lincrement64 and
# .Lxts_magic, followed by a NUL-terminated identification string and a
# closing 64-byte alignment. This statement is emitted unconditionally.
# NOTE(review): the routines that load these labels are outside this
# section; the names suggest a byte-swap shuffle mask, counter increments
# and an XTS constant -- confirm against their users before changing values.
2706392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2707392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	64
2708392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lbswap_mask:
2709392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
2710392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lincrement32:
2711392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	6,6,6,0
2712392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lincrement64:
2713392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	1,0,0,0
2714392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lxts_magic:
2715392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0x87,0,1,0
2716392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2717392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
2718392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	64
2719392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
2720392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Win64-only section. When $win64 is true this appends structured exception
# handling (SEH) support to $code: the language-specific handler routines,
# the .pdata function-table entries and the .xdata unwind-info records that
# name those handlers. Nothing in this section is emitted otherwise.
2721392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2722392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2723392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromif ($win64) {
# Registers holding the four handler arguments named in the prototype
# above; these variables are interpolated into the heredocs below.
2724392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$rec="%rcx";
2725392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$frame="%rdx";
2726392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$context="%r8";
2727392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$disp="%r9";
2728392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Declare the import slot through which the shared tail calls
# RtlVirtualUnwind.
2729392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2730392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.extern	__imp_RtlVirtualUnwind
2731392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# Handlers emitted only when $PREFIX is "aesni": ecb, ccm64, ctr32 and xts.
# Each one pushes %rsi,%rdi,%rbx,%rbp,%r12-%r15 and the flags, reserves 64
# bytes of stack, leaves a stack-pointer candidate in %rax and jumps to
# .Lcommon_seh_tail (a label inside cbc_se_handler, emitted further down).
#  - ecb:   always uses context->Rsp.
#  - ccm64: bounds come from HandlerData[0..1] (RVAs added to ImageBase);
#           inside them it copies 8 quadwords from 0(%rax) to context+512
#           and advances %rax by 0x58.
#  - ctr32: bounds are the fixed labels .Lctr32_body/.Lctr32_ret; inside
#           them it copies 20 quadwords from 0x20(%rax) and advances %rax
#           by 0xc8.
#  - xts:   bounds come from HandlerData[0..1]; inside them it copies 20
#           quadwords from 0x60(%rax) and advances %rax by 0x68+160.
# When Rip is below the lower bound, context->Rax is used as the stack
# pointer; at or above the upper bound, context->Rsp is used unmodified.
2732392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($PREFIX eq "aesni");
2733392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	ecb_se_handler,\@abi-omnipotent
2734392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2735392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromecb_se_handler:
2736392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rsi
2737392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rdi
2738392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
2739392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
2740392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
2741392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
2742392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
2743392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
2744392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pushfq
2745392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$64,%rsp
2746392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2747392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	152($context),%rax	# pull context->Rsp
2748392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2749392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcommon_seh_tail
2750392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	ecb_se_handler,.-ecb_se_handler
2751392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2752392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	ccm64_se_handler,\@abi-omnipotent
2753392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2754392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromccm64_se_handler:
2755392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rsi
2756392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rdi
2757392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
2758392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
2759392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
2760392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
2761392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
2762392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
2763392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pushfq
2764392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$64,%rsp
2765392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2766392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	120($context),%rax	# pull context->Rax
2767392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	248($context),%rbx	# pull context->Rip
2768392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2769392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($disp),%rsi		# disp->ImageBase
2770392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	56($disp),%r11		# disp->HandlerData
2771392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2772392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0(%r11),%r10d		# HandlerData[0]
2773392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsi,%r10),%r10	# prologue label
2774392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip<prologue label
2775392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcommon_seh_tail
2776392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2777392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	152($context),%rax	# pull context->Rsp
2778392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2779392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	4(%r11),%r10d		# HandlerData[1]
2780392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsi,%r10),%r10	# epilogue label
2781392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip>=epilogue label
2782392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jae	.Lcommon_seh_tail
2783392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2784392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0(%rax),%rsi		# %xmm save area
2785392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	512($context),%rdi	# &context.Xmm6
2786392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
2787392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0xa548f3fc		# cld; rep movsq
2788392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x58(%rax),%rax		# adjust stack pointer
2789392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2790392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcommon_seh_tail
2791392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	ccm64_se_handler,.-ccm64_se_handler
2792392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2793392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	ctr32_se_handler,\@abi-omnipotent
2794392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2795392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromctr32_se_handler:
2796392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rsi
2797392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rdi
2798392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
2799392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
2800392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
2801392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
2802392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
2803392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
2804392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pushfq
2805392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$64,%rsp
2806392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2807392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	120($context),%rax	# pull context->Rax
2808392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	248($context),%rbx	# pull context->Rip
2809392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2810392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.Lctr32_body(%rip),%r10
2811392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip<"prologue" label
2812392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcommon_seh_tail
2813392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2814392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	152($context),%rax	# pull context->Rsp
2815392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2816392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.Lctr32_ret(%rip),%r10
2817392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx
2818392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jae	.Lcommon_seh_tail
2819392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2820392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x20(%rax),%rsi		# %xmm save area
2821392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	512($context),%rdi	# &context.Xmm6
2822392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2823392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0xa548f3fc		# cld; rep movsq
2824392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0xc8(%rax),%rax		# adjust stack pointer
2825392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2826392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcommon_seh_tail
2827392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	ctr32_se_handler,.-ctr32_se_handler
2828392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2829392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	xts_se_handler,\@abi-omnipotent
2830392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2831392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromxts_se_handler:
2832392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rsi
2833392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rdi
2834392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
2835392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
2836392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
2837392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
2838392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
2839392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
2840392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pushfq
2841392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$64,%rsp
2842392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2843392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	120($context),%rax	# pull context->Rax
2844392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	248($context),%rbx	# pull context->Rip
2845392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2846392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8($disp),%rsi		# disp->ImageBase
2847392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	56($disp),%r11		# disp->HandlerData
2848392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2849392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0(%r11),%r10d		# HandlerData[0]
2850392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsi,%r10),%r10	# prologue lable
2851392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip<prologue label
2852392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcommon_seh_tail
2853392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2854392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	152($context),%rax	# pull context->Rsp
2855392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2856392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	4(%r11),%r10d		# HandlerData[1]
2857392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	(%rsi,%r10),%r10	# epilogue label
2858392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip>=epilogue label
2859392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jae	.Lcommon_seh_tail
2860392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2861392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x60(%rax),%rsi		# %xmm save area
2862392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	512($context),%rdi	# & context.Xmm6
2863392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2864392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0xa548f3fc		# cld; rep movsq
2865392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x68+160(%rax),%rax	# adjust stack pointer
2866392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2867392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcommon_seh_tail
2868392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	xts_se_handler,.-xts_se_handler
2869392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# cbc_se_handler is emitted for every $PREFIX. It compares context->Rip
# against .Lcbc_decrypt, .Lcbc_decrypt_body and .Lcbc_ret:
#  - below .Lcbc_decrypt or at/after .Lcbc_ret: context->Rsp is used as is;
#  - between .Lcbc_decrypt and .Lcbc_decrypt_body: context->Rax is used
#    instead (.Lrestore_cbc_rax);
#  - otherwise: 8 quadwords are copied from the top of the stack to
#    context+512 and %rax is advanced by 0x58.
# It also hosts .Lcommon_seh_tail, the tail every handler above jumps to.
# The tail reloads %rdi/%rsi from 8(%rax)/16(%rax), stores %rax, %rsi and
# %rdi back into the CONTEXT, copies 154 quadwords of CONTEXT into
# disp->ContextRecord, calls RtlVirtualUnwind through the import slot,
# restores the saved registers and returns 1 (ExceptionContinueSearch).
# The heredoc ends by switching to the .pdata section.
2870392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2871392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.type	cbc_se_handler,\@abi-omnipotent
2872392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	16
2873392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstromcbc_se_handler:
2874392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rsi
2875392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rdi
2876392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbx
2877392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%rbp
2878392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r12
2879392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r13
2880392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r14
2881392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	push	%r15
2882392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pushfq
2883392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	sub	\$64,%rsp
2884392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2885392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	152($context),%rax	# pull context->Rsp
2886392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	248($context),%rbx	# pull context->Rip
2887392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2888392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.Lcbc_decrypt(%rip),%r10
2889392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip<"prologue" label
2890392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lcommon_seh_tail
2891392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2892392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.Lcbc_decrypt_body(%rip),%r10
2893392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
2894392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jb	.Lrestore_cbc_rax
2895392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2896392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	.Lcbc_ret(%rip),%r10
2897392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	cmp	%r10,%rbx		# context->Rip>="epilogue" label
2898392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jae	.Lcommon_seh_tail
2899392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2900392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0(%rax),%rsi		# top of stack
2901392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	512($context),%rdi	# &context.Xmm6
2902392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
2903392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0xa548f3fc		# cld; rep movsq
2904392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	0x58(%rax),%rax		# adjust stack pointer
2905392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	jmp	.Lcommon_seh_tail
2906392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2907392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lrestore_cbc_rax:
2908392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	120($context),%rax
2909392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2910392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.Lcommon_seh_tail:
2911392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8(%rax),%rdi
2912392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16(%rax),%rsi
2913392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rax,152($context)	# restore context->Rsp
2914392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rsi,168($context)	# restore context->Rsi
2915392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rdi,176($context)	# restore context->Rdi
2916392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2917392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	40($disp),%rdi		# disp->ContextRecord
2918392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$context,%rsi		# context
2919392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$154,%ecx		# sizeof(CONTEXT)
2920392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.long	0xa548f3fc		# cld; rep movsq
2921392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2922392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	$disp,%rsi
2923392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2924392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2925392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2926392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2927392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	40(%rsi),%r10		# disp->ContextRecord
2928392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	56(%rsi),%r11		# &disp->HandlerData
2929392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2930392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r10,32(%rsp)		# arg5
2931392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r11,40(%rsp)		# arg6
2932392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%r12,48(%rsp)		# arg7
2933392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	%rcx,56(%rsp)		# arg8, (NULL)
2934392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	call	*__imp_RtlVirtualUnwind(%rip)
2935392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2936392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	mov	\$1,%eax		# ExceptionContinueSearch
2937392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	add	\$64,%rsp
2938392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	popfq
2939392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%r15
2940392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%r14
2941392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%r13
2942392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%r12
2943392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%rbp
2944392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%rbx
2945392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%rdi
2946392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	pop	%rsi
2947392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	ret
2948392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.size	cbc_se_handler,.-cbc_se_handler
2949392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2950392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.section	.pdata
2951392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	4
2952392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# .pdata entries for the routines that exist only when $PREFIX is "aesni".
# Each entry is three RVAs: function begin, function end, unwind-info
# record.
2953392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($PREFIX eq "aesni");
2954392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_begin_aesni_ecb_encrypt
2955392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_aesni_ecb_encrypt
2956392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_ecb
2957392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2958392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
2959392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
2960392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_ccm64_enc
2961392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2962392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
2963392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
2964392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_ccm64_dec
2965392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2966392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
2967392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
2968392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_ctr32
2969392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2970392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_begin_aesni_xts_encrypt
2971392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_aesni_xts_encrypt
2972392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_xts_enc
2973392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2974392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_begin_aesni_xts_decrypt
2975392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_aesni_xts_decrypt
2976392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_xts_dec
2977392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# .pdata entries emitted for every $PREFIX: cbc_encrypt plus the two key
# setup routines, which share the .LSEH_info_key record. The heredoc ends
# by switching to the .xdata section.
2978392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
2979392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
2980392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
2981392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_cbc
2982392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2983392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	${PREFIX}_set_decrypt_key
2984392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_set_decrypt_key
2985392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_key
2986392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
2987392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	${PREFIX}_set_encrypt_key
2988392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_end_set_encrypt_key
2989392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.LSEH_info_key
2990392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.section	.xdata
2991392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.align	8
2992392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# .xdata unwind-info records for the "aesni"-only routines. Each record is
# a 4-byte header followed by the RVA of its handler; the ccm64 and xts
# records additionally carry the two label RVAs that those handlers read
# back as HandlerData[0] and HandlerData[1].
2993392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___ if ($PREFIX eq "aesni");
2994392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_ecb:
2995392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
2996392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	ecb_se_handler
2997392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_ccm64_enc:
2998392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
2999392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	ccm64_se_handler
3000392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
3001392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_ccm64_dec:
3002392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3003392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	ccm64_se_handler
3004392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
3005392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_ctr32:
3006392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3007392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	ctr32_se_handler
3008392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_xts_enc:
3009392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3010392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	xts_se_handler
3011392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3012392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_xts_dec:
3013392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3014392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	xts_se_handler
3015392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3016392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
# .xdata records emitted for every $PREFIX: the cbc record naming
# cbc_se_handler, and .LSEH_info_key, which names no handler and carries
# the bytes annotated "sub rsp,8" instead.
3017392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom$code.=<<___;
3018392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_cbc:
3019392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	9,0,0,0
3020392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.rva	cbc_se_handler
3021392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom.LSEH_info_key:
3022392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	0x01,0x04,0x01,0x00
3023392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
3024392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom___
3025392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom}
3026392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# rex(\@opcode, $dst, $src)
#
# Appends an x86-64 REX prefix byte to the opcode byte list when either
# register number needs one, and appends nothing otherwise.
#
#   \@opcode  reference to the caller's list of instruction bytes; it is
#             modified in place
#   $dst      number of the register encoded in the ModR/M "reg" field;
#             8 or above sets REX.R (0x04)
#   $src      number of the register encoded in the ModR/M "r/m" field;
#             8 or above sets REX.B (0x01)
#
# The list is reached through the reference itself rather than by aliasing
# a package typeglob, so no package variable named "opcode" is touched and
# the sub also compiles under "use strict". Returns nothing; the callers
# use it purely for its effect on the list.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    # 0x40 is the fixed high nibble that marks a byte as a REX prefix.
    push @$opcode,$rex|0x40	if($rex);
    return;
}
3036392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# aesni($line)
#
# Hand-assembles one AES-NI instruction written in AT&T syntax and returns
# it as a ".byte" directive, so the output does not depend on the assembler
# knowing the AES-NI mnemonics.
#
# Recognised forms:
#   aeskeygenassist $imm,%xmmS,%xmmD   -> 66 [REX] 0f 3a df ModR/M imm
#   aesimc|aesenc|aesenclast|aesdec|aesdeclast %xmmS,%xmmD
#                                      -> 66 [REX] 0f 38 xx ModR/M
#
# Returns the ".byte\t..." string for a recognised instruction. Every other
# input, including an unknown aes* mnemonic, is returned unchanged so that
# the instruction reaches the assembler instead of silently disappearing
# from the generated code.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures at once; the later pattern match on $imm
	# would otherwise overwrite them.
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# A leading 0 marks a hex/octal/binary literal that oct() decodes;
	# anything else is emitted as written.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# Unknown mnemonic: pass the text through untouched.
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
3063392aa7cc7d2b122614c5393c3e357da07fd07af3Brian Carlstrom
# Post-process the accumulated assembly text and write it to STDOUT.
#
# 1. Replace every `...` span in $code with the result of evaluating its
#    contents as a Perl expression. The evaluated text comes from this
#    script's own heredocs, not from external input.
# 2. Pass every "aes... %xmmN" fragment through aesni(). Whatever follows
#    the last %xmmN on that line (e.g. a trailing comment) is dropped by
#    the pattern's final ".*$".
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

# Buffered write errors surface at close; fail loudly rather than exit 0
# with truncated output.
close STDOUT or die "error closing STDOUT: $!";
3070