aesni-x86.pl revision 392aa7cc7d2b122614c5393c3e357da07fd07af3
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13# details].
14#
15# Performance.
16#
17# To start with see corresponding paragraph in aesni-x86_64.pl...
18# Instead of filling table similar to one found there I've chosen to
19# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20# The simplified table below represents 32-bit performance relative
21# to 64-bit one in every given point. Ratios vary for different
22# encryption modes, therefore interval values.
23#
24#	16-byte     64-byte     256-byte    1-KB        8-KB
25#	53-67%      67-84%      91-94%      95-98%      97-99.5%
26#
27# Lower ratios for smaller block sizes are perfectly understandable,
28# because function call overhead is higher in 32-bit mode. Largest
29# 8-KB block performance is virtually same: 32-bit code is less than
30# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32# January 2011
33#
34# See aesni-x86_64.pl for details. Unlike x86_64 version this module
35# interleaves at most 6 aes[enc|dec] instructions, because there are
36# not enough registers for 8x interleave [which should be optimal for
37# Sandy Bridge]. Actually, performance results for 6x interleave
38# factor presented in aesni-x86_64.pl (except for CTR) are for this
39# module.
40
41# April 2011
42#
43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
45
# Generator configuration.
#   $PREFIX  prefix of the public single-block entry points emitted below
#            ("${PREFIX}_encrypt"/"${PREFIX}_decrypt"); several helpers and
#            the mode-specific routines are generated only when it equals
#            "aesni".
#   $inline  when true, single-block encryption is expanded in place via
#            aesni_inline_generate1; otherwise _aesni_[en|de]crypt1 is
#            emitted once and called.
46$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
47			# generates drop-in replacement for
48			# crypto/aes/asm/aes-586.pl:-)
49$inline=1;		# inline _aesni_[en|de]crypt
50
# Take the directory part of this script's path ($0) and add it, plus
# ../../perlasm relative to it, to @INC so that x86asm.pl can be found.
51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52push(@INC,"${dir}","${dir}../../perlasm");
53require "x86asm.pl";
54
# Pass the first command-line argument and the script name to x86asm.pl.
# NOTE(review): presumably $ARGV[0] selects the assembler flavour; confirm
# against x86asm.pl, which is not part of this file.
55&asm_init($ARGV[0],$0);
56
# Mnemonic used for every round-key load (invoked as &$movekey(...)).
# Both the "aesni" configuration and the drop-in configuration use the
# same unaligned move, so there is nothing to select on $PREFIX; if the
# two ever need different instructions, re-introduce the test here.
$movekey=*movups;
59
# General-purpose register roles shared by every routine in this file.
($len,$rounds,$key,$inp,$out)=("eax","ecx","edx","esi","edi");
($rounds_,$key_)=("ebx","ebp");		# backup copies for $rounds and $key

# XMM register roles: two round-key registers followed by up to six
# data blocks.
($rndkey0,$rndkey1)=("xmm0","xmm1");
($inout0,$inout1,$inout2)=("xmm2","xmm3","xmm4");
($inout3,$inout4,$inout5)=("xmm5","xmm6","xmm7");

# Alternative names for the upper three data registers, used by routines
# that do not need all six blocks: $in1 is $inout3, $in0 is $inout4 and
# $ivec is $inout5.
($in1,$in0,$ivec)=("xmm5","xmm6","xmm7");
76
# AESNI extension
#
# The AES instructions are emitted as raw opcode bytes through
# &data_byte(). Only the register-to-register form with xmm0-xmm7 can be
# encoded (ModR/M = 0xc0|dst<<3|src). Any other operand makes the helper
# die, because returning quietly would drop an instruction from the
# generated code without any diagnostic.

# aeskeygenassist dst,src,imm: 66 0F 3A DF /r ib
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /^xmm([0-7]):xmm([0-7])$/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: unsupported operands '$dst:$src', xmm0-xmm7 registers expected\n";	}
}

# Common encoder for the 66 0F 38 <opcodelet> /r family below.
#   $opcodelet  last opcode byte, selects the instruction
#   $dst,$src   xmm0-xmm7 register names
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /^xmm([0-7]):xmm([0-7])$/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
    else
    {	die sprintf("AES-NI opcode 0x%02x: unsupported operands '%s', xmm0-xmm7 registers expected\n",
		    $opcodelet,"$dst:$src");
    }
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }
93
# Inline version of internal aesni_[en|de]crypt1
#
# Expands a one-block AES round loop at the point of call.
#   $p      "enc" or "dec"; selects &aes${p} and &aes${p}last
#   $inout  register holding the block, defaults to $inout0
#   $ivec   optional register; when supplied it is first xor-ed with
#           round key 0 and the result is xor-ed into $inout, replacing
#           the plain round-0 key xor
# The emitted code reads the key schedule through $key and counts the
# loop down in $rounds; both registers are modified, as are $rndkey0,
# $rndkey1 and, when supplied, $ivec.
#
# The round instructions are reached through code references rather than
# string eval, so a failure inside them is reported instead of being
# swallowed by eval.
{ my $sn;	# expansion counter, keeps the loop labels unique
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_;
  $inout=$inout0 if (!defined($inout));

  no strict 'refs';		# instruction names are built from $p
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  my $loop="${p}1_loop_".(++$sn);

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    if (defined($ivec))	{ &xorps	($inout,$ivec);		}
    else		{ &xorps	($inout,$rndkey0);	}
    &set_label($loop);
	&$aes		($inout,$rndkey1);
	&dec		($rounds);	# sets the flags tested by jnz below
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label($loop));
    &$aeslast		($inout,$rndkey1);
}}
114
# Emits _aesni_${p}rypt1: single-block AES with a fully unrolled loop.
#   $p      "enc" or "dec"
#   $inout  register holding the block, defaults to $inout0
# The emitted code compares $rounds with 11: below takes the shortest
# path (label ${p}128), equal enters at ${p}192, above runs the whole
# sequence. $key is advanced so that every path ends with the same
# positive offsets; $key, $rndkey0 and $rndkey1 are modified.
#
# All key loads go through $movekey, and the round instructions are
# reached through code references instead of string eval so that errors
# are not silently discarded.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

  no strict 'refs';		# instruction names are built from $p
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt1");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));	# lea leaves the cmp flags intact
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));

	# two extra rounds taken only when $rounds is above 11
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	# eight rounds common to all paths, alternating the key registers
	foreach my $off (0x00,0x20,0x40,0x60) {
	    &$aes	($inout,$rndkey1);
	    &$movekey	($rndkey1,&QWP($off,$key));
	    &$aes	($inout,$rndkey0);
	    &$movekey	($rndkey0,&QWP($off+0x10,$key));
	}
	&$aes		($inout,$rndkey1);
    &$aeslast		($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
160
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points. Both are emitted from the same template:
# load the block from inp, fetch the loop count from offset 240 of the
# key structure, run the one-block cipher (expanded in place when $inline
# is set, otherwise by calling _aesni_[en|de]crypt1, which is then
# emitted just ahead of its caller) and store the result to out.
foreach my $op ("enc","dec") {
    my $entry="${PREFIX}_${op}rypt";

    &aesni_generate1($op) if (!$inline);
    &function_begin_B($entry);
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# out
	if ($inline)	{ &aesni_inline_generate1($op);		}
	else		{ &call	("_aesni_${op}rypt1");		}
	&movups	(&QWP(0,"eax"),$inout0);
	&ret	();
    &function_end_B($entry);
}
192
193# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
194# factor. Why 3x subroutine were originally used in loops? Even though
195# aes[enc|dec] latency was originally 6, it could be scheduled only
196# every *2nd* cycle. Thus 3x interleave was the one providing optimal
197# utilization, i.e. when subroutine's throughput is virtually same as
198# of non-interleaved subroutine [for number of input blocks up to 3].
199# This is why it makes no sense to implement 2x subroutine.
200# aes[enc|dec] latency in next processor generation is 8, but the
201# instructions can be scheduled every cycle. Optimal interleave for
202# new processor is therefore 8x, but it's unfeasible to accommodate it
203# in XMM registers addressable in 32-bit mode and therefore 6x is
204# used instead...
205
# Emits _aesni_${p}rypt3, which processes $inout0..$inout2 in parallel.
#   $p  "enc" or "dec"; selects &aes${p} and &aes${p}last
# The emitted code expects $key to point at the key schedule and $rounds
# to hold the loop count; $rounds is halved because every loop iteration
# applies two round keys. $key, $rounds, $rndkey0 and $rndkey1 are
# modified.
#
# The round instructions are reached through code references instead of
# string eval so that errors are not silently discarded.
sub aesni_generate3
{ my $p=shift;

  no strict 'refs';		# instruction names are built from $p
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  my @blocks=($inout0,$inout1,$inout2);

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label("${p}3_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&dec		($rounds);	# sets the flags tested by jnz below
	&$aes		($inout2,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&$aes		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("${p}3_loop"));

    # last regular round, then the final round, over all three blocks
    foreach my $blk (@blocks)	{ &$aes		($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$aeslast	($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
240
241# 4x interleave is implemented to improve small block performance,
242# most notably [and naturally] 4 block by ~30%. One can argue that one
243# should have implemented 5x as well, but improvement  would be <20%,
244# so it's not worth it...
# Emits _aesni_${p}rypt4, which processes $inout0..$inout3 in parallel.
#   $p  "enc" or "dec"; selects &aes${p} and &aes${p}last
# Register contract is the same as for the 3x variant: $key points at the
# key schedule, $rounds holds the loop count and is halved because each
# iteration applies two round keys; $key, $rounds, $rndkey0 and $rndkey1
# are modified.
#
# The round instructions are reached through code references instead of
# string eval so that errors are not silently discarded.
sub aesni_generate4
{ my $p=shift;

  no strict 'refs';		# instruction names are built from $p
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  my @blocks=($inout0,$inout1,$inout2,$inout3);

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shr		($rounds,1);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label("${p}4_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&dec		($rounds);	# sets the flags tested by jnz below
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
    &jnz		(&label("${p}4_loop"));

    # last regular round, then the final round, over all four blocks
    foreach my $blk (@blocks)	{ &$aes		($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$aeslast	($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
285
# Emits _aesni_${p}rypt6, which processes $inout0..$inout5 in parallel.
#   $p  "enc" or "dec"; selects &aes${p} and &aes${p}last
# Besides the function itself a second entry label,
# _aesni_${p}rypt6_enter, is declared with static_label. It lies in the
# middle of the round loop, so a caller that has already performed the
# round-0 xor, the first round and the first decrement of $rounds itself
# can enter there. The prologue below does exactly that work before
# jumping to the label.
# $key, $rounds, $rndkey0 and $rndkey1 are modified.
#
# The round instructions are reached through code references instead of
# string eval so that errors are not silently discarded.
sub aesni_generate6
{ my $p=shift;

  no strict 'refs';		# instruction names are built from $p
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  my @blocks=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&$aes		($inout0,$rndkey1);
	&pxor		($inout2,$rndkey0);
	&$aes		($inout1,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&dec		($rounds);
	&$aes		($inout2,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&$aes		($inout3,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$aes		($inout4,$rndkey1);
	&$movekey	($rndkey0,&QWP(0,$key));
	&$aes		($inout5,$rndkey1);
	&jmp		(&label("_aesni_${p}rypt6_enter"));

    &set_label("${p}6_loop",16);
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&dec		($rounds);	# sets the flags tested by jnz below
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$aes		($inout4,$rndkey1);
	&$aes		($inout5,$rndkey1);
    &set_label("_aesni_${p}rypt6_enter",16);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$aes		($inout4,$rndkey0);
	&$aes		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
    &jnz		(&label("${p}6_loop"));

    # last regular round, then the final round, over all six blocks
    foreach my $blk (@blocks)	{ &$aes		($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$aeslast	($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Instantiate the 3x, 4x and 6x interleaved helpers. The decrypt flavour
# is always emitted; the encrypt flavour only when $PREFIX is "aesni".
foreach my $generate (\&aesni_generate3,\&aesni_generate4,\&aesni_generate6) {
    &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}
352
353if ($PREFIX eq "aesni") {
354######################################################################
355# void aesni_ecb_encrypt (const void *in, void *out,
356#                         size_t length, const AES_KEY *key,
357#                         int enc);
# Emits aesni_ecb_encrypt. The length is rounded down to a multiple of
# 16; a zero result returns immediately. A zero fifth argument selects
# decryption, anything else encryption.
#
# The encrypt and decrypt paths are mirror images and are generated from
# one template. Each path
#   - keeps copies of $key/$rounds in $key_/$rounds_, because the
#     interleaved helpers modify both,
#   - processes 0x60 bytes (six blocks) per iteration while at least that
#     much input remains, overlapping the stores of one batch with the
#     loads of the next,
#   - handles the remaining one to five blocks with the 1x/3x/4x/6x
#     helpers; unused input registers of the 3x and 6x helpers are
#     zeroed and their output is not stored.
# Every emitter call, including the jump at the end of the five-block
# tail, uses the &name(...) form used everywhere else in this file.
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

foreach my $op ("enc","dec") {
    my @blocks=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

    # the encrypt path is reached by fall-through, the decrypt path by
    # the jump emitted above
    &set_label("ecb_decrypt",16)	if ($op eq "dec");
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_${op}_tail"));

	foreach my $i (0..5) {
	    &movdqu	($blocks[$i],&QWP(0x10*$i,$inp));
	}
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_${op}_loop6_enter"));

    &set_label("ecb_${op}_loop6",16);
	# store the previous batch while loading the next one
	foreach my $i (0..4) {
	    &movups	(&QWP(0x10*$i,$out),$blocks[$i]);
	    &movdqu	($blocks[$i],&QWP(0x10*$i,$inp));
	}
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
    &set_label("ecb_${op}_loop6_enter");

	&call	("_aesni_${op}rypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_${op}_loop6"));

	foreach my $i (0..5) {
	    &movups	(&QWP(0x10*$i,$out),$blocks[$i]);
	}
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);		# undo the last subtraction
	&jz	(&label("ecb_ret"));

    &set_label("ecb_${op}_tail");
	# one to five blocks left: load as many as there are and dispatch
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_${op}_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_${op}_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_${op}_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_${op}_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_${op}rypt6");
	foreach my $i (0..4) {
	    &movups	(&QWP(0x10*$i,$out),$blocks[$i]);
	}
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${op}_one",16);
	if ($inline)
	{   &aesni_inline_generate1($op);	}
	else
	{   &call	("_aesni_${op}rypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${op}_two",16);
	&xorps	($inout2,$inout2);
	&call	("_aesni_${op}rypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${op}_three",16);
	&call	("_aesni_${op}rypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${op}_four",16);
	&call	("_aesni_${op}rypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	# the decrypt path is emitted last and falls through to ecb_ret
	&jmp	(&label("ecb_ret"))	if ($op eq "enc");
}

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
572
573######################################################################
574# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
575#                         size_t blocks, const AES_KEY *key,
576#                         const char *ivec,char *cmac);
577#
578# Handles only complete blocks, operates on 64-bit counter and
579# does not update *ivec! Nor does it finalize CMAC value
580# (see engine/eng_aesni.c for details)
581#
582{ my $cmac=$inout1;
# Emits aesni_ccm64_encrypt_blocks. For every input block the code runs
# two AES encryptions interleaved over a single pass of the key schedule:
# the counter block in $inout0 and the running CMAC value in $cmac. The
# input block is xor-ed into $cmac before encryption and xor-ed with the
# encrypted counter to form the output.
#
# Aligned stack frame built below:
#	0	byte-swap mask for pshufb
#	16	counter increment (1 in the lowest dword, zero elsewhere)
#	48	saved %esp
583&function_begin("aesni_ccm64_encrypt_blocks");
584	&mov	($inp,&wparam(0));
585	&mov	($out,&wparam(1));
586	&mov	($len,&wparam(2));
587	&mov	($key,&wparam(3));
	# $rounds_ and $rounds temporarily carry the ivec and cmac pointers
588	&mov	($rounds_,&wparam(4));
589	&mov	($rounds,&wparam(5));
590	&mov	($key_,"esp");
591	&sub	("esp",60);
592	&and	("esp",-16);			# align stack
593	&mov	(&DWP(48,"esp"),$key_);
594
595	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
596	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
597	&mov	($rounds,&DWP(240,$key));
598
599	# compose byte-swap control mask for pshufb on stack
600	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
601	&mov	(&DWP(4,"esp"),0x08090a0b);
602	&mov	(&DWP(8,"esp"),0x04050607);
603	&mov	(&DWP(12,"esp"),0x00010203);
604
605	# compose counter increment vector on stack
606	&mov	($rounds_,1);
607	&xor	($key_,$key_);
608	&mov	(&DWP(16,"esp"),$rounds_);
609	&mov	(&DWP(20,"esp"),$key_);
610	&mov	(&DWP(24,"esp"),$key_);
611	&mov	(&DWP(28,"esp"),$key_);
612
	# the round loop applies two round keys per iteration, hence the
	# halved count; $key_/$rounds_ keep the per-block starting values
613	&shr	($rounds,1);
614	&lea	($key_,&DWP(0,$key));
615	&movdqa	($inout3,&QWP(0,"esp"));
	# $inout0 takes the counter block as loaded, while $ivec is kept
	# byte-swapped so that paddq can increment it
616	&movdqa	($inout0,$ivec);
617	&mov	($rounds_,$rounds);
618	&pshufb	($ivec,$inout3);
619
620&set_label("ccm64_enc_outer");
621	&$movekey	($rndkey0,&QWP(0,$key_));
622	&mov		($rounds,$rounds_);
623	&movups		($in0,&QWP(0,$inp));
624
625	&xorps		($inout0,$rndkey0);
626	&$movekey	($rndkey1,&QWP(16,$key_));
	# fold round key 0 and the input block into $cmac with one xor
627	&xorps		($rndkey0,$in0);
628	&lea		($key,&DWP(32,$key_));
629	&xorps		($cmac,$rndkey0);		# cmac^=inp
630	&$movekey	($rndkey0,&QWP(0,$key));
631
632&set_label("ccm64_enc2_loop");
633	&aesenc		($inout0,$rndkey1);
634	&dec		($rounds);
635	&aesenc		($cmac,$rndkey1);
636	&$movekey	($rndkey1,&QWP(16,$key));
637	&aesenc		($inout0,$rndkey0);
638	&lea		($key,&DWP(32,$key));
639	&aesenc		($cmac,$rndkey0);
640	&$movekey	($rndkey0,&QWP(0,$key));
641	&jnz		(&label("ccm64_enc2_loop"));
642	&aesenc		($inout0,$rndkey1);
643	&aesenc		($cmac,$rndkey1);
	# advance the byte-swapped counter for the next block
644	&paddq		($ivec,&QWP(16,"esp"));
645	&aesenclast	($inout0,$rndkey0);
646	&aesenclast	($cmac,$rndkey0);
647
648	&dec	($len);
649	&lea	($inp,&DWP(16,$inp));
650	&xorps	($in0,$inout0);			# inp^=E(ivec)
651	&movdqa	($inout0,$ivec);
652	&movups	(&QWP(0,$out),$in0);		# save output
653	&lea	($out,&DWP(16,$out));
	# swap the next counter block back; pshufb is expected to leave the
	# flags from the dec above intact for the jnz
654	&pshufb	($inout0,$inout3);
655	&jnz	(&label("ccm64_enc_outer"));
656
	# restore %esp, then write the CMAC value back through argument 5
657	&mov	("esp",&DWP(48,"esp"));
658	&mov	($out,&wparam(5));
659	&movups	(&QWP(0,$out),$cmac);
660&function_end("aesni_ccm64_encrypt_blocks");
661
# Emits aesni_ccm64_decrypt_blocks. The first counter block is encrypted
# on its own; after that each pass of the outer loop produces one output
# block and, while more input remains, encrypts the next counter block
# together with the CMAC value into which that output was just folded.
# The last output block is folded into the CMAC after the loop.
#
# Aligned stack frame built below:
#	0	byte-swap mask for pshufb
#	16	counter increment (1 in the lowest dword, zero elsewhere)
#	48	saved %esp
#
# Address computations for $inp use &DWP, like every other lea in this
# file.
&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	# $rounds_ and $rounds temporarily carry the ivec and cmac pointers
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	# keep the starting values; the single-block code below and the
	# round loop both modify $key and $rounds
	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&DWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&mov	($rounds,$rounds_);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&shr		($rounds,1);		# two round keys per iteration
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&lea		($key,&DWP(32,$key_));
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(0,$key));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&dec		($rounds);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&aesenc		($inout0,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&lea		($inp,&DWP(16,$inp));
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	# fold the last output block (still in $in0) into the CMAC;
	# $rounds was reloaded from $rounds_ at the top of the outer loop
	&mov	($key,$key_);
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	# NOTE(review): this non-inline branch is not taken while $inline is
	# set; confirm that _aesni_encrypt1 really operates on $cmac before
	# relying on it.
	{   &call	("_aesni_encrypt1",$cmac);	}

	# restore %esp, then write the CMAC value back through argument 5
	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_decrypt_blocks");
758}
759
760######################################################################
761# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
762#                         size_t blocks, const AES_KEY *key,
763#                         const char *ivec);
764#
765# Handles only complete blocks, operates on 32-bit counter and
766# does not update *ivec! (see engine/eng_aesni.c for details)
767#
768# stack layout:
769#	0	pshufb mask
770#	16	vector addend: 0,6,6,6
771# 	32	counter-less ivec
772#	48	1st triplet of counter vector
773#	64	2nd triplet of counter vector
774#	80	saved %esp
775
# Prologue of aesni_ctr32_encrypt_blocks: builds the aligned stack frame
# described above, splits the ivec into its 32-bit counter (dword 3) and
# the rest, and prepares two vectors of three consecutive counter values.
# A block count of exactly 1 skips all of that via ctr32_one_shortcut.
776&function_begin("aesni_ctr32_encrypt_blocks");
777	&mov	($inp,&wparam(0));
778	&mov	($out,&wparam(1));
779	&mov	($len,&wparam(2));
780	&mov	($key,&wparam(3));
	# $rounds_ temporarily carries the ivec pointer
781	&mov	($rounds_,&wparam(4));
782	&mov	($key_,"esp");
783	&sub	("esp",88);
784	&and	("esp",-16);			# align stack
785	&mov	(&DWP(80,"esp"),$key_);
786
787	&cmp	($len,1);
788	&je	(&label("ctr32_one_shortcut"));
789
790	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
791
792	# compose byte-swap control mask for pshufb on stack
793	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
794	&mov	(&DWP(4,"esp"),0x08090a0b);
795	&mov	(&DWP(8,"esp"),0x04050607);
796	&mov	(&DWP(12,"esp"),0x00010203);
797
798	# compose counter increment vector on stack
	# (6 in the three low dwords, zero in the highest one)
799	&mov	($rounds,6);
800	&xor	($key_,$key_);
801	&mov	(&DWP(16,"esp"),$rounds);
802	&mov	(&DWP(20,"esp"),$rounds);
803	&mov	(&DWP(24,"esp"),$rounds);
804	&mov	(&DWP(28,"esp"),$key_);
805
	# $key_ is zero here, so pinsrd clears the counter dword
806	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
807	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
808
809	&mov	($rounds,&DWP(240,$key));	# key->rounds
810
811	# compose 2 vectors of 3x32-bit counters
	# $rndkey1 receives counter, counter+1, counter+2 and $rndkey0
	# receives counter+3, counter+4, counter+5, in dwords 0..2
812	&bswap	($rounds_);
813	&pxor	($rndkey1,$rndkey1);
814	&pxor	($rndkey0,$rndkey0);
815	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
816	&pinsrd	($rndkey1,$rounds_,0);
817	&lea	($key_,&DWP(3,$rounds_));
818	&pinsrd	($rndkey0,$key_,0);
819	&inc	($rounds_);
820	&pinsrd	($rndkey1,$rounds_,1);
821	&inc	($key_);
822	&pinsrd	($rndkey0,$key_,1);
823	&inc	($rounds_);
824	&pinsrd	($rndkey1,$rounds_,2);
825	&inc	($key_);
826	&pinsrd	($rndkey0,$key_,2);
827	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
828	&pshufb	($rndkey1,$inout0);		# byte swap
829	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
830	&pshufb	($rndkey0,$inout0);		# byte swap
831
832	&pshufd	($inout0,$rndkey1,3<<6);	# place counter to upper dword
833	&pshufd	($inout1,$rndkey1,2<<6);
834	&cmp	($len,6);
835	&jb	(&label("ctr32_tail"));
836	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec
	# the 6x round loop applies two round keys per iteration
837	&shr	($rounds,1);
838	&mov	($key_,$key);			# backup $key
839	&mov	($rounds_,$rounds);		# backup $rounds
840	&sub	($len,6);
841	&jmp	(&label("ctr32_loop6"));
842
# Main loop of aesni_ctr32_encrypt_blocks: six blocks per iteration.
# On entry $inout0/$inout1 already hold the first two counters in their
# upper dword and $rndkey1/$rndkey0 hold the byte-swapped counter
# triplets; both key registers are reused as scratch throughout.
843&set_label("ctr32_loop6",16);
844	&pshufd	($inout2,$rndkey1,1<<6);
845	&movdqa	($rndkey1,&QWP(32,"esp"));	# pull counter-less ivec
846	&pshufd	($inout3,$rndkey0,3<<6);
847	&por	($inout0,$rndkey1);		# merge counter-less ivec
848	&pshufd	($inout4,$rndkey0,2<<6);
849	&por	($inout1,$rndkey1);
850	&pshufd	($inout5,$rndkey0,1<<6);
851	&por	($inout2,$rndkey1);
852	&por	($inout3,$rndkey1);
853	&por	($inout4,$rndkey1);
854	&por	($inout5,$rndkey1);
855
856	# inlining _aesni_encrypt6's prologue gives ~4% improvement...
	# (same instruction sequence as the prologue emitted by
	# aesni_generate6, reading the schedule through $key_)
857	&$movekey	($rndkey0,&QWP(0,$key_));
858	&$movekey	($rndkey1,&QWP(16,$key_));
859	&lea		($key,&DWP(32,$key_));
860	&dec		($rounds);
861	&pxor		($inout0,$rndkey0);
862	&pxor		($inout1,$rndkey0);
863	&aesenc		($inout0,$rndkey1);
864	&pxor		($inout2,$rndkey0);
865	&aesenc		($inout1,$rndkey1);
866	&pxor		($inout3,$rndkey0);
867	&aesenc		($inout2,$rndkey1);
868	&pxor		($inout4,$rndkey0);
869	&aesenc		($inout3,$rndkey1);
870	&pxor		($inout5,$rndkey0);
871	&aesenc		($inout4,$rndkey1);
872	&$movekey	($rndkey0,&QWP(0,$key));
873	&aesenc		($inout5,$rndkey1);
874
875	&call		(&label("_aesni_encrypt6_enter"));
876
	# xor the key stream with the input and store, interleaved with
	# preparing the counters for the next iteration
877	&movups	($rndkey1,&QWP(0,$inp));
878	&movups	($rndkey0,&QWP(0x10,$inp));
879	&xorps	($inout0,$rndkey1);
880	&movups	($rndkey1,&QWP(0x20,$inp));
881	&xorps	($inout1,$rndkey0);
882	&movups	(&QWP(0,$out),$inout0);
883	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
884	&xorps	($inout2,$rndkey1);
885	&movdqa	($rndkey1,&QWP(48,"esp"));	# load 1st triplet
886	&movups	(&QWP(0x10,$out),$inout1);
887	&movups	(&QWP(0x20,$out),$inout2);
888
889	&paddd	($rndkey1,$rndkey0);		# 1st triplet increment
890	&paddd	($rndkey0,&QWP(64,"esp"));	# 2nd triplet increment
891	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
892
	# $inout1/$inout2 are free now and serve as input scratch
893	&movups	($inout1,&QWP(0x30,$inp));
894	&movups	($inout2,&QWP(0x40,$inp));
895	&xorps	($inout3,$inout1);
896	&movups	($inout1,&QWP(0x50,$inp));
897	&lea	($inp,&DWP(0x60,$inp));
898	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
899	&pshufb	($rndkey1,$inout0);		# byte swap
900	&xorps	($inout4,$inout2);
901	&movups	(&QWP(0x30,$out),$inout3);
902	&xorps	($inout5,$inout1);
903	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
904	&pshufb	($rndkey0,$inout0);		# byte swap
905	&movups	(&QWP(0x40,$out),$inout4);
906	&pshufd	($inout0,$rndkey1,3<<6);
907	&movups	(&QWP(0x50,$out),$inout5);
908	&lea	($out,&DWP(0x60,$out));
909
910	&mov	($rounds,$rounds_);
911	&pshufd	($inout1,$rndkey1,2<<6);
912	&sub	($len,6);
913	&jnc	(&label("ctr32_loop6"));
914
	# fewer than six blocks remain: undo the last subtraction and set up
	# the registers the tail code expects
915	&add	($len,6);
916	&jz	(&label("ctr32_ret"));
917	&mov	($key,$key_);
	# $rounds holds the halved count here; 2*$rounds+1 is stored back
918	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
919	&movdqa	($inout5,&QWP(32,"esp"));	# pull count-less ivec
920
# Tail of aesni_ctr32_encrypt_blocks, entered with one to five blocks
# left. $inout5 holds the counter-less ivec and $rndkey1/$rndkey0 the
# byte-swapped counter triplets. Counter blocks are composed one at a
# time while $len is compared against 2 and 4 to pick the handler; the
# code that falls through processes five blocks with the 6x helper.
921&set_label("ctr32_tail");
922	&por	($inout0,$inout5);
923	&cmp	($len,2);
924	&jb	(&label("ctr32_one"));
925
926	&pshufd	($inout2,$rndkey1,1<<6);
927	&por	($inout1,$inout5);
928	&je	(&label("ctr32_two"));
929
930	&pshufd	($inout3,$rndkey0,3<<6);
931	&por	($inout2,$inout5);
932	&cmp	($len,4);
933	&jb	(&label("ctr32_three"));
934
935	&pshufd	($inout4,$rndkey0,2<<6);
936	&por	($inout3,$inout5);
937	&je	(&label("ctr32_four"));
938
	# five blocks: $inout5 goes through the 6x helper as well, but its
	# result is not stored
939	&por	($inout4,$inout5);
940	&call	("_aesni_encrypt6");
941	&movups	($rndkey1,&QWP(0,$inp));
942	&movups	($rndkey0,&QWP(0x10,$inp));
943	&xorps	($inout0,$rndkey1);
944	&movups	($rndkey1,&QWP(0x20,$inp));
945	&xorps	($inout1,$rndkey0);
946	&movups	($rndkey0,&QWP(0x30,$inp));
947	&xorps	($inout2,$rndkey1);
948	&movups	($rndkey1,&QWP(0x40,$inp));
949	&xorps	($inout3,$rndkey0);
950	&movups	(&QWP(0,$out),$inout0);
951	&xorps	($inout4,$rndkey1);
952	&movups	(&QWP(0x10,$out),$inout1);
953	&movups	(&QWP(0x20,$out),$inout2);
954	&movups	(&QWP(0x30,$out),$inout3);
955	&movups	(&QWP(0x40,$out),$inout4);
956	&jmp	(&label("ctr32_ret"));
957
958&set_label("ctr32_one_shortcut",16);
959	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
960	&mov	($rounds,&DWP(240,$key));
961
962&set_label("ctr32_one");
963	if ($inline)
964	{   &aesni_inline_generate1("enc");	}
965	else
966	{   &call	("_aesni_encrypt1");	}
967	&movups	($in0,&QWP(0,$inp));
968	&xorps	($in0,$inout0);
969	&movups	(&QWP(0,$out),$in0);
970	&jmp	(&label("ctr32_ret"));
971
972&set_label("ctr32_two",16);
973	&call	("_aesni_encrypt3");
974	&movups	($inout3,&QWP(0,$inp));
975	&movups	($inout4,&QWP(0x10,$inp));
976	&xorps	($inout0,$inout3);
977	&xorps	($inout1,$inout4);
978	&movups	(&QWP(0,$out),$inout0);
979	&movups	(&QWP(0x10,$out),$inout1);
980	&jmp	(&label("ctr32_ret"));
981
982&set_label("ctr32_three",16);
983	&call	("_aesni_encrypt3");
984	&movups	($inout3,&QWP(0,$inp));
985	&movups	($inout4,&QWP(0x10,$inp));
986	&xorps	($inout0,$inout3);
987	&movups	($inout5,&QWP(0x20,$inp));
988	&xorps	($inout1,$inout4);
989	&movups	(&QWP(0,$out),$inout0);
990	&xorps	($inout2,$inout5);
991	&movups	(&QWP(0x10,$out),$inout1);
992	&movups	(&QWP(0x20,$out),$inout2);
993	&jmp	(&label("ctr32_ret"));
994
995&set_label("ctr32_four",16);
996	&call	("_aesni_encrypt4");
997	&movups	($inout4,&QWP(0,$inp));
998	&movups	($inout5,&QWP(0x10,$inp));
999	&movups	($rndkey1,&QWP(0x20,$inp));
1000	&xorps	($inout0,$inout4);
1001	&movups	($rndkey0,&QWP(0x30,$inp));
1002	&xorps	($inout1,$inout5);
1003	&movups	(&QWP(0,$out),$inout0);
1004	&xorps	($inout2,$rndkey1);
1005	&movups	(&QWP(0x10,$out),$inout1);
1006	&xorps	($inout3,$rndkey0);
1007	&movups	(&QWP(0x20,$out),$inout2);
1008	&movups	(&QWP(0x30,$out),$inout3);
1009
1010&set_label("ctr32_ret");
1011	&mov	("esp",&DWP(80,"esp"));
1012&function_end("aesni_ctr32_encrypt_blocks");
1013
1014######################################################################
1015# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1016#	const AES_KEY *key1, const AES_KEY *key2,
1017#	const unsigned char iv[16]);
1018#
1019{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1020
# aesni_xts_encrypt -- entry and set-up.
#
# Arguments as fetched through wparam(): 0=inp, 1=out, 2=len, 3=key1,
# 4=key2, 5=pointer to the 16-byte tweak input.  The tweak input is run
# through the one-block encrypt helper (inline or _aesni_encrypt1, both
# defined outside this section) with key2; the result is the first tweak.
#
# Stack frame built here (16-byte aligned):
#	esp+16*0..16*5	scratch slots, used later for per-block tweaks
#	esp+16*6	128-bit constant {0x87,0,1,0}, loaded as $twmask
#	esp+16*7+0	saved $len
#	esp+16*7+4	%esp as it was on entry to this set-up
#
# Ends by dispatching to xts_enc_loop6, or to xts_enc_short when fewer
# than six whole blocks are available.
1021&function_begin("aesni_xts_encrypt");
1022	&mov	($key,&wparam(4));		# key2
1023	&mov	($inp,&wparam(5));		# clear-text tweak
1024
1025	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1026	&movups	($inout0,&QWP(0,$inp));	# load tweak input
1027	if ($inline)
1028	{   &aesni_inline_generate1("enc");	}
1029	else
1030	{   &call	("_aesni_encrypt1");	}
1031
1032	&mov	($inp,&wparam(0));
1033	&mov	($out,&wparam(1));
1034	&mov	($len,&wparam(2));
1035	&mov	($key,&wparam(3));		# key1
1036
1037	&mov	($key_,"esp");			# remember %esp before the frame is carved out
1038	&sub	("esp",16*7+8);			# seven 16-byte slots plus two dwords
1039	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1040	&and	("esp",-16);			# align stack
1041
1042	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1043	&mov	(&DWP(16*6+4,"esp"),0);
1044	&mov	(&DWP(16*6+8,"esp"),1);
1045	&mov	(&DWP(16*6+12,"esp"),0);
1046	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1047	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1048
1049	&movdqa	($tweak,$inout0);		# helper output becomes the first tweak
1050	&pxor	($twtmp,$twtmp);
1051	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1052	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1053
1054	&and	($len,-16);			# whole blocks only
1055	&mov	($key_,$key);			# backup $key
1056	&mov	($rounds_,$rounds);		# backup $rounds
1057	&sub	($len,16*6);
1058	&jc	(&label("xts_enc_short"));	# fewer than six blocks
1059
1060	&shr	($rounds,1);			# 6x loop keeps $rounds halved (undone after the loop)
1061	&mov	($rounds_,$rounds);
1062	&jmp	(&label("xts_enc_loop6"));
1063
# xts_enc_loop6 -- main loop, six blocks per iteration.
#
# On entry $tweak holds the tweak for the first block of the batch and
# $twtmp holds pcmpgtd(0,$tweak): an all-ones dword wherever the matching
# dword of $tweak has its top bit set.
#
# Tweak update, repeated per block: pshufd 0x13 moves the sign mask of
# dword 3 into dword 0 and that of dword 1 into dword 2; pand with $twmask
# {0x87,0,1,0} turns them into the 0x87 feedback for a carry out of bit 127
# and the carry from bit 63 into bit 64; paddq doubles both 64-bit halves;
# pxor merges the two.  Net effect: the 128-bit tweak is shifted left by
# one bit with 0x87 folded back in on overflow.
#
# $tweak and $twtmp share registers with $rndkey1 and $rndkey0 (see the my
# list that opens this scope), so both are overwritten once key material
# and the sixth input block are loaded; the last tweak is therefore parked
# at esp+16*5 and reloaded after the call.
1064&set_label("xts_enc_loop6",16);
	# Perl-level loop: runs at generation time and emits the update four
	# times, parking tweaks 0..3 at esp+16*0..16*3.
1065	for ($i=0;$i<4;$i++) {
1066	    &pshufd	($twres,$twtmp,0x13);
1067	    &pxor	($twtmp,$twtmp);
1068	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1069	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1070	    &pand	($twres,$twmask);	# isolate carry and residue
1071	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1072	    &pxor	($tweak,$twres);
1073	}
	# $i is 4 here: tweak 4 goes to slot 4, and the post-increment makes
	# the later 16*$i refer to slot 5.  The sixth tweak is built directly
	# in $inout5.
1074	&pshufd	($inout5,$twtmp,0x13);
1075	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1076	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1077	 &$movekey	($rndkey0,&QWP(0,$key_));
1078	&pand	($inout5,$twmask);		# isolate carry and residue
1079	 &movups	($inout0,&QWP(0,$inp));	# load input
1080	&pxor	($inout5,$tweak);
1081
1082	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1083	&movdqu	($inout1,&QWP(16*1,$inp));
1084	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1085	&movdqu	($inout2,&QWP(16*2,$inp));
1086	 &pxor		($inout1,$rndkey0);
1087	&movdqu	($inout3,&QWP(16*3,$inp));
1088	 &pxor		($inout2,$rndkey0);
1089	&movdqu	($inout4,&QWP(16*4,$inp));
1090	 &pxor		($inout3,$rndkey0);
1091	&movdqu	($rndkey1,&QWP(16*5,$inp));	# sixth input block; overwrites $tweak
1092	 &pxor		($inout4,$rndkey0);
1093	&lea	($inp,&DWP(16*6,$inp));
1094	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1095	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1096	&pxor	($inout5,$rndkey1);		# sixth tweak ^= sixth input
1097
1098	 &$movekey	($rndkey1,&QWP(16,$key_));
1099	 &lea		($key,&DWP(32,$key_));
1100	&pxor	($inout1,&QWP(16*1,"esp"));
1101	 &aesenc	($inout0,$rndkey1);
1102	&pxor	($inout2,&QWP(16*2,"esp"));
1103	 &aesenc	($inout1,$rndkey1);
1104	&pxor	($inout3,&QWP(16*3,"esp"));
1105	 &dec		($rounds);
1106	 &aesenc	($inout2,$rndkey1);
1107	&pxor	($inout4,&QWP(16*4,"esp"));
1108	 &aesenc	($inout3,$rndkey1);
1109	&pxor		($inout5,$rndkey0);	# ...and ^= rndkey[0], like the other five
1110	 &aesenc	($inout4,$rndkey1);
1111	 &$movekey	($rndkey0,&QWP(0,$key));
1112	 &aesenc	($inout5,$rndkey1);
	# NOTE(review): _aesni_encrypt6_enter is defined outside this section;
	# presumably it runs the remaining rounds on all six lanes -- confirm.
1113	&call		(&label("_aesni_encrypt6_enter"));
1114
1115	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1116       &pxor	($twtmp,$twtmp);
1117	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1118       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1119	&xorps	($inout1,&QWP(16*1,"esp"));
1120	&movups	(&QWP(16*0,$out),$inout0);	# write output
1121	&xorps	($inout2,&QWP(16*2,"esp"));
1122	&movups	(&QWP(16*1,$out),$inout1);
1123	&xorps	($inout3,&QWP(16*3,"esp"));
1124	&movups	(&QWP(16*2,$out),$inout2);
1125	&xorps	($inout4,&QWP(16*4,"esp"));
1126	&movups	(&QWP(16*3,$out),$inout3);
1127	&xorps	($inout5,$tweak);
1128	&movups	(&QWP(16*4,$out),$inout4);
1129       &pshufd	($twres,$twtmp,0x13);
1130	&movups	(&QWP(16*5,$out),$inout5);
1131	&lea	($out,&DWP(16*6,$out));
1132       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1133
	# advance the tweak once more so the next iteration (or the short
	# path) starts with a fresh, unused tweak and a matching $twtmp
1134	&pxor	($twtmp,$twtmp);
1135	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1136	&pand	($twres,$twmask);		# isolate carry and residue
1137	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1138	&mov	($rounds,$rounds_);		# restore $rounds
1139	&pxor	($tweak,$twres);
1140
1141	&sub	($len,16*6);
1142	&jnc	(&label("xts_enc_loop6"));	# at least six more blocks
1143
1144	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
1145	&mov	($key,$key_);			# restore $key
1146	&mov	($rounds_,$rounds);
1147
# xts_enc_short -- zero to five whole blocks remain.
#
# Adding 16*6 back undoes the subtraction that borrowed, leaving the number
# of whole-block bytes still to process in $len.  Tweaks are produced one at
# a time with the same shift-and-fold update as the 6x loop, branching out
# as soon as enough of them exist.  Each je reuses the flags of the
# preceding cmp: the SSE instructions in between do not modify EFLAGS.
# The fall-through case handles five blocks.
1148&set_label("xts_enc_short");
1149	&add	($len,16*6);
1150	&jz	(&label("xts_enc_done6x"));	# no whole blocks left
1151
1152	&movdqa	($inout3,$tweak);		# put aside previous tweak
1153	&cmp	($len,0x20);
1154	&jb	(&label("xts_enc_one"));
1155
1156	&pshufd	($twres,$twtmp,0x13);
1157	&pxor	($twtmp,$twtmp);
1158	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1159	&pand	($twres,$twmask);		# isolate carry and residue
1160	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1161	&pxor	($tweak,$twres);
1162	&je	(&label("xts_enc_two"));	# flags still from cmp($len,0x20)
1163
1164	&pshufd	($twres,$twtmp,0x13);
1165	&pxor	($twtmp,$twtmp);
1166	&movdqa	($inout4,$tweak);		# put aside previous tweak
1167	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1168	&pand	($twres,$twmask);		# isolate carry and residue
1169	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1170	&pxor	($tweak,$twres);
1171	&cmp	($len,0x40);
1172	&jb	(&label("xts_enc_three"));
1173
1174	&pshufd	($twres,$twtmp,0x13);
1175	&pxor	($twtmp,$twtmp);
1176	&movdqa	($inout5,$tweak);		# put aside previous tweak
1177	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1178	&pand	($twres,$twmask);		# isolate carry and residue
1179	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1180	&pxor	($tweak,$twres);
1181	&movdqa	(&QWP(16*0,"esp"),$inout3);	# tweaks 0 and 1 go to the frame
1182	&movdqa	(&QWP(16*1,"esp"),$inout4);
1183	&je	(&label("xts_enc_four"));	# flags still from cmp($len,0x40)
1184
	# five blocks: tweaks 0..3 live in the frame, the fifth is built in
	# $inout5 and parked at esp+16*4
1185	&movdqa	(&QWP(16*2,"esp"),$inout5);
1186	&pshufd	($inout5,$twtmp,0x13);
1187	&movdqa	(&QWP(16*3,"esp"),$tweak);
1188	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1189	&pand	($inout5,$twmask);		# isolate carry and residue
1190	&pxor	($inout5,$tweak);
1191
1192	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1193	&movdqu	($inout1,&QWP(16*1,$inp));
1194	&movdqu	($inout2,&QWP(16*2,$inp));
1195	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1196	&movdqu	($inout3,&QWP(16*3,$inp));
1197	&pxor	($inout1,&QWP(16*1,"esp"));
1198	&movdqu	($inout4,&QWP(16*4,$inp));
1199	&pxor	($inout2,&QWP(16*2,"esp"));
1200	&lea	($inp,&DWP(16*5,$inp));
1201	&pxor	($inout3,&QWP(16*3,"esp"));
1202	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1203	&pxor	($inout4,$inout5);
1204
	# six-lane helper is used although only five lanes carry data; only
	# five results are written out below
1205	&call	("_aesni_encrypt6");
1206
1207	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1208	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1209	&xorps	($inout1,&QWP(16*1,"esp"));
1210	&xorps	($inout2,&QWP(16*2,"esp"));
1211	&movups	(&QWP(16*0,$out),$inout0);	# write output
1212	&xorps	($inout3,&QWP(16*3,"esp"));
1213	&movups	(&QWP(16*1,$out),$inout1);
1214	&xorps	($inout4,$tweak);
1215	&movups	(&QWP(16*2,$out),$inout2);
1216	&movups	(&QWP(16*3,$out),$inout3);
1217	&movups	(&QWP(16*4,$out),$inout4);
1218	&lea	($out,&DWP(16*5,$out));
1219	&jmp	(&label("xts_enc_done"));
1220
# xts_enc_one -- exactly one whole block remains; its tweak is in $inout3.
# Leaves $tweak = the tweak just used, which xts_enc_done advances if a
# partial block follows.
1221&set_label("xts_enc_one",16);
1222	&movups	($inout0,&QWP(16*0,$inp));	# load input
1223	&lea	($inp,&DWP(16*1,$inp));
1224	&xorps	($inout0,$inout3);		# input^=tweak
1225	if ($inline)
1226	{   &aesni_inline_generate1("enc");	}
1227	else
1228	{   &call	("_aesni_encrypt1");	}
1229	&xorps	($inout0,$inout3);		# output^=tweak
1230	&movups	(&QWP(16*0,$out),$inout0);	# write output
1231	&lea	($out,&DWP(16*1,$out));
1232
1233	&movdqa	($tweak,$inout3);		# last tweak
1234	&jmp	(&label("xts_enc_done"));
1235
# xts_enc_two -- two whole blocks remain.
# Tweaks: $inout3 for block 0, $tweak (copied to $inout4) for block 1.
# The three-lane helper is used; the third lane is cleared first and its
# result is not stored.
1236&set_label("xts_enc_two",16);
1237	&movaps	($inout4,$tweak);		# put aside last tweak
1238
1239	&movups	($inout0,&QWP(16*0,$inp));	# load input
1240	&movups	($inout1,&QWP(16*1,$inp));
1241	&lea	($inp,&DWP(16*2,$inp));
1242	&xorps	($inout0,$inout3);		# input^=tweak
1243	&xorps	($inout1,$inout4);
1244	&xorps	($inout2,$inout2);		# zero the unused third lane
1245
1246	&call	("_aesni_encrypt3");
1247
1248	&xorps	($inout0,$inout3);		# output^=tweak
1249	&xorps	($inout1,$inout4);
1250	&movups	(&QWP(16*0,$out),$inout0);	# write output
1251	&movups	(&QWP(16*1,$out),$inout1);
1252	&lea	($out,&DWP(16*2,$out));
1253
1254	&movdqa	($tweak,$inout4);		# last tweak
1255	&jmp	(&label("xts_enc_done"));
1256
# xts_enc_three -- three whole blocks remain.
# Tweaks: $inout3, $inout4 and $tweak (copied to $inout5), in block order.
1257&set_label("xts_enc_three",16);
1258	&movaps	($inout5,$tweak);		# put aside last tweak
1259	&movups	($inout0,&QWP(16*0,$inp));	# load input
1260	&movups	($inout1,&QWP(16*1,$inp));
1261	&movups	($inout2,&QWP(16*2,$inp));
1262	&lea	($inp,&DWP(16*3,$inp));
1263	&xorps	($inout0,$inout3);		# input^=tweak
1264	&xorps	($inout1,$inout4);
1265	&xorps	($inout2,$inout5);
1266
1267	&call	("_aesni_encrypt3");
1268
1269	&xorps	($inout0,$inout3);		# output^=tweak
1270	&xorps	($inout1,$inout4);
1271	&xorps	($inout2,$inout5);
1272	&movups	(&QWP(16*0,$out),$inout0);	# write output
1273	&movups	(&QWP(16*1,$out),$inout1);
1274	&movups	(&QWP(16*2,$out),$inout2);
1275	&lea	($out,&DWP(16*3,$out));
1276
1277	&movdqa	($tweak,$inout5);		# last tweak
1278	&jmp	(&label("xts_enc_done"));
1279
# xts_enc_four -- four whole blocks remain.
# Tweaks in block order: esp+16*0, esp+16*1, $inout5, $tweak.  The first
# two were stored to the frame just before branching here, which is why
# $inout4 (previously the second tweak) and $inout3 can be reused below.
1280&set_label("xts_enc_four",16);
1281	&movaps	($inout4,$tweak);		# put aside last tweak
1282
1283	&movups	($inout0,&QWP(16*0,$inp));	# load input
1284	&movups	($inout1,&QWP(16*1,$inp));
1285	&movups	($inout2,&QWP(16*2,$inp));
1286	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1287	&movups	($inout3,&QWP(16*3,$inp));
1288	&lea	($inp,&DWP(16*4,$inp));
1289	&xorps	($inout1,&QWP(16*1,"esp"));
1290	&xorps	($inout2,$inout5);
1291	&xorps	($inout3,$inout4);
1292
1293	&call	("_aesni_encrypt4");
1294
1295	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1296	&xorps	($inout1,&QWP(16*1,"esp"));
1297	&xorps	($inout2,$inout5);
1298	&movups	(&QWP(16*0,$out),$inout0);	# write output
1299	&xorps	($inout3,$inout4);
1300	&movups	(&QWP(16*1,$out),$inout1);
1301	&movups	(&QWP(16*2,$out),$inout2);
1302	&movups	(&QWP(16*3,$out),$inout3);
1303	&lea	($out,&DWP(16*4,$out));
1304
1305	&movdqa	($tweak,$inout4);		# last tweak
1306	&jmp	(&label("xts_enc_done"));
1307
# Epilogue of aesni_xts_encrypt: handle a trailing partial block by
# ciphertext stealing, then restore %esp and return.
#
# xts_enc_done6x: entered when no whole blocks were left to process in the
#	short path; $tweak already is the next unused tweak.
# xts_enc_done:	entered from the 1..5-block paths with $tweak = the tweak
#	used last; it is advanced once into $inout3.
# xts_enc_steal:	byte loop over the $len%16 trailing input bytes.  Each
#	input byte replaces the byte at the same offset in the previously
#	written output block, and the byte it displaces is written to the
#	final, partial output block.  The patched block is then encrypted
#	in place with the tweak in $inout3.
1308&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
1309	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1310	&and	($len,15);
1311	&jz	(&label("xts_enc_ret"));	# length was a multiple of 16
1312	&movdqa	($inout3,$tweak);
1313	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1314	&jmp	(&label("xts_enc_steal"));
1315
1316&set_label("xts_enc_done",16);
1317	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1318	&pxor	($twtmp,$twtmp);
1319	&and	($len,15);
1320	&jz	(&label("xts_enc_ret"));	# length was a multiple of 16
1321
1322	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1323	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1324	&pshufd	($inout3,$twtmp,0x13);
1325	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1326	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
1327	&pxor	($inout3,$tweak);
1328
1329&set_label("xts_enc_steal");
	# $rounds and $key serve as byte temporaries here; both are restored
	# from their backups right after the loop
1330	&movz	($rounds,&BP(0,$inp));
1331	&movz	($key,&BP(-16,$out));
1332	&lea	($inp,&DWP(1,$inp));
1333	&mov	(&BP(-16,$out),&LB($rounds));
1334	&mov	(&BP(0,$out),&LB($key));
1335	&lea	($out,&DWP(1,$out));
1336	&sub	($len,1);
1337	&jnz	(&label("xts_enc_steal"));
1338
1339	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1340	&mov	($key,$key_);			# restore $key
1341	&mov	($rounds,$rounds_);		# restore $rounds
1342
1343	&movups	($inout0,&QWP(-16,$out));	# load input
1344	&xorps	($inout0,$inout3);		# input^=tweak
1345	if ($inline)
1346	{   &aesni_inline_generate1("enc");	}
1347	else
1348	{   &call	("_aesni_encrypt1");	}
1349	&xorps	($inout0,$inout3);		# output^=tweak
1350	&movups	(&QWP(-16,$out),$inout0);	# write output
1351
1352&set_label("xts_enc_ret");
1353	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1354&function_end("aesni_xts_encrypt");
1355
# aesni_xts_decrypt -- entry and set-up.
#
# Arguments as fetched through wparam(): 0=inp, 1=out, 2=len, 3=key1,
# 4=key2, 5=pointer to the 16-byte tweak input.  As on the encrypt side,
# the first tweak is produced with the one-block *encrypt* helper under
# key2; only the data blocks are decrypted.
#
# If len is not a multiple of 16, 16 is subtracted up front so that the
# last whole block is held back for the stealing code in the epilogue.
# Subtracting 16 leaves len%16 unchanged, so the epilogue can still recover
# the remainder from the saved value.
#
# Stack frame (16-byte aligned):
#	esp+16*0..16*5	scratch slots, used later for per-block tweaks
#	esp+16*6	128-bit constant {0x87,0,1,0}, loaded as $twmask
#	esp+16*7+0	saved (already adjusted) $len
#	esp+16*7+4	%esp as it was on entry to this set-up
1356&function_begin("aesni_xts_decrypt");
1357	&mov	($key,&wparam(4));		# key2
1358	&mov	($inp,&wparam(5));		# clear-text tweak
1359
1360	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1361	&movups	($inout0,&QWP(0,$inp));	# load tweak input
1362	if ($inline)
1363	{   &aesni_inline_generate1("enc");	}
1364	else
1365	{   &call	("_aesni_encrypt1");	}
1366
1367	&mov	($inp,&wparam(0));
1368	&mov	($out,&wparam(1));
1369	&mov	($len,&wparam(2));
1370	&mov	($key,&wparam(3));		# key1
1371
1372	&mov	($key_,"esp");			# remember %esp before the frame is carved out
1373	&sub	("esp",16*7+8);			# seven 16-byte slots plus two dwords
1374	&and	("esp",-16);			# align stack
1375
1376	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
1377	&test	($len,15);
1378	&setnz	(&LB($rounds_));		# 1 when a partial block exists
1379	&shl	($rounds_,4);			# ...scaled to 16
1380	&sub	($len,$rounds_);
1381
1382	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1383	&mov	(&DWP(16*6+4,"esp"),0);
1384	&mov	(&DWP(16*6+8,"esp"),1);
1385	&mov	(&DWP(16*6+12,"esp"),0);
1386	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len (after the adjustment above)
1387	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1388
1389	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1390	&mov	($key_,$key);			# backup $key
1391	&mov	($rounds_,$rounds);		# backup $rounds
1392
1393	&movdqa	($tweak,$inout0);		# helper output becomes the first tweak
1394	&pxor	($twtmp,$twtmp);
1395	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1396	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1397
1398	&and	($len,-16);			# whole blocks only
1399	&sub	($len,16*6);
1400	&jc	(&label("xts_dec_short"));	# fewer than six blocks
1401
1402	&shr	($rounds,1);			# 6x loop keeps $rounds halved (undone after the loop)
1403	&mov	($rounds_,$rounds);
1404	&jmp	(&label("xts_dec_loop6"));
1405
# xts_dec_loop6 -- main loop, six blocks per iteration (decrypt side).
#
# On entry $tweak holds the tweak for the first block of the batch and
# $twtmp holds pcmpgtd(0,$tweak): an all-ones dword wherever the matching
# dword of $tweak has its top bit set.
#
# Tweak update, repeated per block: pshufd 0x13 moves the sign mask of
# dword 3 into dword 0 and that of dword 1 into dword 2; pand with $twmask
# {0x87,0,1,0} turns them into the 0x87 feedback for a carry out of bit 127
# and the carry from bit 63 into bit 64; paddq doubles both 64-bit halves;
# pxor merges the two.  Net effect: the 128-bit tweak is shifted left by
# one bit with 0x87 folded back in on overflow.
#
# $tweak and $twtmp share registers with $rndkey1 and $rndkey0 (see the my
# list that opens this scope), so both are overwritten once key material
# and the sixth input block are loaded; the last tweak is therefore parked
# at esp+16*5 and reloaded after the call.
1406&set_label("xts_dec_loop6",16);
	# Perl-level loop: runs at generation time and emits the update four
	# times, parking tweaks 0..3 at esp+16*0..16*3.
1407	for ($i=0;$i<4;$i++) {
1408	    &pshufd	($twres,$twtmp,0x13);
1409	    &pxor	($twtmp,$twtmp);
1410	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1411	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1412	    &pand	($twres,$twmask);	# isolate carry and residue
1413	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1414	    &pxor	($tweak,$twres);
1415	}
	# $i is 4 here: tweak 4 goes to slot 4, and the post-increment makes
	# the later 16*$i refer to slot 5.  The sixth tweak is built directly
	# in $inout5.
1416	&pshufd	($inout5,$twtmp,0x13);
1417	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1418	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1419	 &$movekey	($rndkey0,&QWP(0,$key_));
1420	&pand	($inout5,$twmask);		# isolate carry and residue
1421	 &movups	($inout0,&QWP(0,$inp));	# load input
1422	&pxor	($inout5,$tweak);
1423
1424	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1425	&movdqu	($inout1,&QWP(16*1,$inp));
1426	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1427	&movdqu	($inout2,&QWP(16*2,$inp));
1428	 &pxor		($inout1,$rndkey0);
1429	&movdqu	($inout3,&QWP(16*3,$inp));
1430	 &pxor		($inout2,$rndkey0);
1431	&movdqu	($inout4,&QWP(16*4,$inp));
1432	 &pxor		($inout3,$rndkey0);
1433	&movdqu	($rndkey1,&QWP(16*5,$inp));	# sixth input block; overwrites $tweak
1434	 &pxor		($inout4,$rndkey0);
1435	&lea	($inp,&DWP(16*6,$inp));
1436	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1437	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1438	&pxor	($inout5,$rndkey1);		# sixth tweak ^= sixth input
1439
1440	 &$movekey	($rndkey1,&QWP(16,$key_));
1441	 &lea		($key,&DWP(32,$key_));
1442	&pxor	($inout1,&QWP(16*1,"esp"));
1443	 &aesdec	($inout0,$rndkey1);
1444	&pxor	($inout2,&QWP(16*2,"esp"));
1445	 &aesdec	($inout1,$rndkey1);
1446	&pxor	($inout3,&QWP(16*3,"esp"));
1447	 &dec		($rounds);
1448	 &aesdec	($inout2,$rndkey1);
1449	&pxor	($inout4,&QWP(16*4,"esp"));
1450	 &aesdec	($inout3,$rndkey1);
1451	&pxor		($inout5,$rndkey0);	# ...and ^= rndkey[0], like the other five
1452	 &aesdec	($inout4,$rndkey1);
1453	 &$movekey	($rndkey0,&QWP(0,$key));
1454	 &aesdec	($inout5,$rndkey1);
	# NOTE(review): _aesni_decrypt6_enter is defined outside this section;
	# presumably it runs the remaining rounds on all six lanes -- confirm.
1455	&call		(&label("_aesni_decrypt6_enter"));
1456
1457	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1458       &pxor	($twtmp,$twtmp);
1459	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1460       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1461	&xorps	($inout1,&QWP(16*1,"esp"));
1462	&movups	(&QWP(16*0,$out),$inout0);	# write output
1463	&xorps	($inout2,&QWP(16*2,"esp"));
1464	&movups	(&QWP(16*1,$out),$inout1);
1465	&xorps	($inout3,&QWP(16*3,"esp"));
1466	&movups	(&QWP(16*2,$out),$inout2);
1467	&xorps	($inout4,&QWP(16*4,"esp"));
1468	&movups	(&QWP(16*3,$out),$inout3);
1469	&xorps	($inout5,$tweak);
1470	&movups	(&QWP(16*4,$out),$inout4);
1471       &pshufd	($twres,$twtmp,0x13);
1472	&movups	(&QWP(16*5,$out),$inout5);
1473	&lea	($out,&DWP(16*6,$out));
1474       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1475
	# advance the tweak once more so the next iteration (or the short
	# path) starts with a fresh, unused tweak and a matching $twtmp
1476	&pxor	($twtmp,$twtmp);
1477	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1478	&pand	($twres,$twmask);		# isolate carry and residue
1479	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1480	&mov	($rounds,$rounds_);		# restore $rounds
1481	&pxor	($tweak,$twres);
1482
1483	&sub	($len,16*6);
1484	&jnc	(&label("xts_dec_loop6"));	# at least six more blocks
1485
1486	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
1487	&mov	($key,$key_);			# restore $key
1488	&mov	($rounds_,$rounds);
1489
# xts_dec_short -- zero to five whole blocks remain (decrypt side).
#
# Adding 16*6 back undoes the subtraction that borrowed, leaving the number
# of whole-block bytes still to process in $len.  Tweaks are produced one at
# a time with the same shift-and-fold update as the 6x loop, branching out
# as soon as enough of them exist.  Each je reuses the flags of the
# preceding cmp: the SSE instructions in between do not modify EFLAGS.
# The fall-through case handles five blocks.
1490&set_label("xts_dec_short");
1491	&add	($len,16*6);
1492	&jz	(&label("xts_dec_done6x"));	# no whole blocks left
1493
1494	&movdqa	($inout3,$tweak);		# put aside previous tweak
1495	&cmp	($len,0x20);
1496	&jb	(&label("xts_dec_one"));
1497
1498	&pshufd	($twres,$twtmp,0x13);
1499	&pxor	($twtmp,$twtmp);
1500	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1501	&pand	($twres,$twmask);		# isolate carry and residue
1502	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1503	&pxor	($tweak,$twres);
1504	&je	(&label("xts_dec_two"));	# flags still from cmp($len,0x20)
1505
1506	&pshufd	($twres,$twtmp,0x13);
1507	&pxor	($twtmp,$twtmp);
1508	&movdqa	($inout4,$tweak);		# put aside previous tweak
1509	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1510	&pand	($twres,$twmask);		# isolate carry and residue
1511	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1512	&pxor	($tweak,$twres);
1513	&cmp	($len,0x40);
1514	&jb	(&label("xts_dec_three"));
1515
1516	&pshufd	($twres,$twtmp,0x13);
1517	&pxor	($twtmp,$twtmp);
1518	&movdqa	($inout5,$tweak);		# put aside previous tweak
1519	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1520	&pand	($twres,$twmask);		# isolate carry and residue
1521	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1522	&pxor	($tweak,$twres);
1523	&movdqa	(&QWP(16*0,"esp"),$inout3);	# tweaks 0 and 1 go to the frame
1524	&movdqa	(&QWP(16*1,"esp"),$inout4);
1525	&je	(&label("xts_dec_four"));	# flags still from cmp($len,0x40)
1526
	# five blocks: tweaks 0..3 live in the frame, the fifth is built in
	# $inout5 and parked at esp+16*4
1527	&movdqa	(&QWP(16*2,"esp"),$inout5);
1528	&pshufd	($inout5,$twtmp,0x13);
1529	&movdqa	(&QWP(16*3,"esp"),$tweak);
1530	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1531	&pand	($inout5,$twmask);		# isolate carry and residue
1532	&pxor	($inout5,$tweak);
1533
1534	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1535	&movdqu	($inout1,&QWP(16*1,$inp));
1536	&movdqu	($inout2,&QWP(16*2,$inp));
1537	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1538	&movdqu	($inout3,&QWP(16*3,$inp));
1539	&pxor	($inout1,&QWP(16*1,"esp"));
1540	&movdqu	($inout4,&QWP(16*4,$inp));
1541	&pxor	($inout2,&QWP(16*2,"esp"));
1542	&lea	($inp,&DWP(16*5,$inp));
1543	&pxor	($inout3,&QWP(16*3,"esp"));
1544	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1545	&pxor	($inout4,$inout5);
1546
	# six-lane helper is used although only five lanes carry data; only
	# five results are written out below
1547	&call	("_aesni_decrypt6");
1548
1549	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1550	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1551	&xorps	($inout1,&QWP(16*1,"esp"));
1552	&xorps	($inout2,&QWP(16*2,"esp"));
1553	&movups	(&QWP(16*0,$out),$inout0);	# write output
1554	&xorps	($inout3,&QWP(16*3,"esp"));
1555	&movups	(&QWP(16*1,$out),$inout1);
1556	&xorps	($inout4,$tweak);
1557	&movups	(&QWP(16*2,$out),$inout2);
1558	&movups	(&QWP(16*3,$out),$inout3);
1559	&movups	(&QWP(16*4,$out),$inout4);
1560	&lea	($out,&DWP(16*5,$out));
1561	&jmp	(&label("xts_dec_done"));
1562
# xts_dec_one -- exactly one whole block remains; its tweak is in $inout3.
# Leaves $tweak = the tweak just used, which xts_dec_done advances if a
# partial block follows.
1563&set_label("xts_dec_one",16);
1564	&movups	($inout0,&QWP(16*0,$inp));	# load input
1565	&lea	($inp,&DWP(16*1,$inp));
1566	&xorps	($inout0,$inout3);		# input^=tweak
1567	if ($inline)
1568	{   &aesni_inline_generate1("dec");	}
1569	else
1570	{   &call	("_aesni_decrypt1");	}
1571	&xorps	($inout0,$inout3);		# output^=tweak
1572	&movups	(&QWP(16*0,$out),$inout0);	# write output
1573	&lea	($out,&DWP(16*1,$out));
1574
1575	&movdqa	($tweak,$inout3);		# last tweak
1576	&jmp	(&label("xts_dec_done"));
1577
# xts_dec_two -- two whole blocks remain.
# Tweaks: $inout3 for block 0, $tweak (copied to $inout4) for block 1.
# NOTE(review): unlike xts_enc_two and cbc_dec_two, $inout2 is not cleared
# before the three-lane helper is called.  Its result is not stored here,
# so this looks harmless, but the asymmetry is worth confirming.
1578&set_label("xts_dec_two",16);
1579	&movaps	($inout4,$tweak);		# put aside last tweak
1580
1581	&movups	($inout0,&QWP(16*0,$inp));	# load input
1582	&movups	($inout1,&QWP(16*1,$inp));
1583	&lea	($inp,&DWP(16*2,$inp));
1584	&xorps	($inout0,$inout3);		# input^=tweak
1585	&xorps	($inout1,$inout4);
1586
1587	&call	("_aesni_decrypt3");
1588
1589	&xorps	($inout0,$inout3);		# output^=tweak
1590	&xorps	($inout1,$inout4);
1591	&movups	(&QWP(16*0,$out),$inout0);	# write output
1592	&movups	(&QWP(16*1,$out),$inout1);
1593	&lea	($out,&DWP(16*2,$out));
1594
1595	&movdqa	($tweak,$inout4);		# last tweak
1596	&jmp	(&label("xts_dec_done"));
1597
# xts_dec_three -- three whole blocks remain.
# Tweaks: $inout3, $inout4 and $tweak (copied to $inout5), in block order.
1598&set_label("xts_dec_three",16);
1599	&movaps	($inout5,$tweak);		# put aside last tweak
1600	&movups	($inout0,&QWP(16*0,$inp));	# load input
1601	&movups	($inout1,&QWP(16*1,$inp));
1602	&movups	($inout2,&QWP(16*2,$inp));
1603	&lea	($inp,&DWP(16*3,$inp));
1604	&xorps	($inout0,$inout3);		# input^=tweak
1605	&xorps	($inout1,$inout4);
1606	&xorps	($inout2,$inout5);
1607
1608	&call	("_aesni_decrypt3");
1609
1610	&xorps	($inout0,$inout3);		# output^=tweak
1611	&xorps	($inout1,$inout4);
1612	&xorps	($inout2,$inout5);
1613	&movups	(&QWP(16*0,$out),$inout0);	# write output
1614	&movups	(&QWP(16*1,$out),$inout1);
1615	&movups	(&QWP(16*2,$out),$inout2);
1616	&lea	($out,&DWP(16*3,$out));
1617
1618	&movdqa	($tweak,$inout5);		# last tweak
1619	&jmp	(&label("xts_dec_done"));
1620
# xts_dec_four -- four whole blocks remain.
# Tweaks in block order: esp+16*0, esp+16*1, $inout5, $tweak.  The first
# two were stored to the frame just before branching here, which is why
# $inout4 (previously the second tweak) and $inout3 can be reused below.
1621&set_label("xts_dec_four",16);
1622	&movaps	($inout4,$tweak);		# put aside last tweak
1623
1624	&movups	($inout0,&QWP(16*0,$inp));	# load input
1625	&movups	($inout1,&QWP(16*1,$inp));
1626	&movups	($inout2,&QWP(16*2,$inp));
1627	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1628	&movups	($inout3,&QWP(16*3,$inp));
1629	&lea	($inp,&DWP(16*4,$inp));
1630	&xorps	($inout1,&QWP(16*1,"esp"));
1631	&xorps	($inout2,$inout5);
1632	&xorps	($inout3,$inout4);
1633
1634	&call	("_aesni_decrypt4");
1635
1636	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1637	&xorps	($inout1,&QWP(16*1,"esp"));
1638	&xorps	($inout2,$inout5);
1639	&movups	(&QWP(16*0,$out),$inout0);	# write output
1640	&xorps	($inout3,$inout4);
1641	&movups	(&QWP(16*1,$out),$inout1);
1642	&movups	(&QWP(16*2,$out),$inout2);
1643	&movups	(&QWP(16*3,$out),$inout3);
1644	&lea	($out,&DWP(16*4,$out));
1645
1646	&movdqa	($tweak,$inout4);		# last tweak
1647	&jmp	(&label("xts_dec_done"));
1648
# Epilogue of aesni_xts_decrypt: handle a trailing partial block by
# ciphertext stealing, then restore %esp and return.
#
# When the length is not a multiple of 16 the set-up code held back the last
# whole block.  Here that block is decrypted with the tweak that FOLLOWS the
# current one ($inout3), the trailing bytes are swapped in, and the
# re-assembled block is decrypted with the current tweak ($inout4).
#
# xts_dec_done6x: entered when no whole blocks were left to process in the
#	short path.  NOTE(review): relies on $tweak, $twtmp and $twmask
#	still holding what the 6x loop or the set-up code left in them.
# xts_dec_done:	entered from the 1..5-block paths with $tweak = the tweak
#	used last; it is advanced once, $twtmp is recomputed and $twmask is
#	reloaded from the frame (its register was used for data meanwhile).
# xts_dec_steal:	byte loop over the $len%16 trailing bytes.  Each input
#	byte at offset 16 replaces the byte at the same offset in the block
#	just written, and the byte it displaces is written 16 bytes further
#	on, forming the final partial output block.
1649&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
1650	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1651	&and	($len,15);
1652	&jz	(&label("xts_dec_ret"));	# length was a multiple of 16
1653	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1654	&jmp	(&label("xts_dec_only_one_more"));
1655
1656&set_label("xts_dec_done",16);
1657	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1658	&pxor	($twtmp,$twtmp);
1659	&and	($len,15);
1660	&jz	(&label("xts_dec_ret"));	# length was a multiple of 16
1661
1662	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1663	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1664	&pshufd	($twres,$twtmp,0x13);
1665	&pxor	($twtmp,$twtmp);
1666	&movdqa	($twmask,&QWP(16*6,"esp"));	# reload 0x0...010...87
1667	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1668	&pand	($twres,$twmask);		# isolate carry and residue
1669	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1670	&pxor	($tweak,$twres);
1671
1672&set_label("xts_dec_only_one_more");
	# $inout4 = current tweak, $inout3 = the one after it
1673	&pshufd	($inout3,$twtmp,0x13);
1674	&movdqa	($inout4,$tweak);		# put aside previous tweak
1675	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1676	&pand	($inout3,$twmask);		# isolate carry and residue
1677	&pxor	($inout3,$tweak);
1678
1679	&mov	($key,$key_);			# restore $key
1680	&mov	($rounds,$rounds_);		# restore $rounds
1681
1682	&movups	($inout0,&QWP(0,$inp));		# load input
1683	&xorps	($inout0,$inout3);		# input^=tweak
1684	if ($inline)
1685	{   &aesni_inline_generate1("dec");	}
1686	else
1687	{   &call	("_aesni_decrypt1");	}
1688	&xorps	($inout0,$inout3);		# output^=tweak
1689	&movups	(&QWP(0,$out),$inout0);		# write output
1690
1691&set_label("xts_dec_steal");
	# $rounds and $key serve as byte temporaries here; both are restored
	# from their backups right after the loop
1692	&movz	($rounds,&BP(16,$inp));
1693	&movz	($key,&BP(0,$out));
1694	&lea	($inp,&DWP(1,$inp));
1695	&mov	(&BP(0,$out),&LB($rounds));
1696	&mov	(&BP(16,$out),&LB($key));
1697	&lea	($out,&DWP(1,$out));
1698	&sub	($len,1);
1699	&jnz	(&label("xts_dec_steal"));
1700
1701	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1702	&mov	($key,$key_);			# restore $key
1703	&mov	($rounds,$rounds_);		# restore $rounds
1704
1705	&movups	($inout0,&QWP(0,$out));		# load input
1706	&xorps	($inout0,$inout4);		# input^=tweak
1707	if ($inline)
1708	{   &aesni_inline_generate1("dec");	}
1709	else
1710	{   &call	("_aesni_decrypt1");	}
1711	&xorps	($inout0,$inout4);		# output^=tweak
1712	&movups	(&QWP(0,$out),$inout0);		# write output
1713
1714&set_label("xts_dec_ret");
1715	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1716&function_end("aesni_xts_decrypt");
1717}
1718}
1719
1720######################################################################
1721# void $PREFIX_cbc_encrypt (const void *inp, void *out,
1722#                           size_t length, const AES_KEY *key,
1723#                           unsigned char *ivp,const int enc);
1724&function_begin("${PREFIX}_cbc_encrypt");
1725	&mov	($inp,&wparam(0));
1726	&mov	($rounds_,"esp");
1727	&mov	($out,&wparam(1));
1728	&sub	($rounds_,24);
1729	&mov	($len,&wparam(2));
1730	&and	($rounds_,-16);
1731	&mov	($key,&wparam(3));
1732	&mov	($key_,&wparam(4));
1733	&test	($len,$len);
1734	&jz	(&label("cbc_abort"));
1735
1736	&cmp	(&wparam(5),0);
1737	&xchg	($rounds_,"esp");		# alloca
1738	&movups	($ivec,&QWP(0,$key_));		# load IV
1739	&mov	($rounds,&DWP(240,$key));
1740	&mov	($key_,$key);			# backup $key
1741	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
1742	&mov	($rounds_,$rounds);		# backup $rounds
1743	&je	(&label("cbc_decrypt"));
1744
1745	&movaps	($inout0,$ivec);
1746	&cmp	($len,16);
1747	&jb	(&label("cbc_enc_tail"));
1748	&sub	($len,16);
1749	&jmp	(&label("cbc_enc_loop"));
1750
1751&set_label("cbc_enc_loop",16);
1752	&movups	($ivec,&QWP(0,$inp));		# input actually
1753	&lea	($inp,&DWP(16,$inp));
1754	if ($inline)
1755	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
1756	else
1757	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
1758	&mov	($rounds,$rounds_);	# restore $rounds
1759	&mov	($key,$key_);		# restore $key
1760	&movups	(&QWP(0,$out),$inout0);	# store output
1761	&lea	($out,&DWP(16,$out));
1762	&sub	($len,16);
1763	&jnc	(&label("cbc_enc_loop"));
1764	&add	($len,16);
1765	&jnz	(&label("cbc_enc_tail"));
1766	&movaps	($ivec,$inout0);
1767	&jmp	(&label("cbc_ret"));
1768
1769&set_label("cbc_enc_tail");
1770	&mov	("ecx",$len);		# zaps $rounds
1771	&data_word(0xA4F3F689);		# rep movsb
1772	&mov	("ecx",16);		# zero tail
1773	&sub	("ecx",$len);
1774	&xor	("eax","eax");		# zaps $len
1775	&data_word(0xAAF3F689);		# rep stosb
1776	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
1777	&mov	($rounds,$rounds_);	# restore $rounds
1778	&mov	($inp,$out);		# $inp and $out are the same
1779	&mov	($key,$key_);		# restore $key
1780	&jmp	(&label("cbc_enc_loop"));
1781######################################################################
1782&set_label("cbc_decrypt",16);
1783	&cmp	($len,0x50);
1784	&jbe	(&label("cbc_dec_tail"));
1785	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
1786	&sub	($len,0x50);
1787	&jmp	(&label("cbc_dec_loop6_enter"));
1788
# cbc_dec_loop6 -- CBC decryption, six blocks per iteration.
#
# Five results are stored inside the iteration.  The sixth ($inout5) is
# stored at the top of the next iteration, together with saving the last
# input block of the batch ($rndkey0) as the IV for the following batch.
# cbc_dec_loop6_enter is the first-iteration entry point, which skips that
# deferred store.
1789&set_label("cbc_dec_loop6",16);
1790	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
1791	&movups	(&QWP(0,$out),$inout5);		# deferred store of the sixth result
1792	&lea	($out,&DWP(0x10,$out));
1793&set_label("cbc_dec_loop6_enter");
1794	&movdqu	($inout0,&QWP(0,$inp));
1795	&movdqu	($inout1,&QWP(0x10,$inp));
1796	&movdqu	($inout2,&QWP(0x20,$inp));
1797	&movdqu	($inout3,&QWP(0x30,$inp));
1798	&movdqu	($inout4,&QWP(0x40,$inp));
1799	&movdqu	($inout5,&QWP(0x50,$inp));
1800
1801	&call	("_aesni_decrypt6");
1802
	# xor every result with the preceding input block, re-read from $inp;
	# the first one uses the IV saved on the stack
1803	&movups	($rndkey1,&QWP(0,$inp));
1804	&movups	($rndkey0,&QWP(0x10,$inp));
1805	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
1806	&xorps	($inout1,$rndkey1);
1807	&movups	($rndkey1,&QWP(0x20,$inp));
1808	&xorps	($inout2,$rndkey0);
1809	&movups	($rndkey0,&QWP(0x30,$inp));
1810	&xorps	($inout3,$rndkey1);
1811	&movups	($rndkey1,&QWP(0x40,$inp));
1812	&xorps	($inout4,$rndkey0);
1813	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
1814	&xorps	($inout5,$rndkey1);
1815	&movups	(&QWP(0,$out),$inout0);
1816	&movups	(&QWP(0x10,$out),$inout1);
1817	&lea	($inp,&DWP(0x60,$inp));
1818	&movups	(&QWP(0x20,$out),$inout2);
1819	&mov	($rounds,$rounds_);		# restore $rounds
1820	&movups	(&QWP(0x30,$out),$inout3);
1821	&mov	($key,$key_);			# restore $key
1822	&movups	(&QWP(0x40,$out),$inout4);
1823	&lea	($out,&DWP(0x50,$out));		# only five blocks stored so far
1824	&sub	($len,0x60);
1825	&ja	(&label("cbc_dec_loop6"));
1826
1827	&movaps	($inout0,$inout5);
1828	&movaps	($ivec,$rndkey0);
1829	&add	($len,0x50);
1830	&jle	(&label("cbc_dec_tail_collected"));
1831	&movups	(&QWP(0,$out),$inout0);
1832	&lea	($out,&DWP(0x10,$out));
1833&set_label("cbc_dec_tail");
1834	&movups	($inout0,&QWP(0,$inp));
1835	&movaps	($in0,$inout0);
1836	&cmp	($len,0x10);
1837	&jbe	(&label("cbc_dec_one"));
1838
1839	&movups	($inout1,&QWP(0x10,$inp));
1840	&movaps	($in1,$inout1);
1841	&cmp	($len,0x20);
1842	&jbe	(&label("cbc_dec_two"));
1843
1844	&movups	($inout2,&QWP(0x20,$inp));
1845	&cmp	($len,0x30);
1846	&jbe	(&label("cbc_dec_three"));
1847
1848	&movups	($inout3,&QWP(0x30,$inp));
1849	&cmp	($len,0x40);
1850	&jbe	(&label("cbc_dec_four"));
1851
1852	&movups	($inout4,&QWP(0x40,$inp));
1853	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
1854	&movups	($inout0,&QWP(0,$inp));
1855	&xorps	($inout5,$inout5);
1856	&call	("_aesni_decrypt6");
1857	&movups	($rndkey1,&QWP(0,$inp));
1858	&movups	($rndkey0,&QWP(0x10,$inp));
1859	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
1860	&xorps	($inout1,$rndkey1);
1861	&movups	($rndkey1,&QWP(0x20,$inp));
1862	&xorps	($inout2,$rndkey0);
1863	&movups	($rndkey0,&QWP(0x30,$inp));
1864	&xorps	($inout3,$rndkey1);
1865	&movups	($ivec,&QWP(0x40,$inp));	# IV
1866	&xorps	($inout4,$rndkey0);
1867	&movups	(&QWP(0,$out),$inout0);
1868	&movups	(&QWP(0x10,$out),$inout1);
1869	&movups	(&QWP(0x20,$out),$inout2);
1870	&movups	(&QWP(0x30,$out),$inout3);
1871	&lea	($out,&DWP(0x40,$out));
1872	&movaps	($inout0,$inout4);
1873	&sub	($len,0x50);
1874	&jmp	(&label("cbc_dec_tail_collected"));
1875
1876&set_label("cbc_dec_one",16);
1877	if ($inline)
1878	{   &aesni_inline_generate1("dec");	}
1879	else
1880	{   &call	("_aesni_decrypt1");	}
1881	&xorps	($inout0,$ivec);
1882	&movaps	($ivec,$in0);
1883	&sub	($len,0x10);
1884	&jmp	(&label("cbc_dec_tail_collected"));
1885
1886&set_label("cbc_dec_two",16);
1887	&xorps	($inout2,$inout2);
1888	&call	("_aesni_decrypt3");
1889	&xorps	($inout0,$ivec);
1890	&xorps	($inout1,$in0);
1891	&movups	(&QWP(0,$out),$inout0);
1892	&movaps	($inout0,$inout1);
1893	&lea	($out,&DWP(0x10,$out));
1894	&movaps	($ivec,$in1);
1895	&sub	($len,0x20);
1896	&jmp	(&label("cbc_dec_tail_collected"));
1897
1898&set_label("cbc_dec_three",16);
1899	&call	("_aesni_decrypt3");
1900	&xorps	($inout0,$ivec);
1901	&xorps	($inout1,$in0);
1902	&xorps	($inout2,$in1);
1903	&movups	(&QWP(0,$out),$inout0);
1904	&movaps	($inout0,$inout2);
1905	&movups	(&QWP(0x10,$out),$inout1);
1906	&lea	($out,&DWP(0x20,$out));
1907	&movups	($ivec,&QWP(0x20,$inp));
1908	&sub	($len,0x30);
1909	&jmp	(&label("cbc_dec_tail_collected"));
1910
1911&set_label("cbc_dec_four",16);
1912	&call	("_aesni_decrypt4");
1913	&movups	($rndkey1,&QWP(0x10,$inp));
1914	&movups	($rndkey0,&QWP(0x20,$inp));
1915	&xorps	($inout0,$ivec);
1916	&movups	($ivec,&QWP(0x30,$inp));
1917	&xorps	($inout1,$in0);
1918	&movups	(&QWP(0,$out),$inout0);
1919	&xorps	($inout2,$rndkey1);
1920	&movups	(&QWP(0x10,$out),$inout1);
1921	&xorps	($inout3,$rndkey0);
1922	&movups	(&QWP(0x20,$out),$inout2);
1923	&lea	($out,&DWP(0x30,$out));
1924	&movaps	($inout0,$inout3);
1925	&sub	($len,0x40);
1926
1927&set_label("cbc_dec_tail_collected");
1928	&and	($len,15);
1929	&jnz	(&label("cbc_dec_tail_partial"));
1930	&movups	(&QWP(0,$out),$inout0);
1931	&jmp	(&label("cbc_ret"));
1932
1933&set_label("cbc_dec_tail_partial",16);
1934	&movaps	(&QWP(0,"esp"),$inout0);
1935	&mov	("ecx",16);
1936	&mov	($inp,"esp");
1937	&sub	("ecx",$len);
1938	&data_word(0xA4F3F689);		# rep movsb
1939
1940&set_label("cbc_ret");
1941	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
1942	&mov	($key_,&wparam(4));
1943	&movups	(&QWP(0,$key_),$ivec);	# output IV
1944&set_label("cbc_abort");
1945&function_end("${PREFIX}_cbc_encrypt");
1946
1947######################################################################
1948# Mechanical port from aesni-x86_64.pl.
1949#
1950# _aesni_set_encrypt_key is private interface,
1951# input:
1952#	"eax"	const unsigned char *userKey
1953#	$rounds	int bits
1954#	$key	AES_KEY *key
1955# output:
1956#	"eax"	return code: 0 on success, -1 if either pointer is NULL,
#		-2 if bits is not 128, 192 or 256
1957#	$rounds	9, 11 or 13 for 128-, 192- or 256-bit keys; the same
#		value is stored at byte offset 240 of *key
#
# The user key is expanded with aeskeygenassist. The local labels
# key_128, key_192a, key_192b, key_256a and key_256b are small
# subroutines reached with call/ret: each one stores the round key(s)
# computed so far, advances $key and derives the next one. The *_cold
# entry points skip the store for the very first invocation.
1958
1959&function_begin_B("_aesni_set_encrypt_key");
1960	&test	("eax","eax");
1961	&jz	(&label("bad_pointer"));
1962	&test	($key,$key);
1963	&jz	(&label("bad_pointer"));
1964
1965	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
1966	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
1967	&lea	($key,&DWP(16,$key));	# round 0 is stored at -16($key) below
1968	&cmp	($rounds,256);
1969	&je	(&label("14rounds"));
1970	&cmp	($rounds,192);
1971	&je	(&label("12rounds"));
1972	&cmp	($rounds,128);
1973	&jne	(&label("bad_keybits"));
1974
1975&set_label("10rounds",16);
1976	&mov		($rounds,9);
1977	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
1978	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
1979	&call		(&label("key_128_cold"));
1980	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
1981	&call		(&label("key_128"));
1982	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
1983	&call		(&label("key_128"));
1984	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
1985	&call		(&label("key_128"));
1986	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
1987	&call		(&label("key_128"));
1988	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
1989	&call		(&label("key_128"));
1990	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
1991	&call		(&label("key_128"));
1992	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
1993	&call		(&label("key_128"));
1994	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
1995	&call		(&label("key_128"));
1996	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
1997	&call		(&label("key_128"));
1998	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
1999	&mov		(&DWP(80,$key),$rounds);	# $key is 160 bytes in: offset 240
2000	&xor		("eax","eax");
2001	&ret();
2002
# key_128: store the current round key, step $key, derive the next key
# into xmm0. key_128_cold is the same derivation without the store.
2003&set_label("key_128",16);
2004	&$movekey	(&QWP(0,$key),"xmm0");
2005	&lea		($key,&DWP(16,$key));
2006&set_label("key_128_cold");
2007	&shufps		("xmm4","xmm0",0b00010000);
2008	&xorps		("xmm0","xmm4");
2009	&shufps		("xmm4","xmm0",0b10001100);
2010	&xorps		("xmm0","xmm4");
2011	&shufps		("xmm1","xmm1",0b11111111);	# critical path
2012	&xorps		("xmm0","xmm1");
2013	&ret();
2014
2015&set_label("12rounds",16);
2016	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
2017	&mov		($rounds,11);
2018	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
2019	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
2020	&call		(&label("key_192a_cold"));
2021	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
2022	&call		(&label("key_192b"));
2023	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
2024	&call		(&label("key_192a"));
2025	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
2026	&call		(&label("key_192b"));
2027	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
2028	&call		(&label("key_192a"));
2029	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
2030	&call		(&label("key_192b"));
2031	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
2032	&call		(&label("key_192a"));
2033	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
2034	&call		(&label("key_192b"));
2035	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
2036	&mov		(&DWP(48,$key),$rounds);	# $key is 192 bytes in: offset 240
2037	&xor		("eax","eax");
2038	&ret();
2039
# key_192a: store one round key from xmm0, step $key by 16, keep a copy
# of xmm2 in xmm5 for key_192b, then run the shared expansion step.
2040&set_label("key_192a",16);
2041	&$movekey	(&QWP(0,$key),"xmm0");
2042	&lea		($key,&DWP(16,$key));
2043&set_label("key_192a_cold",16);
2044	&movaps		("xmm5","xmm2");
2045&set_label("key_192b_warm");
2046	&shufps		("xmm4","xmm0",0b00010000);
2047	&movdqa		("xmm3","xmm2");
2048	&xorps		("xmm0","xmm4");
2049	&shufps		("xmm4","xmm0",0b10001100);
2050	&pslldq		("xmm3",4);
2051	&xorps		("xmm0","xmm4");
2052	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
2053	&pxor		("xmm2","xmm3");
2054	&pxor		("xmm0","xmm1");
2055	&pshufd		("xmm3","xmm0",0b11111111);
2056	&pxor		("xmm2","xmm3");
2057	&ret();
2058
# key_192b: store two round keys (32 bytes) stitched together from
# xmm5, xmm0 and xmm2, step $key by 32, then join the shared expansion
# step at key_192b_warm (whose ret returns to key_192b's caller).
2059&set_label("key_192b",16);
2060	&movaps		("xmm3","xmm0");
2061	&shufps		("xmm5","xmm0",0b01000100);
2062	&$movekey	(&QWP(0,$key),"xmm5");
2063	&shufps		("xmm3","xmm2",0b01001110);
2064	&$movekey	(&QWP(16,$key),"xmm3");
2065	&lea		($key,&DWP(32,$key));
2066	&jmp		(&label("key_192b_warm"));
2067
2068&set_label("14rounds",16);
2069	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
2070	&mov		($rounds,13);
2071	&lea		($key,&DWP(16,$key));
2072	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
2073	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
2074	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
2075	&call		(&label("key_256a_cold"));
2076	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
2077	&call		(&label("key_256b"));
2078	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
2079	&call		(&label("key_256a"));
2080	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
2081	&call		(&label("key_256b"));
2082	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
2083	&call		(&label("key_256a"));
2084	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
2085	&call		(&label("key_256b"));
2086	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
2087	&call		(&label("key_256a"));
2088	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
2089	&call		(&label("key_256b"));
2090	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
2091	&call		(&label("key_256a"));
2092	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
2093	&call		(&label("key_256b"));
2094	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
2095	&call		(&label("key_256a"));
2096	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
2097	&call		(&label("key_256b"));
2098	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
2099	&call		(&label("key_256a"));
2100	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
2101	&mov		(&DWP(16,$key),$rounds);	# $key is 224 bytes in: offset 240
2102	&xor		("eax","eax");
2103	&ret();
2104
# key_256a: store the round key held in xmm2, step $key, derive the
# next key into xmm0. key_256a_cold skips the store.
2105&set_label("key_256a",16);
2106	&$movekey	(&QWP(0,$key),"xmm2");
2107	&lea		($key,&DWP(16,$key));
2108&set_label("key_256a_cold");
2109	&shufps		("xmm4","xmm0",0b00010000);
2110	&xorps		("xmm0","xmm4");
2111	&shufps		("xmm4","xmm0",0b10001100);
2112	&xorps		("xmm0","xmm4");
2113	&shufps		("xmm1","xmm1",0b11111111);	# critical path
2114	&xorps		("xmm0","xmm1");
2115	&ret();
2116
# key_256b: store the round key held in xmm0, step $key, derive the
# next key into xmm2.
2117&set_label("key_256b",16);
2118	&$movekey	(&QWP(0,$key),"xmm0");
2119	&lea		($key,&DWP(16,$key));
2120
2121	&shufps		("xmm4","xmm2",0b00010000);
2122	&xorps		("xmm2","xmm4");
2123	&shufps		("xmm4","xmm2",0b10001100);
2124	&xorps		("xmm2","xmm4");
2125	&shufps		("xmm1","xmm1",0b10101010);	# critical path
2126	&xorps		("xmm2","xmm1");
2127	&ret();
2128
2129&set_label("bad_pointer",4);
2130	&mov	("eax",-1);
2131	&ret	();
2132&set_label("bad_keybits",4);
2133	&mov	("eax",-2);
2134	&ret	();
2135&function_end_B("_aesni_set_encrypt_key");
2136
2137# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2138#                              AES_KEY *key)
#
# Public entry point: loads the three arguments (via &wparam) into the
# registers the private _aesni_set_encrypt_key expects ("eax", $rounds,
# $key), calls it, and returns with whatever it left in "eax".
2139&function_begin_B("${PREFIX}_set_encrypt_key");
2140	&mov	("eax",&wparam(0));	# userKey
2141	&mov	($rounds,&wparam(1));	# bits
2142	&mov	($key,&wparam(2));	# key
2143	&call	("_aesni_set_encrypt_key");
2144	&ret	();			# "eax" is not touched after the call
2145&function_end_B("${PREFIX}_set_encrypt_key");
2146
2147# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2148#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and then
# converts it in place for decryption: the round keys are reversed in
# order, and every key except the first and the last is additionally
# passed through aesimc. On failure the helper's non-zero return code
# is passed back unchanged; on success "eax" is 0.
2149&function_begin_B("${PREFIX}_set_decrypt_key");
2150	&mov	("eax",&wparam(0));
2151	&mov	($rounds,&wparam(1));
2152	&mov	($key,&wparam(2));
2153	&call	("_aesni_set_encrypt_key");
2154	&mov	($key,&wparam(2));	# reload: the argument register is not assumed intact
2155	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key, now a byte offset
2156	&test	("eax","eax");	# check the return code before "eax" is reused
2157	&jnz	(&label("dec_key_ret"));
2158	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
2159
2160	&$movekey	("xmm0",&QWP(0,$key));	# just swap
2161	&$movekey	("xmm1",&QWP(0,"eax"));
2162	&$movekey	(&QWP(0,"eax"),"xmm0");
2163	&$movekey	(&QWP(0,$key),"xmm1");
2164	&lea		($key,&DWP(16,$key));
2165	&lea		("eax",&DWP(-16,"eax"));
2166
# Walk inwards from both ends: $key moves up, "eax" moves down, and the
# loop stops once the two pointers meet.
2167&set_label("dec_key_inverse");
2168	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
2169	&$movekey	("xmm1",&QWP(0,"eax"));
2170	&aesimc		("xmm0","xmm0");
2171	&aesimc		("xmm1","xmm1");
2172	&lea		($key,&DWP(16,$key));
2173	&lea		("eax",&DWP(-16,"eax"));
2174	&$movekey	(&QWP(16,"eax"),"xmm0");	# offsets undo the pointer steps above
2175	&$movekey	(&QWP(-16,$key),"xmm1");
2176	&cmp		("eax",$key);
2177	&ja		(&label("dec_key_inverse"));
2178
2179	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
2180	&aesimc		("xmm0","xmm0");
2181	&$movekey	(&QWP(0,$key),"xmm0");
2182
2183	&xor		("eax","eax");		# return success
2184&set_label("dec_key_ret");
2185	&ret	();
2186&function_end_B("${PREFIX}_set_decrypt_key");
# NOTE(review): &asciz and &asm_finish are perlasm helpers defined outside
# this file; presumably the first embeds the credit string in the generated
# assembly and the second finalises/flushes the output - confirm against
# the perlasm module in use.
2187&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
2188
2189&asm_finish();
2190