#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# What started as a transliteration of the original code to "perlasm"
# has undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop, resulting in a >5x size reduction,
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allows its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2    	9.30		8.69		+7%
# Nehalem(**) 	7.63		6.88		+11%
# Atom	    	17.1		16.4		+4%
# Silvermont	-		12.9
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter value calculation
#	and xor-ing of input, as in Emilia's CTR implementation, is
#	performed. However, the CTR calculations account for no more
#	than 1% of total time, so the comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short inputs.
# Conversion time in CPU cycles, and its ratio to the CPU cycles spent
# in the 8x block function, is:
#
# 		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than the ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
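#
# As a back-of-envelope model of the numbers above (an illustrative
# sketch using the Core 2 column; not part of the original analysis):
#
#	# cycles per byte for an n*128-byte input with a 128-bit key
#	sub cpb_model { my $n = shift; (240 + $n*8.69*128)/($n*128) }
#	# n=1 gives ~10.6 cpb (~22% overhead), n=32 (4KB) gives ~8.75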
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2	9.98
# Nehalem	7.80
# Atom		17.9
# Silvermont	14.0
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than 80
# bytes is suboptimal, but XTS is meant to be used with larger blocks
# anyway...
#
#						<appro@openssl.org>

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	 pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	 pxor	@b[2], @b[6]
	 pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
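# In Boolean form, the eight instructions above compute (identities
# read off the instruction sequence; a sketch for reference - GF(4) is
# represented in a normal basis, as in the Käsper-Schwabe paper):
#
#	x0' = (x0 & y1) ^ (x1 & y1) ^ (x1 & y0)
#	x1' = (x0 & y0) ^ (x0 & y1) ^ (x1 & y0)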

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	 movdqa	$y2, $t1
	pxor	$y1, $t0
	 pxor	$y3, $t1
	pand	$x0, $t0
	 pand	$x2, $t1
	pxor	$x1, $x0
	 pxor	$x3, $x2
	pand	$y0, $x1
	 pand	$y2, $x3
	pand	$y1, $x0
	 pand	$y3, $x2
	pxor	$x0, $x1
	 pxor	$x3, $x2
	pxor	$t0, $x0
	 pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	 movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	 movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	 movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}

sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	 pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	 pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	 pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	 pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	 pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	 pxor	@t[6], @x[6]
	 pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	 pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	 pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	 pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	 pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	 pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	 pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	 pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	 pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	 movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	 movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	 movdqa	@t[6], @x[2]
	 movdqa	@t[1], @x[7]
	 movdqa	@x[6], @x[4]
	 movdqa	@t[3], @x[6]
___
}

sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	 pxor	@t[6], @x[0]
	 pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	 pxor	@t[0], @x[2]
	 pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	 pxor	@t[7], @x[1]
	 pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	 pxor	@t[7], @x[2]
	 pxor	@t[6], @x[3]
	 pxor	@t[6], @x[4]
	 pxor	@t[3], @x[5]
	 pxor	@t[4], @x[6]
	 pxor	@t[7], @x[4]
	 pxor	@t[7], @x[5]
	 pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
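# The factorization above is easy to spot-check with a plain GF(2^8)
# multiply; e.g. entry (0,0) of the product is 02*05 ^ 01*04 = 0x0e.
# Illustrative helper, not called anywhere:
sub gf_mul {	# multiplication modulo x^8+x^4+x^3+x+1
my ($a,$b)=@_;
my $p=0;
for (1..8) {
	$p ^= $a if ($b & 1);
	$a = ($a<<1) ^ (($a & 0x80) ? 0x11b : 0);
	$b >>= 1;
}
return $p & 0xff;	# gf_mul(0x02,0x05)^gf_mul(0x01,0x04) == 0x0e
}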

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
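# swapmove() is the classic "delta swap": it exchanges the bits of $a
# selected by $mask with the bits of $b that sit $n positions to their
# left. A scalar model of the same identity (illustrative only, not
# called anywhere):
sub swapmove_model {
my ($a,$b,$n,$mask)=@_;
my $t = (($b>>$n) ^ $a) & $mask;	# bits that have to travel
return ($a^$t, $b^($t<<$n));
}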
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	 movdqa	$b1,$t1
	 psrlq	\$$n,$b1
	pxor	$a0,$b0
	 pxor	$a1,$b1
	pand	$mask,$b0
	 pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	 pxor	$b1,$a1
	 psllq	\$$n,$b1
	pxor	$t0,$b0
	 pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
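# Semantically bitslice() is an 8x8 bit-matrix transpose performed on
# 16 byte-columns at once: afterwards register k holds bit k of every
# processed byte (up to the register ordering used here). A one-column
# reference model (illustrative only, not called anywhere):
sub transpose8_model {
my @in=@_;
my @out=(0) x 8;
for my $i (0..7) {
	for my $j (0..7) {
		$out[$j] |= (($in[$i]>>$j)&1)<<$i;	# bit j of byte i -> bit i of byte j
	}
}
return @out;
}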

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6,	%xmm8
	pand	%xmm6,	%xmm9
	movdqa	%xmm2,	%xmm10
	pcmpeqb	%xmm0,	%xmm8
	psllq	\$4,	%xmm0		# 0x10...
	movdqa	%xmm3,	%xmm11
	pcmpeqb	%xmm1,	%xmm9
	psllq	\$4,	%xmm1		# 0x20...

	pand	%xmm6,	%xmm10
	pand	%xmm6,	%xmm11
	movdqa	%xmm0,	%xmm12
	pcmpeqb	%xmm2,	%xmm10
	psllq	\$4,	%xmm2		# 0x40...
	movdqa	%xmm1,	%xmm13
	pcmpeqb	%xmm3,	%xmm11
	psllq	\$4,	%xmm3		# 0x80...

	movdqa	%xmm2,	%xmm14
	movdqa	%xmm3,	%xmm15
	 pxor	%xmm5,	%xmm8		# "pnot"
	 pxor	%xmm5,	%xmm9

	pand	%xmm6,	%xmm12
	pand	%xmm6,	%xmm13
	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0,	%xmm12
	psrlq	\$4,	%xmm0		# 0x01...
	 movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1,	%xmm13
	psrlq	\$4,	%xmm1		# 0x02...
	 lea	0x10($inp), $inp

	pand	%xmm6,	%xmm14
	pand	%xmm6,	%xmm15
	 movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2,	%xmm14
	psrlq	\$4,	%xmm2		# 0x04...
	 movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3,	%xmm15
	psrlq	\$4,	%xmm3		# 0x08...
	 movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}
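# _bsaes_key_convert expands every round-key bit into a full byte mask
# (0x00 or 0xff), one XMM register per bit plane, so a bit-sliced
# AddRoundKey becomes eight pxor's. Planes 0, 1, 5 and 6 are then
# complemented (the "pnot" pxor's above), which matches the set bits of
# 0x63 and folds the S-box's affine constant into the schedule (note
# the .L63 constant). Scalar model of the pand/pcmpeqb trick for a
# single bit, before the optional complement (illustrative only, not
# called anywhere):
sub keybit_to_mask_model {
my ($byte,$bit)=@_;
return (($byte>>$bit)&1) ? 0xff : 0x00;
}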

if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
	lea	.LBS0(%rip), %r11	# constants table
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rbp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;
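# The tweak schedule below multiplies the running tweak by x in
# GF(2^128) modulo x^128+x^7+x^2+x+1; the pcmpgtd/pshufd/pand sequence
# is a branch-free SSE rendition of this scalar update (a sketch in
# 64-bit halves, illustrative only):
#
#	my $carry = $hi >> 63;
#	$hi = (($hi << 1) | ($lo >> 63)) & 0xffffffffffffffff;
#	$lo = (($lo << 1) ^ ($carry ? 0x87 : 0)) & 0xffffffffffffffff;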

$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
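# A note on the branchless tweak update emitted above: $twtmp holds,
# per 32-bit lane, the result of the signed compare 0 > tweak, i.e. an
# all-ones mask wherever the lane's top bit is set. Lane 3 carries
# bit 127 (the GF(2^128) carry-out), lane 1 carries bit 63 (the carry
# between the two 64-bit halves that paddq doubles independently).
# pshufd 0x13 moves those two masks over lanes 0 and 2, pand with
# .Lxts_magic = {0x87,0,1,0} turns them into the reduction constant
# and the inter-half carry bit, and the final pxor applies both. The
# masks for the next iteration can be sampled before that pxor,
# because the pxor never touches lanes 1 and 3.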
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
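
# Ciphertext stealing on the encrypt side (.Lxts_enc_steal above) in C
# terms, with tail = len % 16 and out pointing just past the last full
# ciphertext block; a sketch under those assumptions, with xor_block
# and AES_encrypt as stand-in helpers:
#
#	for (i = 0; i < tail; i++) {
#		unsigned char c = out[i - 16];	/* steal ciphertext byte */
#		out[i - 16] = in[i];		/* plant plaintext byte  */
#		out[i] = c;			/* short final block     */
#	}
#	/* re-encrypt the patched block under the next tweak */
#	xor_block(buf, out - 16, tweak);
#	AES_encrypt(buf, buf, key1);
#	xor_block(out - 16, buf, tweak);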
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
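
# Stealing on the decrypt side (.Lxts_dec_steal above) needs the tweaks
# in the opposite order: the last full ciphertext block is decrypted
# under the *next* tweak first, then the patched block under the
# current one (kept in @XMM[6]). A sketch, same assumed helpers as on
# the encrypt side:
#
#	next_tweak = tweak_mul_x(tweak);
#	xor_block(buf, in, next_tweak);		/* last full C block */
#	AES_decrypt(buf, buf, key1);
#	xor_block(out, buf, next_tweak);	/* P_m || stolen bytes */
#	for (i = 0; i < tail; i++) {		/* swap the tail bytes */
#		unsigned char c = out[i];
#		out[i] = in[16 + i];
#		out[16 + i] = c;
#	}
#	xor_block(buf, out, tweak);		/* re-decrypt patched block */
#	AES_decrypt(buf, buf, key1);
#	xor_block(out, buf, tweak);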
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

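# In C terms se_handler below does roughly the following (a sketch;
# offsets as used in the assembly):
#
#	if (context->Rip < prologue)		/* %rsp still in %rax:  */
#		rsp = context->Rax;		/* prologue stashed it  */
#	else if (context->Rip >= epilogue)	/* frame already gone   */
#		rsp = context->Rsp;
#	else {
#		frame = context->Rbp;
#		memcpy(&context->Xmm6, frame + 0x40, 10*16);
#		/* reload r15..rbx and caller %rbp from the frame */
#		rsp = frame + 0xa0 + 0x78;
#	}
#	context->Rsp = rsp;
#	RtlVirtualUnwind(...);			/* let the system unwind */
#	return ExceptionContinueSearch;
#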
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;	# evaluate the remaining `...` expressions

print $code;

close STDOUT;
