aesni-sha1-x86_64.pl revision 04ef91b390dfcc6125913e2f2af502d23d7a5112
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# June 2011
11#
12# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
13# in http://download.intel.com/design/intarch/papers/323686.pdf, is
14# that since AESNI-CBC encrypt exhibits *very* low instruction-level
15# parallelism, interleaving it with another algorithm would allow to
16# utilize processor resources better and achieve better performance.
17# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
18# AESNI code is weaved into it. Below are performance numbers in
19# cycles per processed byte, less is better, for standalone AESNI-CBC
20# encrypt, sum of the latter and standalone SHA1, and "stitched"
21# subroutine:
22#
23#		AES-128-CBC	+SHA1		stitch      gain
24# Westmere	3.77[+5.6]	9.37		6.65	    +41%
25# Sandy Bridge	5.05[+5.2(6.3)]	10.25(11.35)	6.16(7.08)  +67%(+60%)
26#
27#		AES-192-CBC
28# Westmere	4.51		10.11		6.97	    +45%
29# Sandy Bridge	6.05		11.25(12.35)	6.34(7.27)  +77%(+70%)
30#
31#		AES-256-CBC
32# Westmere	5.25		10.85		7.25	    +50%
33# Sandy Bridge	7.05		12.25(13.35)	7.06(7.70)  +74%(+73%)
34#
35# (*)	There are two code paths: SSSE3 and AVX. See sha1-586.pl for
36#	background information. Above numbers in parentheses are SSSE3
37#	results collected on AVX-capable CPU, i.e. apply on OSes that
38#	don't support AVX.
39#
40# Needless to mention that it makes no sense to implement "stitched"
41# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
42# fully utilize parallelism, so stitching would not give any gain
43# anyway. Well, there might be some, e.g. because of better cache
44# locality... For reference, here are performance results for
45# standalone AESNI-CBC decrypt:
46#
47#		AES-128-CBC	AES-192-CBC	AES-256-CBC
48# Westmere	1.31		1.55		1.80
49# Sandy Bridge	0.93		1.06		1.22
50
# Command line is "[flavour] output". A first argument that contains a
# dot is taken to be the output file name and the flavour is left undefined.
51$flavour = shift;
52$output  = shift;
53if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
54
# Win64 conventions are selected for nasm/masm/mingw64 flavours or when the
# output file name ends in .asm.
55$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
56
# Locate the x86_64-xlate.pl translator: first in this script's own
# directory, then in ../../perlasm relative to it; give up otherwise.
57$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
58( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
59( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
60die "can't locate x86_64-xlate.pl";
61
# $avx is set when the assembler looks recent enough for the AVX code path
# to be emitted: GNU as >= 2.19 (probed through $ENV{CC}), or, on Win64
# only, nasm >= 2.09 or ml64 version >= 10. The version is compared
# numerically, i.e. as a decimal fraction rather than as major.minor.
62$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
63		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
64	   $1>=2.19);
65$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
66	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
67	   $1>=2.09);
68$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
70	   $1>=10);
71
# Everything printed to STDOUT from here on is piped through the perlasm
# translator, which writes the flavour-specific assembly to $output.
# The translator path is quoted so that a directory containing spaces
# still works, and a failure to start the pipe is fatal instead of
# silently producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
74
75# void aesni_cbc_sha1_enc(const void *inp,
76#			void *out,
77#			size_t length,
78#			const AES_KEY *key,
79#			unsigned char *iv,
80#			SHA_CTX *ctx,
81#			const void *in0);
82
# Public entry point. It loads the first two dwords of OPENSSL_ia32cap_P
# and dispatches to one of the two implementations below.
83$code.=<<___;
84.text
85.extern	OPENSSL_ia32cap_P
86
87.globl	aesni_cbc_sha1_enc
88.type	aesni_cbc_sha1_enc,\@abi-omnipotent
89.align	16
90aesni_cbc_sha1_enc:
91	# caller should check for SSSE3 and AES-NI bits
92	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
93	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
94___
# Only when the assembler can encode AVX: take the AVX path if both
# bit 28 of the second dword and bit 30 of the first dword are set.
95$code.=<<___ if ($avx);
96	and	\$`1<<28`,%r11d		# mask AVX bit
97	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
98	or	%r11d,%r10d
99	cmp	\$`1<<28|1<<30`,%r10d
100	je	aesni_cbc_sha1_enc_avx
101___
# Otherwise tail-jump to the SSSE3 path; the ret that follows the
# unconditional jmp is never executed.
102$code.=<<___;
103	jmp	aesni_cbc_sha1_enc_ssse3
104	ret
105.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
106___
107
# Register assignment for the SSSE3 code path. The first six names are
# the integer argument registers as the function receives them; $inp
# (%r10) is loaded from the stack by the prologue.
108my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
109
# $Xi   - index of the message-schedule vector being produced
# @X    - rotating window of eight XMM registers holding schedule words
# @Tx   - rotating XMM temporaries (also carry the current K constant)
# @V    - the five SHA1 working variables, rotated after every round
# @T    - two integer temporaries, swapped after every round
110my $Xi=4;
111my @X=map("%xmm$_",(4..7,0..3));
112my @Tx=map("%xmm$_",(8..10));
113my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
114my @T=("%esi","%edi");
# Generator state: $j counts SHA1 rounds emitted, $jj counts body_*
# invocations, $r counts AES steps emitted by $aesenc, and $sn numbers
# the .Laesenclast labels.
115my $j=0; my $jj=0; my $r=0; my $sn=0;
116my $K_XX_XX="%r11";
117my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
118my @rndkey=("%xmm14","%xmm15");
119
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ # Any call to an undefined sub, e.g. &movdqa(dst,src), lands here and is
  # turned into one line of AT&T-syntax assembly appended to $code: the
  # sub name becomes the mnemonic and the operands are written in
  # reverse order (last Perl argument first).
  (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip the package qualifier
  my @operands = @_;
  my $last = pop(@operands);
    # a purely numeric final operand is an immediate and gets a '$' prefix
    if ($last*1 eq $last) { $last = "\$$last"; }
    $code .= "\t$mnemonic\t".join(',',$last,reverse(@operands))."\n";
}
126
# Rotate emitters used by the body_* round templates (they call
# &$_rol / &$_ror). In this flavour they forward to plain rol/ror,
# which are emitted through AUTOLOAD.
127my $_rol=sub { &rol(@_) };
128my $_ror=sub { &ror(@_) };
129
130$code.=<<___;
131.type	aesni_cbc_sha1_enc_ssse3,\@function,6
132.align	16
133aesni_cbc_sha1_enc_ssse3:
134	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
135	#shr	\$6,$len			# debugging artefact
136	#jz	.Lepilogue_ssse3		# debugging artefact
137	push	%rbx
138	push	%rbp
139	push	%r12
140	push	%r13
141	push	%r14
142	push	%r15
143	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
144	#mov	$in0,$inp			# debugging artefact
145	#lea	64(%rsp),$ctx			# debugging artefact
146___
147$code.=<<___ if ($win64);
148	movaps	%xmm6,96+0(%rsp)
149	movaps	%xmm7,96+16(%rsp)
150	movaps	%xmm8,96+32(%rsp)
151	movaps	%xmm9,96+48(%rsp)
152	movaps	%xmm10,96+64(%rsp)
153	movaps	%xmm11,96+80(%rsp)
154	movaps	%xmm12,96+96(%rsp)
155	movaps	%xmm13,96+112(%rsp)
156	movaps	%xmm14,96+128(%rsp)
157	movaps	%xmm15,96+144(%rsp)
158.Lprologue_ssse3:
159___
160$code.=<<___;
161	mov	$in0,%r12			# reassign arguments
162	mov	$out,%r13
163	mov	$len,%r14
164	mov	$key,%r15
165	movdqu	($ivp),$iv			# load IV
166	mov	$ivp,88(%rsp)			# save $ivp
167___
168my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
169my $rounds="${ivp}d";
170$code.=<<___;
171	shl	\$6,$len
172	sub	$in0,$out
173	mov	240($key),$rounds
174	add	$inp,$len		# end of input
175
176	lea	K_XX_XX(%rip),$K_XX_XX
177	mov	0($ctx),$A		# load context
178	mov	4($ctx),$B
179	mov	8($ctx),$C
180	mov	12($ctx),$D
181	mov	$B,@T[0]		# magic seed
182	mov	16($ctx),$E
183
184	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
185	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
186	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
187	movdqu	16($inp),@X[-3&7]
188	movdqu	32($inp),@X[-2&7]
189	movdqu	48($inp),@X[-1&7]
190	pshufb	@X[2],@X[-4&7]		# byte swap
191	add	\$64,$inp
192	pshufb	@X[2],@X[-3&7]
193	pshufb	@X[2],@X[-2&7]
194	pshufb	@X[2],@X[-1&7]
195	paddd	@Tx[1],@X[-4&7]		# add K_00_19
196	paddd	@Tx[1],@X[-3&7]
197	paddd	@Tx[1],@X[-2&7]
198	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
199	psubd	@Tx[1],@X[-4&7]		# restore X[]
200	movdqa	@X[-3&7],16(%rsp)
201	psubd	@Tx[1],@X[-3&7]
202	movdqa	@X[-2&7],32(%rsp)
203	psubd	@Tx[1],@X[-2&7]
204	movups	($key),$rndkey0		# $key[0]
205	movups	16($key),$rndkey[0]	# forward reference
206	jmp	.Loop_ssse3
207___
208
# Emits the next step of the AES-CBC encryption that is interleaved with
# the SHA1 rounds. $r counts the steps emitted so far; with ten steps per
# 16-byte block, $n is the block index and $k the step within that block.
# After every step the two @rndkey registers are swapped, so $rndkey[0]
# always names the round key that was loaded by the previous step.
209my $aesenc=sub {
210  use integer;
211  my ($n,$k)=($r/10,$r%10);
    # First step of a block: load the input block, xor in the key at
    # ($key) held in $rndkey0, store the previous block's result (for
    # blocks after the first), chain with $iv and do the first aesenc.
212    if ($k==0) {
213      $code.=<<___;
214	movups		`16*$n`($in0),$in		# load input
215	xorps		$rndkey0,$in
216___
217      $code.=<<___ if ($n);
218	movups		$iv,`16*($n-1)`($out,$in0)	# write output
219___
220      $code.=<<___;
221	xorps		$in,$iv
222	aesenc		$rndkey[0],$iv
223	movups		`32+16*$k`($key),$rndkey[1]
224___
    # Last step of a block: depending on how $rounds compares with 11,
    # zero, two or four further aesenc steps are executed before
    # aesenclast; then the key at 16($key) is reloaded for the next block.
225    } elsif ($k==9) {
226      $sn++;
227      $code.=<<___;
228	cmp		\$11,$rounds
229	jb		.Laesenclast$sn
230	movups		`32+16*($k+0)`($key),$rndkey[1]
231	aesenc		$rndkey[0],$iv
232	movups		`32+16*($k+1)`($key),$rndkey[0]
233	aesenc		$rndkey[1],$iv
234	je		.Laesenclast$sn
235	movups		`32+16*($k+2)`($key),$rndkey[1]
236	aesenc		$rndkey[0],$iv
237	movups		`32+16*($k+3)`($key),$rndkey[0]
238	aesenc		$rndkey[1],$iv
239.Laesenclast$sn:
240	aesenclast	$rndkey[0],$iv
241	movups		16($key),$rndkey[1]		# forward reference
242___
    # Any other step: one aesenc plus the load of the following round key.
243    } else {
244      $code.=<<___;
245	aesenc		$rndkey[0],$iv
246	movups		`32+16*$k`($key),$rndkey[1]
247___
248    }
249    $r++;	unshift(@rndkey,pop(@rndkey));
250};
251
# Emits the SIMD computation of the next four message-schedule words
# (rounds 16..31 form), interleaved with four SHA1 rounds. $body is a
# code reference returning one round as a list of Perl statement
# strings; each eval(shift(@insns)) below emits one of those statements.
# On return @X and @Tx have been rotated by one and $Xi incremented.
252sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
253{ use integer;
254  my $body = shift;
255  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
256  my ($a,$b,$c,$d,$e);
257
258	&movdqa	(@X[0],@X[-3&7]);
259	 eval(shift(@insns));
260	 eval(shift(@insns));
261	&movdqa	(@Tx[0],@X[-1&7]);
262	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
263	 eval(shift(@insns));
264	 eval(shift(@insns));
265
266	  &paddd	(@Tx[1],@X[-1&7]);
267	 eval(shift(@insns));
268	 eval(shift(@insns));
269	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
270	 eval(shift(@insns));
271	 eval(shift(@insns));
272	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
273	 eval(shift(@insns));
274	 eval(shift(@insns));
275
276	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
277	 eval(shift(@insns));
278	 eval(shift(@insns));
279	 eval(shift(@insns));
280	 eval(shift(@insns));
281
282	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
283	 eval(shift(@insns));
284	 eval(shift(@insns));
285	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
286	 eval(shift(@insns));
287	 eval(shift(@insns));
288
289	&movdqa	(@Tx[2],@X[0]);
290	&movdqa	(@Tx[0],@X[0]);
291	 eval(shift(@insns));
292	 eval(shift(@insns));
293	 eval(shift(@insns));
294	 eval(shift(@insns));
295
296	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
297	&paddd	(@X[0],@X[0]);
298	 eval(shift(@insns));
299	 eval(shift(@insns));
300	 eval(shift(@insns));
301	 eval(shift(@insns));
302
303	&psrld	(@Tx[0],31);
304	 eval(shift(@insns));
305	 eval(shift(@insns));
306	&movdqa	(@Tx[1],@Tx[2]);
307	 eval(shift(@insns));
308	 eval(shift(@insns));
309
310	&psrld	(@Tx[2],30);
311	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
312	 eval(shift(@insns));
313	 eval(shift(@insns));
314	 eval(shift(@insns));
315	 eval(shift(@insns));
316
317	&pslld	(@Tx[1],2);
318	&pxor	(@X[0],@Tx[2]);
319	 eval(shift(@insns));
320	 eval(shift(@insns));
321	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
322	 eval(shift(@insns));
323	 eval(shift(@insns));
324
325	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
326
327	 foreach (@insns) { eval; }	# remaining instructions [if any]
328
329  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
330		push(@Tx,shift(@Tx));
331}
332
# Emits the SIMD computation of the next four message-schedule words
# (rounds 32..79 form), interleaved with four SHA1 rounds taken from
# $body (a code reference returning one round as statement strings).
# The K constant is either carried over in @Tx or, whenever $Xi is a
# multiple of 5, reloaded from the table at $K_XX_XX. On return @X and
# @Tx have been rotated by one and $Xi incremented.
333sub Xupdate_ssse3_32_79()
334{ use integer;
335  my $body = shift;
336  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
337  my ($a,$b,$c,$d,$e);
338
339	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
340	 eval(shift(@insns));		# body_20_39
341	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
342	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
343	 eval(shift(@insns));
344	 eval(shift(@insns));
345	 eval(shift(@insns));		# rol
346
347	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
348	 eval(shift(@insns));
	 # emit one more statement here only if it is not a rotate
349	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
350	if ($Xi%5) {
351	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
352	} else {			# ... or load next one
353	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
354	}
355	  &paddd	(@Tx[1],@X[-1&7]);
356	 eval(shift(@insns));		# ror
357	 eval(shift(@insns));
358
359	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
360	 eval(shift(@insns));		# body_20_39
361	 eval(shift(@insns));
362	 eval(shift(@insns));
363	 eval(shift(@insns));		# rol
364
365	&movdqa	(@Tx[0],@X[0]);
366	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
367	 eval(shift(@insns));
368	 eval(shift(@insns));
369	 eval(shift(@insns));		# ror
370	 eval(shift(@insns));
371
372	&pslld	(@X[0],2);
373	 eval(shift(@insns));		# body_20_39
374	 eval(shift(@insns));
375	&psrld	(@Tx[0],30);
376	 eval(shift(@insns));
377	 eval(shift(@insns));		# rol
378	 eval(shift(@insns));
379	 eval(shift(@insns));
380	 eval(shift(@insns));		# ror
381	 eval(shift(@insns));
382
383	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
384	 eval(shift(@insns));		# body_20_39
385	 eval(shift(@insns));
386	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
387	 eval(shift(@insns));
388	 eval(shift(@insns));		# rol
389	 eval(shift(@insns));
390	 eval(shift(@insns));
391	 eval(shift(@insns));		# rol
392	 eval(shift(@insns));
393
394	 foreach (@insns) { eval; }	# remaining instructions
395
396  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
397		push(@Tx,shift(@Tx));
398}
399
# Emits the last X[]+K transfer together with four SHA1 rounds from
# $body, then the end-of-input test: when $inp equals $len the emitted
# code jumps to .Ldone_ssse3. Otherwise it starts preloading and
# byte-swapping the next 64 bytes of input. @Tx is rotated back by one
# and $Xi is reset to 0 for the Xloop_ssse3 calls that follow.
400sub Xuplast_ssse3_80()
401{ use integer;
402  my $body = shift;
403  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
404  my ($a,$b,$c,$d,$e);
405
406	 eval(shift(@insns));
407	  &paddd	(@Tx[1],@X[-1&7]);
408	 eval(shift(@insns));
409	 eval(shift(@insns));
410	 eval(shift(@insns));
411	 eval(shift(@insns));
412
413	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
414
415	 foreach (@insns) { eval; }		# remaining instructions
416
417	&cmp	($inp,$len);
418	&je	(".Ldone_ssse3");
419
420	unshift(@Tx,pop(@Tx));
421
422	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
423	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
424	&movdqu	(@X[-4&7],"0($inp)");		# load input
425	&movdqu	(@X[-3&7],"16($inp)");
426	&movdqu	(@X[-2&7],"32($inp)");
427	&movdqu	(@X[-1&7],"48($inp)");
428	&pshufb	(@X[-4&7],@X[2]);		# byte swap
429	&add	($inp,64);
430
431  $Xi=0;
432}
433
# Emits four SHA1 rounds from $body interleaved with preparation of the
# next input block: byte-swap one preloaded vector, add the K constant
# held in @Tx[1] to another, store that sum in stack slot 16*$Xi for the
# integer rounds, then subtract K again to restore the vector.
# Increments $Xi.
434sub Xloop_ssse3()
435{ use integer;
436  my $body = shift;
437  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
438  my ($a,$b,$c,$d,$e);
439
440	 eval(shift(@insns));
441	 eval(shift(@insns));
442	&pshufb	(@X[($Xi-3)&7],@X[2]);
443	 eval(shift(@insns));
444	 eval(shift(@insns));
445	&paddd	(@X[($Xi-4)&7],@Tx[1]);
446	 eval(shift(@insns));
447	 eval(shift(@insns));
448	 eval(shift(@insns));
449	 eval(shift(@insns));
450	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
451	 eval(shift(@insns));
452	 eval(shift(@insns));
453	&psubd	(@X[($Xi-4)&7],@Tx[1]);
454
455	foreach (@insns) { eval; }
456  $Xi++;
457}
458
# Emits four SHA1 rounds from $body with no SIMD work interleaved; used
# for the final rounds after the last input block.
459sub Xtail_ssse3()
460{ use integer;
461  my $body = shift;
462  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
463  my ($a,$b,$c,$d,$e);
464
465	foreach (@insns) { eval; }
466}
467
# Returns one SHA1 round (rounds 0..19 form) as a list of Perl statement
# strings. The caller eval()s them one at a time, so each string emits
# one integer instruction at the chosen point of the SIMD stream. The
# last string also advances $j and rotates @V and @T for the next round.
# One string per call may additionally get '&$aesenc();' appended, which
# emits the next AES step at that position.
468sub body_00_19 () {
469  use integer;
470  my ($k,$n);
471  my @r=(
472	'($a,$b,$c,$d,$e)=@V;'.
473	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
474	'&xor	($c,$d);',
475	'&mov	(@T[1],$a);',	# $b in next round
476	'&$_rol	($a,5);',
477	'&and	(@T[0],$c);',	# ($b&($c^$d))
478	'&xor	($c,$d);',	# restore $c
479	'&xor	(@T[0],$d);',
480	'&add	($e,$a);',
481	'&$_ror	($b,$j?7:2);',	# $b>>>2
482	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
483	);
	# AES scheduling (integer arithmetic): $k is an instruction-slot
	# index counted across rounds; if that slot belongs to this
	# invocation ($jj==$k/$n), the aesenc call goes into slot $k%$n.
484	$n = scalar(@r);
485	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
486	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
487	$jj++;
488    return @r;
489}
490
# Returns one SHA1 round (rounds 20..39 form, also passed by the driver
# for the rounds after body_40_59) as a list of Perl statement strings
# for the caller to eval() one at a time. The first string advances $j;
# the last one rotates @V and @T for the next round. One string per
# call may get '&$aesenc();' appended, which emits the next AES step.
491sub body_20_39 () {
492  use integer;
493  my ($k,$n);
494  my @r=(
495	'($a,$b,$c,$d,$e)=@V;'.
496	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
497	'&xor	(@T[0],$d);',	# ($b^$d)
498	'&mov	(@T[1],$a);',	# $b in next round
499	'&$_rol	($a,5);',
500	'&xor	(@T[0],$c);',	# ($b^$d^$c)
501	'&add	($e,$a);',
502	'&$_ror	($b,7);',	# $b>>>2
503	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
504	);
	# AES scheduling (integer arithmetic): $k is an instruction-slot
	# index counted across rounds; if that slot belongs to this
	# invocation ($jj==$k/$n), the aesenc call goes into slot $k%$n.
505	$n = scalar(@r);
506	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
507	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
508	$jj++;
509    return @r;
510}
511
# Returns one SHA1 round (rounds 40..59 form) as a list of Perl
# statement strings for the caller to eval() one at a time. The third
# string advances $j; the last one rotates @V and @T for the next round.
# One string per call may get '&$aesenc();' appended, which emits the
# next AES step.
512sub body_40_59 () {
513  use integer;
514  my ($k,$n);
515  my @r=(
516	'($a,$b,$c,$d,$e)=@V;'.
517	'&mov	(@T[1],$c);',
518	'&xor	($c,$d);',
519	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
520	'&and	(@T[1],$d);',
521	'&and	(@T[0],$c);',	# ($b&($c^$d))
522	'&$_ror	($b,7);',	# $b>>>2
523	'&add	($e,@T[1]);',
524	'&mov	(@T[1],$a);',	# $b in next round
525	'&$_rol	($a,5);',
526	'&add	($e,@T[0]);',
527	'&xor	($c,$d);',	# restore $c
528	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
529	);
	# AES scheduling (integer arithmetic): $k is an instruction-slot
	# index counted across rounds; if that slot belongs to this
	# invocation ($jj==$k/$n), the aesenc call goes into slot $k%$n.
530	$n = scalar(@r);
531	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
532	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
533	$jj++;
534    return @r;
535}
# Main SSSE3 loop. Each helper call below emits four SHA1 rounds, so the
# seventeen Xupdate/Xuplast calls cover rounds 0..67 and the three
# Xloop calls cover rounds 68..79 (twenty calls, eighty rounds).
536$code.=<<___;
537.align	16
538.Loop_ssse3:
539___
540	&Xupdate_ssse3_16_31(\&body_00_19);
541	&Xupdate_ssse3_16_31(\&body_00_19);
542	&Xupdate_ssse3_16_31(\&body_00_19);
543	&Xupdate_ssse3_16_31(\&body_00_19);
544	&Xupdate_ssse3_32_79(\&body_00_19);
545	&Xupdate_ssse3_32_79(\&body_20_39);
546	&Xupdate_ssse3_32_79(\&body_20_39);
547	&Xupdate_ssse3_32_79(\&body_20_39);
548	&Xupdate_ssse3_32_79(\&body_20_39);
549	&Xupdate_ssse3_32_79(\&body_20_39);
550	&Xupdate_ssse3_32_79(\&body_40_59);
551	&Xupdate_ssse3_32_79(\&body_40_59);
552	&Xupdate_ssse3_32_79(\&body_40_59);
553	&Xupdate_ssse3_32_79(\&body_40_59);
554	&Xupdate_ssse3_32_79(\&body_40_59);
555	&Xupdate_ssse3_32_79(\&body_20_39);
556	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
557
				# Snapshot the generator state here: the code at
				# .Ldone_ssse3 re-emits the final twelve rounds
				# starting from exactly this point.
558				$saved_j=$j; @saved_V=@V;
559				$saved_r=$r; @saved_rndkey=@rndkey;
560
561	&Xloop_ssse3(\&body_20_39);
562	&Xloop_ssse3(\&body_20_39);
563	&Xloop_ssse3(\&body_20_39);
564
# More input follows: store the last cipher block of this 64-byte chunk,
# advance the input pointer, fold the working variables into the SHA1
# context and loop.
565$code.=<<___;
566	movups	$iv,48($out,$in0)		# write output
567	lea	64($in0),$in0
568
569	add	0($ctx),$A			# update context
570	add	4($ctx),@T[0]
571	add	8($ctx),$C
572	add	12($ctx),$D
573	mov	$A,0($ctx)
574	add	16($ctx),$E
575	mov	@T[0],4($ctx)
576	mov	@T[0],$B			# magic seed
577	mov	$C,8($ctx)
578	mov	$D,12($ctx)
579	mov	$E,16($ctx)
580	jmp	.Loop_ssse3
581
582.align	16
583.Ldone_ssse3:
584___
				# Restore the snapshot so the tail below emits
				# the same rounds and AES steps as the Xloop
				# calls did, minus the input preparation.
585				$jj=$j=$saved_j; @V=@saved_V;
586				$r=$saved_r;     @rndkey=@saved_rndkey;
587
588	&Xtail_ssse3(\&body_20_39);
589	&Xtail_ssse3(\&body_20_39);
590	&Xtail_ssse3(\&body_20_39);
591
592$code.=<<___;
593	movups	$iv,48($out,$in0)		# write output
594	mov	88(%rsp),$ivp			# restore $ivp
595
596	add	0($ctx),$A			# update context
597	add	4($ctx),@T[0]
598	add	8($ctx),$C
599	mov	$A,0($ctx)
600	add	12($ctx),$D
601	mov	@T[0],4($ctx)
602	add	16($ctx),$E
603	mov	$C,8($ctx)
604	mov	$D,12($ctx)
605	mov	$E,16($ctx)
606	movups	$iv,($ivp)			# write IV
607___
608$code.=<<___ if ($win64);
609	movaps	96+0(%rsp),%xmm6
610	movaps	96+16(%rsp),%xmm7
611	movaps	96+32(%rsp),%xmm8
612	movaps	96+48(%rsp),%xmm9
613	movaps	96+64(%rsp),%xmm10
614	movaps	96+80(%rsp),%xmm11
615	movaps	96+96(%rsp),%xmm12
616	movaps	96+112(%rsp),%xmm13
617	movaps	96+128(%rsp),%xmm14
618	movaps	96+144(%rsp),%xmm15
619___
620$code.=<<___;
621	lea	`104+($win64?10*16:0)`(%rsp),%rsi
622	mov	0(%rsi),%r15
623	mov	8(%rsi),%r14
624	mov	16(%rsi),%r13
625	mov	24(%rsi),%r12
626	mov	32(%rsi),%rbp
627	mov	40(%rsi),%rbx
628	lea	48(%rsi),%rsp
629.Lepilogue_ssse3:
630	ret
631.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
632___
633
634$j=$jj=$r=$sn=0;
635
636if ($avx) {
# Register assignment for the AVX code path; the lexicals are declared
# afresh inside the if ($avx) block with the same initial values as in
# the SSSE3 path.
637my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
638
639my $Xi=4;
640my @X=map("%xmm$_",(4..7,0..3));
641my @Tx=map("%xmm$_",(8..10));
642my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
643my @T=("%esi","%edi");
644
# Rotate emitters for this flavour: the register is passed twice, so the
# rotate is emitted as shld/shrd with the same register as both
# register operands.
645my $_rol=sub { &shld(@_[0],@_) };
646my $_ror=sub { &shrd(@_[0],@_) };
647
648$code.=<<___;
649.type	aesni_cbc_sha1_enc_avx,\@function,6
650.align	16
651aesni_cbc_sha1_enc_avx:
652	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
653	#shr	\$6,$len			# debugging artefact
654	#jz	.Lepilogue_avx			# debugging artefact
655	push	%rbx
656	push	%rbp
657	push	%r12
658	push	%r13
659	push	%r14
660	push	%r15
661	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
662	#mov	$in0,$inp			# debugging artefact
663	#lea	64(%rsp),$ctx			# debugging artefact
664___
665$code.=<<___ if ($win64);
666	movaps	%xmm6,96+0(%rsp)
667	movaps	%xmm7,96+16(%rsp)
668	movaps	%xmm8,96+32(%rsp)
669	movaps	%xmm9,96+48(%rsp)
670	movaps	%xmm10,96+64(%rsp)
671	movaps	%xmm11,96+80(%rsp)
672	movaps	%xmm12,96+96(%rsp)
673	movaps	%xmm13,96+112(%rsp)
674	movaps	%xmm14,96+128(%rsp)
675	movaps	%xmm15,96+144(%rsp)
676.Lprologue_avx:
677___
678$code.=<<___;
679	vzeroall
680	mov	$in0,%r12			# reassign arguments
681	mov	$out,%r13
682	mov	$len,%r14
683	mov	$key,%r15
684	vmovdqu	($ivp),$iv			# load IV
685	mov	$ivp,88(%rsp)			# save $ivp
686___
687my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
688my $rounds="${ivp}d";
689$code.=<<___;
690	shl	\$6,$len
691	sub	$in0,$out
692	mov	240($key),$rounds
693	add	\$112,$key		# size optimization
694	add	$inp,$len		# end of input
695
696	lea	K_XX_XX(%rip),$K_XX_XX
697	mov	0($ctx),$A		# load context
698	mov	4($ctx),$B
699	mov	8($ctx),$C
700	mov	12($ctx),$D
701	mov	$B,@T[0]		# magic seed
702	mov	16($ctx),$E
703
704	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
705	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
706	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
707	vmovdqu	16($inp),@X[-3&7]
708	vmovdqu	32($inp),@X[-2&7]
709	vmovdqu	48($inp),@X[-1&7]
710	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
711	add	\$64,$inp
712	vpshufb	@X[2],@X[-3&7],@X[-3&7]
713	vpshufb	@X[2],@X[-2&7],@X[-2&7]
714	vpshufb	@X[2],@X[-1&7],@X[-1&7]
715	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
716	vpaddd	@Tx[1],@X[-3&7],@X[1]
717	vpaddd	@Tx[1],@X[-2&7],@X[2]
718	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
719	vmovdqa	@X[1],16(%rsp)
720	vmovdqa	@X[2],32(%rsp)
721	vmovups	-112($key),$rndkey0	# $key[0]
722	vmovups	16-112($key),$rndkey[0]	# forward reference
723	jmp	.Loop_avx
724___
725
# AVX flavour of the interleaved AES-CBC step emitter. $r counts the
# steps emitted so far; with ten steps per 16-byte block, $n is the block
# index and $k the step within that block. Every key displacement has
# -112 applied because the prologue advanced $key by 112. After every
# step the two @rndkey registers are swapped.
726my $aesenc=sub {
727  use integer;
728  my ($n,$k)=($r/10,$r%10);
    # First step of a block: load the input block, xor in the key held in
    # $rndkey0, store the previous block's result (for blocks after the
    # first), chain with $iv and do the first vaesenc.
729    if ($k==0) {
730      $code.=<<___;
731	vmovups		`16*$n`($in0),$in		# load input
732	vxorps		$rndkey0,$in,$in
733___
734      $code.=<<___ if ($n);
735	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
736___
737      $code.=<<___;
738	vxorps		$in,$iv,$iv
739	vaesenc		$rndkey[0],$iv,$iv
740	vmovups		`32+16*$k-112`($key),$rndkey[1]
741___
    # Last step of a block: depending on how $rounds compares with 11,
    # zero, two or four further vaesenc steps are executed before
    # vaesenclast; then the key at 16-112($key) is reloaded.
742    } elsif ($k==9) {
743      $sn++;
744      $code.=<<___;
745	cmp		\$11,$rounds
746	jb		.Lvaesenclast$sn
747	vaesenc		$rndkey[0],$iv,$iv
748	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
749	vaesenc		$rndkey[1],$iv,$iv
750	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
751	je		.Lvaesenclast$sn
752	vaesenc		$rndkey[0],$iv,$iv
753	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
754	vaesenc		$rndkey[1],$iv,$iv
755	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
756.Lvaesenclast$sn:
757	vaesenclast	$rndkey[0],$iv,$iv
758	vmovups		16-112($key),$rndkey[1]		# forward reference
759___
    # Any other step: one vaesenc plus the load of the following round key.
760    } else {
761      $code.=<<___;
762	vaesenc		$rndkey[0],$iv,$iv
763	vmovups		`32+16*$k-112`($key),$rndkey[1]
764___
765    }
766    $r++;	unshift(@rndkey,pop(@rndkey));
767};
768
# AVX flavour: emits the SIMD computation of the next four
# message-schedule words (rounds 16..31 form), interleaved with four
# SHA1 rounds. $body is a code reference returning one round as a list
# of Perl statement strings; each eval(shift(@insns)) emits one of them.
# On return @X and @Tx have been rotated by one and $Xi incremented.
769sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
770{ use integer;
771  my $body = shift;
772  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
773  my ($a,$b,$c,$d,$e);
774
775	 eval(shift(@insns));
776	 eval(shift(@insns));
777	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
778	 eval(shift(@insns));
779	 eval(shift(@insns));
780
781	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
782	 eval(shift(@insns));
783	 eval(shift(@insns));
784	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
785	 eval(shift(@insns));
786	 eval(shift(@insns));
787	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
788	 eval(shift(@insns));
789	 eval(shift(@insns));
790
791	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
792	 eval(shift(@insns));
793	 eval(shift(@insns));
794	 eval(shift(@insns));
795	 eval(shift(@insns));
796
797	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
798	 eval(shift(@insns));
799	 eval(shift(@insns));
800	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
801	 eval(shift(@insns));
802	 eval(shift(@insns));
803
804	&vpsrld	(@Tx[0],@X[0],31);
805	 eval(shift(@insns));
806	 eval(shift(@insns));
807	 eval(shift(@insns));
808	 eval(shift(@insns));
809
810	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
811	&vpaddd	(@X[0],@X[0],@X[0]);
812	 eval(shift(@insns));
813	 eval(shift(@insns));
814	 eval(shift(@insns));
815	 eval(shift(@insns));
816
817	&vpsrld	(@Tx[1],@Tx[2],30);
818	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
819	 eval(shift(@insns));
820	 eval(shift(@insns));
821	 eval(shift(@insns));
822	 eval(shift(@insns));
823
824	&vpslld	(@Tx[2],@Tx[2],2);
825	&vpxor	(@X[0],@X[0],@Tx[1]);
826	 eval(shift(@insns));
827	 eval(shift(@insns));
828	 eval(shift(@insns));
829	 eval(shift(@insns));
830
831	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
832	 eval(shift(@insns));
833	 eval(shift(@insns));
834	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
835	 eval(shift(@insns));
836	 eval(shift(@insns));
837
838
839	 foreach (@insns) { eval; }	# remaining instructions [if any]
840
841  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
842		push(@Tx,shift(@Tx));
843}
844
# AVX flavour: emits the SIMD computation of the next four
# message-schedule words (rounds 32..79 form), interleaved with four
# SHA1 rounds taken from $body. The K constant is either carried over
# in @Tx or, whenever $Xi is a multiple of 5, reloaded from the table at
# $K_XX_XX. On return @X and @Tx have been rotated by one and $Xi
# incremented.
845sub Xupdate_avx_32_79()
846{ use integer;
847  my $body = shift;
848  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
849  my ($a,$b,$c,$d,$e);
850
851	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
852	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
853	 eval(shift(@insns));		# body_20_39
854	 eval(shift(@insns));
855	 eval(shift(@insns));
856	 eval(shift(@insns));		# rol
857
858	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
859	 eval(shift(@insns));
	 # emit one more statement here only if it is not a rotate
860	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
861	if ($Xi%5) {
862	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
863	} else {			# ... or load next one
864	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
865	}
866	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
867	 eval(shift(@insns));		# ror
868	 eval(shift(@insns));
869
870	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
871	 eval(shift(@insns));		# body_20_39
872	 eval(shift(@insns));
873	 eval(shift(@insns));
874	 eval(shift(@insns));		# rol
875
876	&vpsrld	(@Tx[0],@X[0],30);
877	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
878	 eval(shift(@insns));
879	 eval(shift(@insns));
880	 eval(shift(@insns));		# ror
881	 eval(shift(@insns));
882
883	&vpslld	(@X[0],@X[0],2);
884	 eval(shift(@insns));		# body_20_39
885	 eval(shift(@insns));
886	 eval(shift(@insns));
887	 eval(shift(@insns));		# rol
888	 eval(shift(@insns));
889	 eval(shift(@insns));
890	 eval(shift(@insns));		# ror
891	 eval(shift(@insns));
892
893	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
894	 eval(shift(@insns));		# body_20_39
895	 eval(shift(@insns));
896	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
897	 eval(shift(@insns));
898	 eval(shift(@insns));		# rol
899	 eval(shift(@insns));
900	 eval(shift(@insns));
901	 eval(shift(@insns));		# rol
902	 eval(shift(@insns));
903
904	 foreach (@insns) { eval; }	# remaining instructions
905
906  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
907		push(@Tx,shift(@Tx));
908}
909
# Emits the last X[]+K transfer together with four SHA1 rounds from
# $body, then the end-of-input test: when $inp equals $len the emitted
# code jumps to .Ldone_avx. Otherwise it starts preloading and
# byte-swapping the next 64 bytes of input. @Tx is rotated back by one
# and $Xi is reset to 0 for the Xloop_avx calls that follow.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  # VEX-encoded store, like every other X[]+K transfer in the AVX
	  # path; a legacy-SSE movdqa here would mix encodings.
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
943
# AVX flavour: emits four SHA1 rounds from $body interleaved with
# preparation of the next input block: byte-swap one preloaded vector,
# add the K constant held in @Tx[1] to another (the sum goes to a
# separate register, so the input vector is left intact) and store that
# sum in stack slot 16*$Xi for the integer rounds. Increments $Xi.
944sub Xloop_avx()
945{ use integer;
946  my $body = shift;
947  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
948  my ($a,$b,$c,$d,$e);
949
950	 eval(shift(@insns));
951	 eval(shift(@insns));
952	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
953	 eval(shift(@insns));
954	 eval(shift(@insns));
955	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
956	 eval(shift(@insns));
957	 eval(shift(@insns));
958	 eval(shift(@insns));
959	 eval(shift(@insns));
960	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
961	 eval(shift(@insns));
962	 eval(shift(@insns));
963
964	foreach (@insns) { eval; }
965  $Xi++;
966}
967
# Emits four SHA1 rounds from $body with no SIMD work interleaved; used
# for the final rounds after the last input block.
968sub Xtail_avx()
969{ use integer;
970  my $body = shift;
971  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # targets of the '($a,$b,$c,$d,$e)=@V;' assignment in the eval'ed strings
972  my ($a,$b,$c,$d,$e);
973
974	foreach (@insns) { eval; }
975}
976
# Main AVX loop. Each helper call below emits four SHA1 rounds, so the
# seventeen Xupdate/Xuplast calls cover rounds 0..67 and the three
# Xloop calls cover rounds 68..79 (twenty calls, eighty rounds).
977$code.=<<___;
978.align	16
979.Loop_avx:
980___
981	&Xupdate_avx_16_31(\&body_00_19);
982	&Xupdate_avx_16_31(\&body_00_19);
983	&Xupdate_avx_16_31(\&body_00_19);
984	&Xupdate_avx_16_31(\&body_00_19);
985	&Xupdate_avx_32_79(\&body_00_19);
986	&Xupdate_avx_32_79(\&body_20_39);
987	&Xupdate_avx_32_79(\&body_20_39);
988	&Xupdate_avx_32_79(\&body_20_39);
989	&Xupdate_avx_32_79(\&body_20_39);
990	&Xupdate_avx_32_79(\&body_20_39);
991	&Xupdate_avx_32_79(\&body_40_59);
992	&Xupdate_avx_32_79(\&body_40_59);
993	&Xupdate_avx_32_79(\&body_40_59);
994	&Xupdate_avx_32_79(\&body_40_59);
995	&Xupdate_avx_32_79(\&body_40_59);
996	&Xupdate_avx_32_79(\&body_20_39);
997	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
998
				# Snapshot the generator state here: the code at
				# .Ldone_avx re-emits the final twelve rounds
				# starting from exactly this point.
999				$saved_j=$j; @saved_V=@V;
1000				$saved_r=$r; @saved_rndkey=@rndkey;
1001
1002	&Xloop_avx(\&body_20_39);
1003	&Xloop_avx(\&body_20_39);
1004	&Xloop_avx(\&body_20_39);
1005
# More input follows: store the last cipher block of this 64-byte chunk,
# advance the input pointer, fold the working variables into the SHA1
# context and loop.
1006$code.=<<___;
1007	vmovups	$iv,48($out,$in0)		# write output
1008	lea	64($in0),$in0
1009
1010	add	0($ctx),$A			# update context
1011	add	4($ctx),@T[0]
1012	add	8($ctx),$C
1013	add	12($ctx),$D
1014	mov	$A,0($ctx)
1015	add	16($ctx),$E
1016	mov	@T[0],4($ctx)
1017	mov	@T[0],$B			# magic seed
1018	mov	$C,8($ctx)
1019	mov	$D,12($ctx)
1020	mov	$E,16($ctx)
1021	jmp	.Loop_avx
1022
1023.align	16
1024.Ldone_avx:
1025___
				# Restore the snapshot so the tail below emits
				# the same rounds and AES steps as the Xloop
				# calls did, minus the input preparation.
1026				$jj=$j=$saved_j; @V=@saved_V;
1027				$r=$saved_r;     @rndkey=@saved_rndkey;
1028
1029	&Xtail_avx(\&body_20_39);
1030	&Xtail_avx(\&body_20_39);
1031	&Xtail_avx(\&body_20_39);
1032
1033$code.=<<___;
1034	vmovups	$iv,48($out,$in0)		# write output
1035	mov	88(%rsp),$ivp			# restore $ivp
1036
1037	add	0($ctx),$A			# update context
1038	add	4($ctx),@T[0]
1039	add	8($ctx),$C
1040	mov	$A,0($ctx)
1041	add	12($ctx),$D
1042	mov	@T[0],4($ctx)
1043	add	16($ctx),$E
1044	mov	$C,8($ctx)
1045	mov	$D,12($ctx)
1046	mov	$E,16($ctx)
1047	vmovups	$iv,($ivp)			# write IV
1048	vzeroall
1049___
1050$code.=<<___ if ($win64);
1051	movaps	96+0(%rsp),%xmm6
1052	movaps	96+16(%rsp),%xmm7
1053	movaps	96+32(%rsp),%xmm8
1054	movaps	96+48(%rsp),%xmm9
1055	movaps	96+64(%rsp),%xmm10
1056	movaps	96+80(%rsp),%xmm11
1057	movaps	96+96(%rsp),%xmm12
1058	movaps	96+112(%rsp),%xmm13
1059	movaps	96+128(%rsp),%xmm14
1060	movaps	96+144(%rsp),%xmm15
1061___
1062$code.=<<___;
1063	lea	`104+($win64?10*16:0)`(%rsp),%rsi
1064	mov	0(%rsi),%r15
1065	mov	8(%rsi),%r14
1066	mov	16(%rsi),%r13
1067	mov	24(%rsi),%r12
1068	mov	32(%rsi),%rbp
1069	mov	40(%rsi),%rbx
1070	lea	48(%rsi),%rsp
1071.Lepilogue_avx:
1072	ret
1073.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1074___
1075}
1076$code.=<<___;
1077.align	64
1078K_XX_XX:
1079.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1080.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1081.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1082.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1083.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1084
1085.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1086.align	64
1087___
1088
1089# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1090#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1091if ($win64) {
1092$rec="%rcx";
1093$frame="%rdx";
1094$context="%r8";
1095$disp="%r9";
1096
1097$code.=<<___;
1098.extern	__imp_RtlVirtualUnwind
1099.type	ssse3_handler,\@abi-omnipotent
1100.align	16
1101ssse3_handler:
1102	push	%rsi
1103	push	%rdi
1104	push	%rbx
1105	push	%rbp
1106	push	%r12
1107	push	%r13
1108	push	%r14
1109	push	%r15
1110	pushfq
1111	sub	\$64,%rsp
1112
1113	mov	120($context),%rax	# pull context->Rax
1114	mov	248($context),%rbx	# pull context->Rip
1115
1116	mov	8($disp),%rsi		# disp->ImageBase
1117	mov	56($disp),%r11		# disp->HandlerData
1118
1119	mov	0(%r11),%r10d		# HandlerData[0]
1120	lea	(%rsi,%r10),%r10	# prologue label
1121	cmp	%r10,%rbx		# context->Rip<prologue label
1122	jb	.Lcommon_seh_tail
1123
1124	mov	152($context),%rax	# pull context->Rsp
1125
1126	mov	4(%r11),%r10d		# HandlerData[1]
1127	lea	(%rsi,%r10),%r10	# epilogue label
1128	cmp	%r10,%rbx		# context->Rip>=epilogue label
1129	jae	.Lcommon_seh_tail
1130
1131	lea	96(%rax),%rsi
1132	lea	512($context),%rdi	# &context.Xmm6
1133	mov	\$20,%ecx
1134	.long	0xa548f3fc		# cld; rep movsq
1135	lea	`104+10*16`(%rax),%rax	# adjust stack pointer
1136
1137	mov	0(%rax),%r15
1138	mov	8(%rax),%r14
1139	mov	16(%rax),%r13
1140	mov	24(%rax),%r12
1141	mov	32(%rax),%rbp
1142	mov	40(%rax),%rbx
1143	lea	48(%rax),%rax
1144	mov	%rbx,144($context)	# restore context->Rbx
1145	mov	%rbp,160($context)	# restore context->Rbp
1146	mov	%r12,216($context)	# restore context->R12
1147	mov	%r13,224($context)	# restore context->R13
1148	mov	%r14,232($context)	# restore context->R14
1149	mov	%r15,240($context)	# restore context->R15
1150
1151.Lcommon_seh_tail:
1152	mov	8(%rax),%rdi
1153	mov	16(%rax),%rsi
1154	mov	%rax,152($context)	# restore context->Rsp
1155	mov	%rsi,168($context)	# restore context->Rsi
1156	mov	%rdi,176($context)	# restore context->Rdi
1157
1158	mov	40($disp),%rdi		# disp->ContextRecord
1159	mov	$context,%rsi		# context
1160	mov	\$154,%ecx		# sizeof(CONTEXT)
1161	.long	0xa548f3fc		# cld; rep movsq
1162
1163	mov	$disp,%rsi
1164	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1165	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1166	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1167	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1168	mov	40(%rsi),%r10		# disp->ContextRecord
1169	lea	56(%rsi),%r11		# &disp->HandlerData
1170	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1171	mov	%r10,32(%rsp)		# arg5
1172	mov	%r11,40(%rsp)		# arg6
1173	mov	%r12,48(%rsp)		# arg7
1174	mov	%rcx,56(%rsp)		# arg8, (NULL)
1175	call	*__imp_RtlVirtualUnwind(%rip)
1176
1177	mov	\$1,%eax		# ExceptionContinueSearch
1178	add	\$64,%rsp
1179	popfq
1180	pop	%r15
1181	pop	%r14
1182	pop	%r13
1183	pop	%r12
1184	pop	%rbp
1185	pop	%rbx
1186	pop	%rdi
1187	pop	%rsi
1188	ret
1189.size	ssse3_handler,.-ssse3_handler
1190
1191.section	.pdata
1192.align	4
1193	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
1194	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
1195	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
1196___
1197$code.=<<___ if ($avx);
1198	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
1199	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
1200	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
1201___
1202$code.=<<___;
1203.section	.xdata
1204.align	8
1205.LSEH_info_aesni_cbc_sha1_enc_ssse3:
1206	.byte	9,0,0,0
1207	.rva	ssse3_handler
1208	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
1209___
1210$code.=<<___ if ($avx);
1211.LSEH_info_aesni_cbc_sha1_enc_avx:
1212	.byte	9,0,0,0
1213	.rva	ssse3_handler
1214	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1215___
1216}
1217
1218####################################################################
# Appends a REX prefix byte to the opcode array referenced by the first
# argument when either register number is 8 or above. $dst ends up in
# the ModR/M reg field (REX bit 0x04), $src in the r/m field (REX bit
# 0x01). Nothing is appended when both registers are below 8.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @$opcode,$rex|0x40	if($rex);
}

# Translates a register-to-register "aesenc"/"aesenclast" line into a
# ".byte" directive holding the encoded instruction, so that assemblers
# without AES-NI support can still build the output.
# Any line that is not a recognised instruction is returned unchanged
# and left for the assembler to deal with. In particular an aes*
# mnemonic missing from the table is passed through rather than being
# replaced by undef, which would silently delete the instruction from
# the generated code.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);		# mandatory operand-size prefix

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd
	);
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1245
# Post-process the accumulated assembly text: first evaluate every
# `...` span as a Perl expression (constant folding of offsets and
# masks), then replace AES-NI instructions by their .byte encoding.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
# STDOUT is a pipe to the translator: close waits for it and reports a
# failure if the translator could not be run or exited with an error,
# so a broken translation no longer passes for a successful build.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";
1251