aesni-sha1-x86_64.pl revision 392aa7cc7d2b122614c5393c3e357da07fd07af3
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2011
#
# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and thus better performance.
# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and the
# AESNI code is woven into them. Below are performance numbers in
# cycles per processed byte (less is better) for standalone AESNI-CBC
# encrypt, for the sum of the latter and standalone SHA1, and for the
# "stitched" subroutine:
#
#		AES-128-CBC	+SHA1		stitch      gain
# Westmere	3.77[+5.6]	9.37		6.65	    +41%
# Sandy Bridge	5.05[+5.2(6.3)]	10.25(11.35)	6.16(7.08)  +67%(+60%)
#
#		AES-192-CBC
# Westmere	4.51		10.11		6.97	    +45%
# Sandy Bridge	6.05		11.25(12.35)	6.34(7.27)  +77%(+70%)
#
#		AES-256-CBC
# Westmere	5.25		10.85		7.25	    +50%
# Sandy Bridge	7.05		12.25(13.35)	7.06(7.70)  +74%(+73%)
#
# (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
#	background information. The numbers in parentheses above are SSSE3
#	results collected on an AVX-capable CPU, i.e. they apply to OSes
#	that don't support AVX.
#
# Needless to say, it makes no sense to implement a "stitched"
# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
# fully utilize instruction-level parallelism, so stitching would not
# give any gain. Well, there might be some, e.g. from better cache
# locality... For reference, here are performance results for
# standalone AESNI-CBC decrypt:
#
#		AES-128-CBC	AES-192-CBC	AES-256-CBC
# Westmere	1.31		1.55		1.80
# Sandy Bridge	0.93		1.06		1.22
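#
# A typical invocation (illustrative; the flavour argument follows
# x86_64-xlate.pl conventions, e.g. "elf", "macosx", "mingw64", "nasm"):
#
#	perl aesni-sha1-x86_64.pl elf aesni-sha1-x86_64.s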

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);

open STDOUT,"| $^X $xlate $flavour $output";

# void aesni_cbc_sha1_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA_CTX *ctx,
#			const void *in0);
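#
# An illustrative (hypothetical) call, encrypting and hashing the same
# buffer; note that, judging by the "shl \$6,$len" below, length is
# counted in 64-byte blocks, and the caller is expected to have checked
# the SSSE3 and AES-NI capability bits beforehand:
#
#	aesni_cbc_sha1_enc(in, out, blocks, &aes_key, ivec, &sha_ctx, in);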

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	aesni_cbc_sha1_enc
.type	aesni_cbc_sha1_enc,\@abi-omnipotent
.align	16
aesni_cbc_sha1_enc:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni_cbc_sha1_enc_avx
___
$code.=<<___;
	jmp	aesni_cbc_sha1_enc_ssse3
	ret
.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0; my $jj=0; my $r=0; my $sn=0;
my $K_XX_XX="%r11";
my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
my @rndkey=("%xmm14","%xmm15");

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
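# For example, &palignr(@X[0],@X[-4&7],8) appends the line
# "palignr	$8,%xmm0,%xmm4" to $code: the operands come out in
# reversed (AT&T) order and a bare numeric argument becomes an
# immediate.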

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	aesni_cbc_sha1_enc_ssse3,\@function,6
.align	16
aesni_cbc_sha1_enc_ssse3:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_ssse3		# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	mov	$key,%r15
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movups	($key),$rndkey0		# $key[0]
	movups	16($key),$rndkey[0]	# forward reference
	jmp	.Loop_ssse3
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	aesenc		$rndkey[0],$iv
	movups		`32+16*$k`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	aesenc		$rndkey[0],$iv
	movups		`32+16*$k`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};
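# Each call of $aesenc emits one AES round to be interleaved with the
# SHA1 round code: the $k==0 arm loads the next input block, folds it
# into the running IV together with round key 0 (CBC chaining combined
# with whitening) and writes out the previous ciphertext block, while
# the $k==9 arm finishes the block, branching on the key's round count
# so the extra rounds of AES-192/AES-256 run only when present.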

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
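# Xupdate_ssse3_16_31 computes four schedule words per call, i.e.
# W[i] = (W[i-3]^W[i-8]^W[i-14]^W[i-16])<<<1 for 16<=i<32. W[i-3] of
# the fourth lane is produced in the same vector, so that lane is
# XORed with the term zeroed and the missing contribution, i.e. the
# first lane rotated left once more, is patched in by the final "<<<2"
# step; the X[]+K sums are spilled to the stack for the integer SHA1
# rounds.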

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {
  use integer;
  my ($k,$n);
  my @r=(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor	($c,$d);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&xor	($c,$d);',	# restore $c
	'&xor	(@T[0],$d);',
	'&add	($e,$a);',
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
	$n = scalar(@r);
	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;
    return @r;
}
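# body_00_19 evaluates the round function for SHA1 rounds 0-19,
# Ch(b,c,d) = (b&c)|(~b&d), in the equivalent form (b&(c^d))^d, which
# avoids computing ~b; e.g. b=1,c=0,d=1 gives (1&(0^1))^1 = 0 =
# (1&0)|(0&1). The '&$aesenc();' hook spreads the 12 AES rounds evenly
# over these 20 SHA1 rounds.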

sub body_20_39 () {
  use integer;
  my ($k,$n);
  my @r=(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&xor	(@T[0],$d);',	# ($b^$d)
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&xor	(@T[0],$c);',	# ($b^$d^$c)
	'&add	($e,$a);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
	$n = scalar(@r);
	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;
    return @r;
}

sub body_40_59 () {
  use integer;
  my ($k,$n);
  my @r=(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
	$n = scalar(@r);
	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;
    return @r;
}
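# body_40_59 evaluates Maj(b,c,d) = (b&c)|(b&d)|(c&d) as the sum
# (c&d) + (b&(c^d)): the two terms are bitwise disjoint, so the adds
# produce no carries and the sum equals the OR, letting both terms be
# folded straight into $e with plain adds.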
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	$iv,($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___

$j=$jj=$r=$sn=0;

if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	aesni_cbc_sha1_enc_avx,\@function,6
.align	16
aesni_cbc_sha1_enc_avx:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_avx			# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	mov	$key,%r15
	vmovdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240($key),$rounds
	add	\$112,$key		# size optimization
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey0	# $key[0]
	vmovups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_avx
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	vmovups		`16*$n`($in0),$in		# load input
	vxorps		$rndkey0,$in,$in
___
      $code.=<<___ if ($n);
	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	vxorps		$in,$iv,$iv
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
	je		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
	vaesenclast	$rndkey[0],$iv,$iv
	vmovups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_avx

.align	16
.Ldone_avx:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	$iv,($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
___
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
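	# the four K constants above are floor(2^30*sqrt(t)), t = 2, 3, 5, 10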
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @opcode,$rex|0x40	if($rex);
}

sub aesni {
  my $line=shift;
  my @opcode=(0x66);

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
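# For example, "aesenc	%xmm15,%xmm11" is rewritten by the substitution
# below as ".byte	0x66,0x45,0x0f,0x38,0xdc,0xdf", so the module can
# be assembled even by assemblers without AES-NI support.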

$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
close STDOUT;