sha1-x86_64.pl revision 04ef91b390dfcc6125913e2f2af502d23d7a5112
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# good as 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27#		gcc 3.4		32-bit asm	cycles/byte
28# Opteron	+45%		+20%		6.8
29# Xeon P4	+65%		+0%		9.9
30# Core2		+60%		+10%		7.0
31
32# August 2009.
33#
34# The code was revised to minimize code size and to maximize
35# "distance" between instructions producing input to 'lea'
36# instruction and the 'lea' instruction itself, which is essential
37# for Intel Atom core.
38
39# October 2010.
40#
41# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42# is to offload message schedule denoted by Wt in NIST specification,
43# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44# for background and implementation details. The only difference from
45# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46# to free temporary registers.
47
48# April 2011.
49#
50# Add AVX code path. See sha1-586.pl for further information.
51
52######################################################################
53# Current performance is summarized in following table. Numbers are
54# CPU clock cycles spent to process single byte (less is better).
55#
56#		x86_64		SSSE3		AVX
57# P4		9.8		-
58# Opteron	6.6		-
59# Core2		6.7		6.1/+10%	-
60# Atom		11.0		9.7/+13%	-
61# Westmere	7.1		5.6/+27%	-
62# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
63
# Command line: [flavour] output.  If the first argument contains a dot it
# is taken to be the output file name and the flavour is left undefined.
64$flavour = shift;
65$output  = shift;
66if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
67
# Win64 code generation is selected by a nasm/masm/mingw64 flavour or by
# an output file name ending in ".asm".
68$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
69
# Locate the x86_64-xlate.pl translator: first in the directory of this
# script, then in ../../perlasm relative to it.
70$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
72( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
73die "can't locate x86_64-xlate.pl";
74
# The AVX code path is generated only when the assembler reports a version
# at or above the thresholds below: GNU assembler 2.19 (probed through
# $ENV{CC}), or on Win64 nasm 2.09 or ml64 version 10.  The captured
# version text is compared numerically.
75$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
76		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
77	   $1>=2.19);
78$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
79	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
80	   $1>=2.09);
81$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
82	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
83	   $1>=10);
84
# Everything printed to STDOUT from here on is piped through the translator.
# Fail loudly if the pipe cannot be started: otherwise the generated code
# would be printed to an unopened handle and silently lost.
85open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
86*STDOUT=*OUT;
87
# Registers in which the three arguments arrive; the emitted prologues
# copy %rdi/%rsi/%rdx into the reassigned registers below.
88$ctx="%rdi";	# 1st arg
89$inp="%rsi";	# 2nd arg
90$num="%rdx";	# 3rd arg
91
92# reassign arguments in order to produce more compact code
93$ctx="%r8";
94$inp="%r9";
95$num="%r10";
96
# Register allocation for the integer-only code path: three temporaries,
# a two-entry rotating pair @xi holding the current and next message
# word, and the five working variables A..E.
97$t0="%eax";
98$t1="%ebx";
99$t2="%ecx";
100@xi=("%edx","%ebp");
101$A="%esi";
102$B="%edi";
103$C="%r11d";
104$D="%r12d";
105$E="%r13d";
106
# @V is rotated by the round-generation loops so that each round sees the
# working variables in its own (a,b,c,d,e) order.
107@V=($A,$B,$C,$D,$E);
108
108
# BODY_00_19($i,$a,$b,$c,$d,$e)
#
# Appends to $code the instructions for round $i of the integer-only path,
# for rounds generated with the constant 0x5a827999.  $a..$e name the
# registers holding the working variables for this round.
#
# The emitted round computes t0 = (($c ^ $d) & $b) ^ $d, adds the constant,
# the current message word ($xi[0]), $a rotated left by 5 and t0 into $e,
# and rotates $b left by 30.
#
#  - $i == 0 additionally loads and byte-swaps the first input word and
#    stores it at 0(%rsp).
#  - $i < 15 loads and byte-swaps the next input word into $xi[1] and
#    stores it in the 16-word buffer at %rsp.
#  - $i >= 15 instead derives the next word from the buffer: the XOR of the
#    slots at ($j), ($j+2), ($j+8) and ($j+13) modulo 16, rotated left by 1,
#    written back to slot $j%16.
#
# On return @xi is rotated so that the word just prepared becomes $xi[0].
109sub BODY_00_19 {
110my ($i,$a,$b,$c,$d,$e)=@_;
111my $j=$i+1;
112$code.=<<___ if ($i==0);
113	mov	`4*$i`($inp),$xi[0]
114	bswap	$xi[0]
115	mov	$xi[0],`4*$i`(%rsp)
116___
117$code.=<<___ if ($i<15);
118	mov	$c,$t0
119	mov	`4*$j`($inp),$xi[1]
120	mov	$a,$t2
121	xor	$d,$t0
122	bswap	$xi[1]
123	rol	\$5,$t2
124	lea	0x5a827999($xi[0],$e),$e
125	and	$b,$t0
126	mov	$xi[1],`4*$j`(%rsp)
127	add	$t2,$e
128	xor	$d,$t0
129	rol	\$30,$b
130	add	$t0,$e
131___
132$code.=<<___ if ($i>=15);
133	mov	`4*($j%16)`(%rsp),$xi[1]
134	mov	$c,$t0
135	mov	$a,$t2
136	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
137	xor	$d,$t0
138	rol	\$5,$t2
139	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
140	and	$b,$t0
141	lea	0x5a827999($xi[0],$e),$e
142	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
143	xor	$d,$t0
144	rol	\$1,$xi[1]
145	add	$t2,$e
146	rol	\$30,$b
147	mov	$xi[1],`4*($j%16)`(%rsp)
148	add	$t0,$e
149___
150unshift(@xi,pop(@xi));
151}
152
# BODY_20_39($i,$a,$b,$c,$d,$e)
#
# Appends to $code the instructions for round $i of the integer-only path
# using the XOR function t0 = $c ^ $b ^ $d.  The caller uses it for rounds
# 20..39 and 60..79; the constant is 0x6ed9eba1 when $i < 40 and
# 0xca62c1d6 otherwise.
#
#  - $i < 79 also computes the next message word into $xi[1] from the
#    16-word buffer at %rsp (same recurrence as in BODY_00_19).
#  - The new word is written back to the buffer only when $i < 76.
#    NOTE(review): presumably the last words are never re-read from
#    memory -- confirm before changing the bound.
#  - $i == 79 emits the final round with no message-schedule work.
#
# On return @xi is rotated so that the word just prepared becomes $xi[0].
153sub BODY_20_39 {
154my ($i,$a,$b,$c,$d,$e)=@_;
155my $j=$i+1;
156my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
157$code.=<<___ if ($i<79);
158	mov	`4*($j%16)`(%rsp),$xi[1]
159	mov	$c,$t0
160	mov	$a,$t2
161	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
162	xor	$b,$t0
163	rol	\$5,$t2
164	lea	$K($xi[0],$e),$e
165	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
166	xor	$d,$t0
167	add	$t2,$e
168	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
169	rol	\$30,$b
170	add	$t0,$e
171	rol	\$1,$xi[1]
172___
173$code.=<<___ if ($i<76);
174	mov	$xi[1],`4*($j%16)`(%rsp)
175___
176$code.=<<___ if ($i==79);
177	mov	$c,$t0
178	mov	$a,$t2
179	xor	$b,$t0
180	lea	$K($xi[0],$e),$e
181	rol	\$5,$t2
182	xor	$d,$t0
183	add	$t2,$e
184	rol	\$30,$b
185	add	$t0,$e
186___
187unshift(@xi,pop(@xi));
188}
189
# BODY_40_59($i,$a,$b,$c,$d,$e)
#
# Appends to $code the instructions for round $i of the integer-only path
# with the constant 0x8f1bbcdc.  The round function is accumulated into $e
# as two separate terms, t0 = $c & $d and t1 = ($c ^ $d) & $b, together
# with the current message word ($xi[0]) and $a rotated left by 5; $b is
# rotated left by 30.
#
# The next message word is derived from the 16-word buffer at %rsp into
# $xi[1] and written back to slot $j%16.  On return @xi is rotated so that
# the word just prepared becomes $xi[0].
190sub BODY_40_59 {
191my ($i,$a,$b,$c,$d,$e)=@_;
192my $j=$i+1;
193$code.=<<___;
194	mov	`4*($j%16)`(%rsp),$xi[1]
195	mov	$c,$t0
196	mov	$c,$t1
197	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
198	and	$d,$t0
199	mov	$a,$t2
200	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
201	xor	$d,$t1
202	lea	0x8f1bbcdc($xi[0],$e),$e
203	rol	\$5,$t2
204	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
205	add	$t0,$e
206	and	$b,$t1
207	rol	\$1,$xi[1]
208	add	$t1,$e
209	rol	\$30,$b
210	mov	$xi[1],`4*($j%16)`(%rsp)
211	add	$t2,$e
212___
213unshift(@xi,pop(@xi));
214}
215
# Public entry point sha1_block_data_order.  It reads two words of
# OPENSSL_ia32cap_P and dispatches: if bit 9 of the second word is clear
# it falls into the integer-only code at .Lialu, otherwise it jumps to the
# SIMD implementations emitted further down.
216$code.=<<___;
217.text
218.extern	OPENSSL_ia32cap_P
219
220.globl	sha1_block_data_order
221.type	sha1_block_data_order,\@function,3
222.align	16
223sha1_block_data_order:
224	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
225	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
226	test	\$`1<<9`,%r8d		# check SSSE3 bit
227	jz	.Lialu
228___
# Only when the assembler can handle AVX: take the AVX path if both bit 28
# of the second capability word and bit 30 of the first one are set.
229$code.=<<___ if ($avx);
230	and	\$`1<<28`,%r8d		# mask AVX bit
231	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
232	or	%r9d,%r8d
233	cmp	\$`1<<28|1<<30`,%r8d
234	je	_avx_shortcut
235___
# Integer-only path: save %rbx/%rbp/%r12/%r13, carve out a 64-byte aligned
# 16-word buffer on the stack, remember the original stack pointer just
# above that buffer, and load the five state words from the context.
236$code.=<<___;
237	jmp	_ssse3_shortcut
238
239.align	16
240.Lialu:
241	push	%rbx
242	push	%rbp
243	push	%r12
244	push	%r13
245	mov	%rsp,%r11
246	mov	%rdi,$ctx	# reassigned argument
247	sub	\$`8+16*4`,%rsp
248	mov	%rsi,$inp	# reassigned argument
249	and	\$-64,%rsp
250	mov	%rdx,$num	# reassigned argument
251	mov	%r11,`16*4`(%rsp)
252.Lprologue:
253
254	mov	0($ctx),$A
255	mov	4($ctx),$B
256	mov	8($ctx),$C
257	mov	12($ctx),$D
258	mov	16($ctx),$E
259	jmp	.Lloop
260
261.align	16
262.Lloop:
263___
# Unroll all 80 rounds.  @V is rotated after every round so the register
# that received the new value becomes "a" for the next round.  Rounds
# 60..79 reuse BODY_20_39, which picks its constant from $i.
264for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
265for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
266for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
267for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Add the previous state into the working variables, store the result back,
# advance to the next 64-byte block and loop while $num is non-zero; then
# restore the saved registers and the original stack pointer.
268$code.=<<___;
269	add	0($ctx),$A
270	add	4($ctx),$B
271	add	8($ctx),$C
272	add	12($ctx),$D
273	add	16($ctx),$E
274	mov	$A,0($ctx)
275	mov	$B,4($ctx)
276	mov	$C,8($ctx)
277	mov	$D,12($ctx)
278	mov	$E,16($ctx)
279
280	sub	\$1,$num
281	lea	`16*4`($inp),$inp
282	jnz	.Lloop
283
284	mov	`16*4`(%rsp),%rsi
285	mov	(%rsi),%r13
286	mov	8(%rsi),%r12
287	mov	16(%rsi),%rbp
288	mov	24(%rsi),%rbx
289	lea	32(%rsi),%rsp
290.Lepilogue:
291	ret
292.size	sha1_block_data_order,.-sha1_block_data_order
293___
294{{{
# Generator state for the SSSE3 code path.
#   $Xi      - index of the next 4-word message vector to produce
#   @X       - eight XMM registers used as a rotating window of message data
#   @Tx      - three rotating XMM temporaries
#   @V, @T   - rotating lists of working-variable and temporary registers
#   $j       - round counter advanced by the body_XX_XX snippets
#   $K_XX_XX - register holding the address of the constant table
# Note that the list assignment also overwrites the global $A..$E with the
# register names used by this path.
295my $Xi=4;
296my @X=map("%xmm$_",(4..7,0..3));
297my @Tx=map("%xmm$_",(8..10));
298my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
299my @T=("%esi","%edi");
300my $j=0;
301my $K_XX_XX="%r11";
302
# Rotate emitters called from the body_XX_XX snippets; here they emit plain
# rol/ror through the AUTOLOAD thunk.
303my $_rol=sub { &rol(@_) };
304my $_ror=sub { &ror(@_) };
305
# Function prologue: save %rbx/%rbp/%r12 and reserve 64 bytes of stack for
# the X[]+K transfer area, plus room for five XMM registers on Win64.
306$code.=<<___;
307.type	sha1_block_data_order_ssse3,\@function,3
308.align	16
309sha1_block_data_order_ssse3:
310_ssse3_shortcut:
311	push	%rbx
312	push	%rbp
313	push	%r12
314	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
315___
# Win64 only: preserve %xmm6-%xmm10 above the transfer area.  The
# .Lprologue_ssse3 label is referenced by the SEH handler data.
316$code.=<<___ if ($win64);
317	movaps	%xmm6,64+0(%rsp)
318	movaps	%xmm7,64+16(%rsp)
319	movaps	%xmm8,64+32(%rsp)
320	movaps	%xmm9,64+48(%rsp)
321	movaps	%xmm10,64+64(%rsp)
322.Lprologue_ssse3:
323___
# Turn the block count into an end-of-input pointer ($num = $inp + 64*$num),
# load the state, then load and byte-swap the first 64-byte block and store
# the first three vectors with K_00_19 added into the transfer area.
324$code.=<<___;
325	mov	%rdi,$ctx	# reassigned argument
326	mov	%rsi,$inp	# reassigned argument
327	mov	%rdx,$num	# reassigned argument
328
329	shl	\$6,$num
330	add	$inp,$num
331	lea	K_XX_XX(%rip),$K_XX_XX
332
333	mov	0($ctx),$A		# load context
334	mov	4($ctx),$B
335	mov	8($ctx),$C
336	mov	12($ctx),$D
337	mov	$B,@T[0]		# magic seed
338	mov	16($ctx),$E
339
340	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
341	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
342	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
343	movdqu	16($inp),@X[-3&7]
344	movdqu	32($inp),@X[-2&7]
345	movdqu	48($inp),@X[-1&7]
346	pshufb	@X[2],@X[-4&7]		# byte swap
347	add	\$64,$inp
348	pshufb	@X[2],@X[-3&7]
349	pshufb	@X[2],@X[-2&7]
350	pshufb	@X[2],@X[-1&7]
351	paddd	@Tx[1],@X[-4&7]		# add K_00_19
352	paddd	@Tx[1],@X[-3&7]
353	paddd	@Tx[1],@X[-2&7]
354	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
355	psubd	@Tx[1],@X[-4&7]		# restore X[]
356	movdqa	@X[-3&7],16(%rsp)
357	psubd	@Tx[1],@X[-3&7]
358	movdqa	@X[-2&7],32(%rsp)
359	psubd	@Tx[1],@X[-2&7]
360	jmp	.Loop_ssse3
361___
362
# Catch-all for calls to undefined subs such as &movdqa(...) or &add(...):
# the sub name becomes the instruction mnemonic and one line of assembly is
# appended to $code.  Callers pass operands destination-first; the line is
# emitted with the operand order reversed, i.e. the last argument first.
# A last argument that is purely numeric is prefixed with '$' to make it an
# immediate.
363sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
364{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;	# strip the package qualifier
365  my $arg = pop;
366    $arg = "\$$arg" if ($arg*1 eq $arg);	# numeric => immediate operand
367    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
368}
369
# Xupdate_ssse3_16_31($body)
#
# Emits the SSSE3 computation of one new 4-word message vector for the
# early part of the schedule, interleaved with four scalar rounds.  $body
# is a reference to one of the body_XX_XX subs; it is called four times and
# the returned Perl snippets are eval'ed one by one between the SIMD
# instructions, so the position of every eval() below determines the
# instruction scheduling of the generated code.
#
# Side effects: stores the previous vector plus K into the transfer area on
# the stack, preloads the constant for the following step from the table at
# offset 16*($Xi/5), increments $Xi and rotates @X and @Tx.
370sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
371{ use integer;
372  my $body = shift;
373  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
374  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
375
376	&movdqa	(@X[0],@X[-3&7]);
377	 eval(shift(@insns));
378	 eval(shift(@insns));
379	&movdqa	(@Tx[0],@X[-1&7]);
380	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
381	 eval(shift(@insns));
382	 eval(shift(@insns));
383
384	  &paddd	(@Tx[1],@X[-1&7]);
385	 eval(shift(@insns));
386	 eval(shift(@insns));
387	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
388	 eval(shift(@insns));
389	 eval(shift(@insns));
390	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
391	 eval(shift(@insns));
392	 eval(shift(@insns));
393
394	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
395	 eval(shift(@insns));
396	 eval(shift(@insns));
397	 eval(shift(@insns));
398	 eval(shift(@insns));
399
400	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
401	 eval(shift(@insns));
402	 eval(shift(@insns));
403	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
404	 eval(shift(@insns));
405	 eval(shift(@insns));
406
407	&movdqa	(@Tx[2],@X[0]);
408	&movdqa	(@Tx[0],@X[0]);
409	 eval(shift(@insns));
410	 eval(shift(@insns));
411	 eval(shift(@insns));
412	 eval(shift(@insns));
413
414	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
415	&paddd	(@X[0],@X[0]);
416	 eval(shift(@insns));
417	 eval(shift(@insns));
418	 eval(shift(@insns));
419	 eval(shift(@insns));
420
421	&psrld	(@Tx[0],31);
422	 eval(shift(@insns));
423	 eval(shift(@insns));
424	&movdqa	(@Tx[1],@Tx[2]);
425	 eval(shift(@insns));
426	 eval(shift(@insns));
427
428	&psrld	(@Tx[2],30);
429	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
430	 eval(shift(@insns));
431	 eval(shift(@insns));
432	 eval(shift(@insns));
433	 eval(shift(@insns));
434
435	&pslld	(@Tx[1],2);
436	&pxor	(@X[0],@Tx[2]);
437	 eval(shift(@insns));
438	 eval(shift(@insns));
439	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
440	 eval(shift(@insns));
441	 eval(shift(@insns));
442
443	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
444
445	 foreach (@insns) { eval; }	# remaining instructions [if any]
446
447  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
448		push(@Tx,shift(@Tx));
449}
450
# Xupdate_ssse3_32_79($body)
#
# Emits the SSSE3 computation of one new 4-word message vector for the
# later part of the schedule (the form that rotates by 2), interleaved with
# four scalar rounds taken from $body (a reference to a body_XX_XX sub).
# The snippets are eval'ed one at a time, so the placement of each eval()
# fixes the scheduling of the generated code.
#
# The round constant is carried over from the previous step unless $Xi is a
# multiple of 5, in which case the next one is loaded from the table.  The
# copy of the new vector into @Tx[1] is skipped once $Xi reaches 19.
# Increments $Xi and rotates @X and @Tx on return.
451sub Xupdate_ssse3_32_79()
452{ use integer;
453  my $body = shift;
454  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
455  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
456
457	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
458	 eval(shift(@insns));		# body_20_39
459	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
460	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
461	 eval(shift(@insns));
462	 eval(shift(@insns));
463	 eval(shift(@insns));		# rol
464
465	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
466	 eval(shift(@insns));
	 # emit one more scalar instruction here unless it is a rotate
467	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
468	if ($Xi%5) {
469	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
470	} else {			# ... or load next one
471	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
472	}
473	  &paddd	(@Tx[1],@X[-1&7]);
474	 eval(shift(@insns));		# ror
475	 eval(shift(@insns));
476
477	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
478	 eval(shift(@insns));		# body_20_39
479	 eval(shift(@insns));
480	 eval(shift(@insns));
481	 eval(shift(@insns));		# rol
482
483	&movdqa	(@Tx[0],@X[0]);
484	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
485	 eval(shift(@insns));
486	 eval(shift(@insns));
487	 eval(shift(@insns));		# ror
488	 eval(shift(@insns));
489
490	&pslld	(@X[0],2);
491	 eval(shift(@insns));		# body_20_39
492	 eval(shift(@insns));
493	&psrld	(@Tx[0],30);
494	 eval(shift(@insns));
495	 eval(shift(@insns));		# rol
496	 eval(shift(@insns));
497	 eval(shift(@insns));
498	 eval(shift(@insns));		# ror
499	 eval(shift(@insns));
500
501	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
502	 eval(shift(@insns));		# body_20_39
503	 eval(shift(@insns));
504	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
505	 eval(shift(@insns));
506	 eval(shift(@insns));		# rol
507	 eval(shift(@insns));
508	 eval(shift(@insns));
509	 eval(shift(@insns));		# rol
510	 eval(shift(@insns));
511
512	 foreach (@insns) { eval; }	# remaining instructions
513
514  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
515		push(@Tx,shift(@Tx));
516}
517
# Xuplast_ssse3_80($body)
#
# Emits the last message-schedule step of a block for the SSSE3 path: adds
# K to the final vector and stores it in the transfer area, interleaved
# with four scalar rounds from $body.  It then emits the end-of-input test
# (compare $inp with $num, jump to .Ldone_ssse3 when equal) followed by the
# start of the next block's processing: reload the byte-swap mask and
# K_00_19, load the next 64 input bytes, byte-swap the first vector and
# advance $inp.  @Tx is rotated back by one position and $Xi is reset to 0
# for the Xloop_ssse3 calls that follow.
518sub Xuplast_ssse3_80()
519{ use integer;
520  my $body = shift;
521  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
522  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
523
524	 eval(shift(@insns));
525	  &paddd	(@Tx[1],@X[-1&7]);
526	 eval(shift(@insns));
527	 eval(shift(@insns));
528	 eval(shift(@insns));
529	 eval(shift(@insns));
530
531	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
532
533	 foreach (@insns) { eval; }		# remaining instructions
534
535	&cmp	($inp,$num);
536	&je	(".Ldone_ssse3");
537
538	unshift(@Tx,pop(@Tx));
539
540	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
541	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
542	&movdqu	(@X[-4&7],"0($inp)");		# load input
543	&movdqu	(@X[-3&7],"16($inp)");
544	&movdqu	(@X[-2&7],"32($inp)");
545	&movdqu	(@X[-1&7],"48($inp)");
546	&pshufb	(@X[-4&7],@X[2]);		# byte swap
547	&add	($inp,64);
548
549  $Xi=0;
550}
551
# Xloop_ssse3($body)
#
# Emits four scalar rounds from $body interleaved with preparation of the
# next block's input: byte-swap the following input vector, add K to the
# current one, store the sum at 16*$Xi(%rsp) in the transfer area and then
# subtract K again to recover the plain message words.  Increments $Xi.
552sub Xloop_ssse3()
553{ use integer;
554  my $body = shift;
555  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
556  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
557
558	 eval(shift(@insns));
559	 eval(shift(@insns));
560	&pshufb	(@X[($Xi-3)&7],@X[2]);
561	 eval(shift(@insns));
562	 eval(shift(@insns));
563	&paddd	(@X[($Xi-4)&7],@Tx[1]);
564	 eval(shift(@insns));
565	 eval(shift(@insns));
566	 eval(shift(@insns));
567	 eval(shift(@insns));
568	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
569	 eval(shift(@insns));
570	 eval(shift(@insns));
571	&psubd	(@X[($Xi-4)&7],@Tx[1]);
572
573	foreach (@insns) { eval; }
574  $Xi++;
575}
576
# Xtail_ssse3($body)
#
# Emits four scalar rounds from $body with no SIMD work interleaved; used
# for the final rounds after the last input block.
577sub Xtail_ssse3()
578{ use integer;
579  my $body = shift;
580  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
581  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
582
583	foreach (@insns) { eval; }
584}
585
# Returns a list of ten Perl snippets which, when eval'ed in order by one
# of the Xupdate/Xloop/Xtail subs, emit one scalar round of the form used
# with K_00_19.  The message word (already summed with K) is read from the
# transfer area at 4*($j&15)(%rsp).  The first snippet also binds
# ($a..$e) from @V; the last one advances $j and rotates @V and @T.
#
# $a is rotated left by 5 in place after its original value has been
# copied to @T[1], so in every round but the very first ($j==0) $b is
# rotated right by 7 instead of 2.
# NOTE(review): presumably 7 = 5 + 2 undoes the earlier in-place rotate
# before applying the ">>>2" -- confirm.
586sub body_00_19 () {
587	(
588	'($a,$b,$c,$d,$e)=@V;'.
589	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
590	'&xor	($c,$d);',
591	'&mov	(@T[1],$a);',	# $b in next round
592	'&$_rol	($a,5);',
593	'&and	(@T[0],$c);',	# ($b&($c^$d))
594	'&xor	($c,$d);',	# restore $c
595	'&xor	(@T[0],$d);',
596	'&add	($e,$a);',
597	'&$_ror	($b,$j?7:2);',	# $b>>>2
598	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
599	);
600}
601
# Returns a list of eight Perl snippets which, when eval'ed in order, emit
# one scalar round using the XOR function ($b^$d^$c, accumulated in @T[0]).
# The message word (already summed with K) is read from the transfer area
# at 4*($j&15)(%rsp); $j is post-incremented by the first snippet and the
# last one rotates @V and @T.
602sub body_20_39 () {
603	(
604	'($a,$b,$c,$d,$e)=@V;'.
605	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
606	'&xor	(@T[0],$d);',	# ($b^$d)
607	'&mov	(@T[1],$a);',	# $b in next round
608	'&$_rol	($a,5);',
609	'&xor	(@T[0],$c);',	# ($b^$d^$c)
610	'&add	($e,$a);',
611	'&$_ror	($b,7);',	# $b>>>2
612	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
613	);
614}
615
# Returns a list of twelve Perl snippets which, when eval'ed in order, emit
# one scalar round whose function is added to $e as two terms: $c&$d (in
# @T[1]) and $b&($c^$d) (in @T[0]).  The message word (already summed with
# K) is read from the transfer area at 4*($j&15)(%rsp); $j is
# post-incremented there and the last snippet rotates @V and @T.
616sub body_40_59 () {
617	(
618	'($a,$b,$c,$d,$e)=@V;'.
619	'&mov	(@T[1],$c);',
620	'&xor	($c,$d);',
621	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
622	'&and	(@T[1],$d);',
623	'&and	(@T[0],$c);',	# ($b&($c^$d))
624	'&$_ror	($b,7);',	# $b>>>2
625	'&add	($e,@T[1]);',
626	'&mov	(@T[1],$a);',	# $b in next round
627	'&$_rol	($a,5);',
628	'&add	($e,@T[0]);',
629	'&xor	($c,$d);',	# restore $c
630	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
631	);
632}
# Main SSSE3 loop.  Each generator call below emits four scalar rounds, so
# the 17 calls up to and including Xuplast_ssse3_80 cover rounds 0..67.
633$code.=<<___;
634.align	16
635.Loop_ssse3:
636___
637	&Xupdate_ssse3_16_31(\&body_00_19);
638	&Xupdate_ssse3_16_31(\&body_00_19);
639	&Xupdate_ssse3_16_31(\&body_00_19);
640	&Xupdate_ssse3_16_31(\&body_00_19);
641	&Xupdate_ssse3_32_79(\&body_00_19);
642	&Xupdate_ssse3_32_79(\&body_20_39);
643	&Xupdate_ssse3_32_79(\&body_20_39);
644	&Xupdate_ssse3_32_79(\&body_20_39);
645	&Xupdate_ssse3_32_79(\&body_20_39);
646	&Xupdate_ssse3_32_79(\&body_20_39);
647	&Xupdate_ssse3_32_79(\&body_40_59);
648	&Xupdate_ssse3_32_79(\&body_40_59);
649	&Xupdate_ssse3_32_79(\&body_40_59);
650	&Xupdate_ssse3_32_79(\&body_40_59);
651	&Xupdate_ssse3_32_79(\&body_40_59);
652	&Xupdate_ssse3_32_79(\&body_20_39);
653	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
654
				# Snapshot the generator state: the last 12 rounds are
				# emitted twice (loop continuation and .Ldone tail) and
				# both copies must start from the same $j and @V.
655				$saved_j=$j; @saved_V=@V;
656
657	&Xloop_ssse3(\&body_20_39);
658	&Xloop_ssse3(\&body_20_39);
659	&Xloop_ssse3(\&body_20_39);
660
# Loop continuation: fold the working variables into the context and go
# round again for the next block, whose input was prepared by Xloop_ssse3.
661$code.=<<___;
662	add	0($ctx),$A			# update context
663	add	4($ctx),@T[0]
664	add	8($ctx),$C
665	add	12($ctx),$D
666	mov	$A,0($ctx)
667	add	16($ctx),$E
668	mov	@T[0],4($ctx)
669	mov	@T[0],$B			# magic seed
670	mov	$C,8($ctx)
671	mov	$D,12($ctx)
672	mov	$E,16($ctx)
673	jmp	.Loop_ssse3
674
675.align	16
676.Ldone_ssse3:
677___
				# Rewind to the snapshot taken after Xuplast_ssse3_80.
678				$j=$saved_j; @V=@saved_V;
679
680	&Xtail_ssse3(\&body_20_39);
681	&Xtail_ssse3(\&body_20_39);
682	&Xtail_ssse3(\&body_20_39);
683
# Final context update after the last block.
684$code.=<<___;
685	add	0($ctx),$A			# update context
686	add	4($ctx),@T[0]
687	add	8($ctx),$C
688	mov	$A,0($ctx)
689	add	12($ctx),$D
690	mov	@T[0],4($ctx)
691	add	16($ctx),$E
692	mov	$C,8($ctx)
693	mov	$D,12($ctx)
694	mov	$E,16($ctx)
695___
# Win64 only: restore the XMM registers saved in the prologue.
696$code.=<<___ if ($win64);
697	movaps	64+0(%rsp),%xmm6
698	movaps	64+16(%rsp),%xmm7
699	movaps	64+32(%rsp),%xmm8
700	movaps	64+48(%rsp),%xmm9
701	movaps	64+64(%rsp),%xmm10
702___
# Epilogue: release the stack frame and restore %r12/%rbp/%rbx.
703$code.=<<___;
704	lea	`64+($win64?5*16:0)`(%rsp),%rsi
705	mov	0(%rsi),%r12
706	mov	8(%rsi),%rbp
707	mov	16(%rsi),%rbx
708	lea	24(%rsi),%rsp
709.Lepilogue_ssse3:
710	ret
711.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
712___
713
714if ($avx) {
# Generator state for the AVX code path; same roles as in the SSSE3 path
# ($Xi: next message vector index, @X/@Tx: rotating XMM registers, @V/@T:
# rotating scalar registers, $j: round counter, $K_XX_XX: table pointer).
715my $Xi=4;
716my @X=map("%xmm$_",(4..7,0..3));
717my @Tx=map("%xmm$_",(8..10));
718my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
719my @T=("%esi","%edi");
720my $j=0;
721my $K_XX_XX="%r11";
722
# Rotate emitters called from the body_XX_XX snippets; in this path they
# emit shld/shrd with the register used as both operands.
723my $_rol=sub { &shld(@_[0],@_) };
724my $_ror=sub { &shrd(@_[0],@_) };
725
# Function prologue: save %rbx/%rbp/%r12 and reserve 64 bytes of stack for
# the X[]+K transfer area, plus room for five XMM registers on Win64.
726$code.=<<___;
727.type	sha1_block_data_order_avx,\@function,3
728.align	16
729sha1_block_data_order_avx:
730_avx_shortcut:
731	push	%rbx
732	push	%rbp
733	push	%r12
734	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
735___
# Win64 only: preserve %xmm6-%xmm10 above the transfer area.  The
# .Lprologue_avx label is referenced by the SEH handler data.
736$code.=<<___ if ($win64);
737	movaps	%xmm6,64+0(%rsp)
738	movaps	%xmm7,64+16(%rsp)
739	movaps	%xmm8,64+32(%rsp)
740	movaps	%xmm9,64+48(%rsp)
741	movaps	%xmm10,64+64(%rsp)
742.Lprologue_avx:
743___
# Turn the block count into an end-of-input pointer, load the state, then
# load and byte-swap the first block and store the first three vectors
# with K_00_19 added into the transfer area.
#
# On Win64 %xmm6-%xmm15 are callee-saved but only %xmm6-%xmm10 are saved
# above, so vzeroall (which clears every vector register) would destroy the
# caller's %xmm11-%xmm15.  Win64 builds therefore emit vzeroupper, which
# leaves the low 128 bits intact; other targets keep vzeroall.
744$code.=<<___;
745	mov	%rdi,$ctx	# reassigned argument
746	mov	%rsi,$inp	# reassigned argument
747	mov	%rdx,$num	# reassigned argument
748	`$win64?"vzeroupper":"vzeroall"`
749
750	shl	\$6,$num
751	add	$inp,$num
752	lea	K_XX_XX(%rip),$K_XX_XX
753
754	mov	0($ctx),$A		# load context
755	mov	4($ctx),$B
756	mov	8($ctx),$C
757	mov	12($ctx),$D
758	mov	$B,@T[0]		# magic seed
759	mov	16($ctx),$E
760
761	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
762	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
763	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
764	vmovdqu	16($inp),@X[-3&7]
765	vmovdqu	32($inp),@X[-2&7]
766	vmovdqu	48($inp),@X[-1&7]
767	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
768	add	\$64,$inp
769	vpshufb	@X[2],@X[-3&7],@X[-3&7]
770	vpshufb	@X[2],@X[-2&7],@X[-2&7]
771	vpshufb	@X[2],@X[-1&7],@X[-1&7]
772	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
773	vpaddd	@Tx[1],@X[-3&7],@X[1]
774	vpaddd	@Tx[1],@X[-2&7],@X[2]
775	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
776	vmovdqa	@X[1],16(%rsp)
777	vmovdqa	@X[2],32(%rsp)
778	jmp	.Loop_avx
779___
780
# Xupdate_avx_16_31($body)
#
# AVX counterpart of the early message-schedule step: emits the
# three-operand SIMD instructions that produce one new 4-word message
# vector, interleaved with four scalar rounds taken from $body (a reference
# to a body_XX_XX sub).  The snippets are eval'ed one at a time, so the
# placement of each eval() fixes the scheduling of the generated code.
#
# Side effects: stores the previous vector plus K into the transfer area on
# the stack, preloads the constant for the following step from the table at
# offset 16*($Xi/5), increments $Xi and rotates @X and @Tx.
781sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
782{ use integer;
783  my $body = shift;
784  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
785  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
786
787	 eval(shift(@insns));
788	 eval(shift(@insns));
789	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
790	 eval(shift(@insns));
791	 eval(shift(@insns));
792
793	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
794	 eval(shift(@insns));
795	 eval(shift(@insns));
796	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
797	 eval(shift(@insns));
798	 eval(shift(@insns));
799	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
800	 eval(shift(@insns));
801	 eval(shift(@insns));
802
803	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
804	 eval(shift(@insns));
805	 eval(shift(@insns));
806	 eval(shift(@insns));
807	 eval(shift(@insns));
808
809	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
810	 eval(shift(@insns));
811	 eval(shift(@insns));
812	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
813	 eval(shift(@insns));
814	 eval(shift(@insns));
815
816	&vpsrld	(@Tx[0],@X[0],31);
817	 eval(shift(@insns));
818	 eval(shift(@insns));
819	 eval(shift(@insns));
820	 eval(shift(@insns));
821
822	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
823	&vpaddd	(@X[0],@X[0],@X[0]);
824	 eval(shift(@insns));
825	 eval(shift(@insns));
826	 eval(shift(@insns));
827	 eval(shift(@insns));
828
829	&vpsrld	(@Tx[1],@Tx[2],30);
830	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
831	 eval(shift(@insns));
832	 eval(shift(@insns));
833	 eval(shift(@insns));
834	 eval(shift(@insns));
835
836	&vpslld	(@Tx[2],@Tx[2],2);
837	&vpxor	(@X[0],@X[0],@Tx[1]);
838	 eval(shift(@insns));
839	 eval(shift(@insns));
840	 eval(shift(@insns));
841	 eval(shift(@insns));
842
843	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
844	 eval(shift(@insns));
845	 eval(shift(@insns));
846	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
847	 eval(shift(@insns));
848	 eval(shift(@insns));
849
850
851	 foreach (@insns) { eval; }	# remaining instructions [if any]
852
853  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
854		push(@Tx,shift(@Tx));
855}
856
# Xupdate_avx_32_79($body)
#
# AVX counterpart of the later message-schedule step (the form that rotates
# by 2): emits the SIMD instructions for one new 4-word message vector,
# interleaved with four scalar rounds taken from $body.  The snippets are
# eval'ed one at a time, so the placement of each eval() fixes the
# scheduling of the generated code.
#
# The round constant is carried over from the previous step unless $Xi is a
# multiple of 5, in which case the next one is loaded from the table.  The
# copy of the new vector into @Tx[1] is skipped once $Xi reaches 19.
# Increments $Xi and rotates @X and @Tx on return.
857sub Xupdate_avx_32_79()
858{ use integer;
859  my $body = shift;
860  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
861  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
862
863	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
864	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
865	 eval(shift(@insns));		# body_20_39
866	 eval(shift(@insns));
867	 eval(shift(@insns));
868	 eval(shift(@insns));		# rol
869
870	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
871	 eval(shift(@insns));
	 # emit one more scalar instruction here unless it is a rotate
872	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
873	if ($Xi%5) {
874	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
875	} else {			# ... or load next one
876	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
877	}
878	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
879	 eval(shift(@insns));		# ror
880	 eval(shift(@insns));
881
882	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
883	 eval(shift(@insns));		# body_20_39
884	 eval(shift(@insns));
885	 eval(shift(@insns));
886	 eval(shift(@insns));		# rol
887
888	&vpsrld	(@Tx[0],@X[0],30);
889	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
890	 eval(shift(@insns));
891	 eval(shift(@insns));
892	 eval(shift(@insns));		# ror
893	 eval(shift(@insns));
894
895	&vpslld	(@X[0],@X[0],2);
896	 eval(shift(@insns));		# body_20_39
897	 eval(shift(@insns));
898	 eval(shift(@insns));
899	 eval(shift(@insns));		# rol
900	 eval(shift(@insns));
901	 eval(shift(@insns));
902	 eval(shift(@insns));		# ror
903	 eval(shift(@insns));
904
905	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
906	 eval(shift(@insns));		# body_20_39
907	 eval(shift(@insns));
908	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
909	 eval(shift(@insns));
910	 eval(shift(@insns));		# rol
911	 eval(shift(@insns));
912	 eval(shift(@insns));
913	 eval(shift(@insns));		# rol
914	 eval(shift(@insns));
915
916	 foreach (@insns) { eval; }	# remaining instructions
917
918  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
919		push(@Tx,shift(@Tx));
920}
921
# Xuplast_avx_80($body)
#
# Emits the last message-schedule step of a block for the AVX path: adds K
# to the final vector and stores it in the transfer area, interleaved with
# four scalar rounds from $body.  It then emits the end-of-input test
# (compare $inp with $num, jump to .Ldone_avx when equal) followed by the
# start of the next block's processing: reload the byte-swap mask and
# K_00_19, load the next 64 input bytes, byte-swap the first vector and
# advance $inp.  @Tx is rotated back by one position and $Xi is reset to 0
# for the Xloop_avx calls that follow.
#
# The transfer-area store uses vmovdqa like every other store in this code
# path, so that no legacy-SSE encoded instruction is mixed into the
# VEX-encoded stream.
922sub Xuplast_avx_80()
923{ use integer;
924  my $body = shift;
925  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
926  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
927
928	 eval(shift(@insns));
929	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
930	 eval(shift(@insns));
931	 eval(shift(@insns));
932	 eval(shift(@insns));
933	 eval(shift(@insns));
934
935	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
936
937	 foreach (@insns) { eval; }		# remaining instructions
938
939	&cmp	($inp,$num);
940	&je	(".Ldone_avx");
941
942	unshift(@Tx,pop(@Tx));
943
944	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
945	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
946	&vmovdqu(@X[-4&7],"0($inp)");		# load input
947	&vmovdqu(@X[-3&7],"16($inp)");
948	&vmovdqu(@X[-2&7],"32($inp)");
949	&vmovdqu(@X[-1&7],"48($inp)");
950	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
951	&add	($inp,64);
952
953  $Xi=0;
954}
955
# Xloop_avx($body)
#
# Emits four scalar rounds from $body interleaved with preparation of the
# next block's input: byte-swap the following input vector, compute the
# current one plus K into a separate register (@X[$Xi&7]) and store that
# sum at 16*$Xi(%rsp) in the transfer area.  Increments $Xi.
956sub Xloop_avx()
957{ use integer;
958  my $body = shift;
959  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
960  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
961
962	 eval(shift(@insns));
963	 eval(shift(@insns));
964	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
965	 eval(shift(@insns));
966	 eval(shift(@insns));
967	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
968	 eval(shift(@insns));
969	 eval(shift(@insns));
970	 eval(shift(@insns));
971	 eval(shift(@insns));
972	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
973	 eval(shift(@insns));
974	 eval(shift(@insns));
975
976	foreach (@insns) { eval; }
977  $Xi++;
978}
979
# Xtail_avx($body)
#
# Emits four scalar rounds from $body with no SIMD work interleaved; used
# for the final rounds after the last input block.
980sub Xtail_avx()
981{ use integer;
982  my $body = shift;
983  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
984  my ($a,$b,$c,$d,$e);	# assigned by the eval'ed snippets
985
986	foreach (@insns) { eval; }
987}
988
# Main AVX loop.  Each generator call below emits four scalar rounds, so
# the 17 calls up to and including Xuplast_avx_80 cover rounds 0..67.
989$code.=<<___;
990.align	16
991.Loop_avx:
992___
993	&Xupdate_avx_16_31(\&body_00_19);
994	&Xupdate_avx_16_31(\&body_00_19);
995	&Xupdate_avx_16_31(\&body_00_19);
996	&Xupdate_avx_16_31(\&body_00_19);
997	&Xupdate_avx_32_79(\&body_00_19);
998	&Xupdate_avx_32_79(\&body_20_39);
999	&Xupdate_avx_32_79(\&body_20_39);
1000	&Xupdate_avx_32_79(\&body_20_39);
1001	&Xupdate_avx_32_79(\&body_20_39);
1002	&Xupdate_avx_32_79(\&body_20_39);
1003	&Xupdate_avx_32_79(\&body_40_59);
1004	&Xupdate_avx_32_79(\&body_40_59);
1005	&Xupdate_avx_32_79(\&body_40_59);
1006	&Xupdate_avx_32_79(\&body_40_59);
1007	&Xupdate_avx_32_79(\&body_40_59);
1008	&Xupdate_avx_32_79(\&body_20_39);
1009	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
1010
				# Snapshot the generator state: the last 12 rounds are
				# emitted twice (loop continuation and .Ldone tail) and
				# both copies must start from the same $j and @V.
1011				$saved_j=$j; @saved_V=@V;
1012
1013	&Xloop_avx(\&body_20_39);
1014	&Xloop_avx(\&body_20_39);
1015	&Xloop_avx(\&body_20_39);
1016
# Loop continuation: fold the working variables into the context and go
# round again for the next block, whose input was prepared by Xloop_avx.
1017$code.=<<___;
1018	add	0($ctx),$A			# update context
1019	add	4($ctx),@T[0]
1020	add	8($ctx),$C
1021	add	12($ctx),$D
1022	mov	$A,0($ctx)
1023	add	16($ctx),$E
1024	mov	@T[0],4($ctx)
1025	mov	@T[0],$B			# magic seed
1026	mov	$C,8($ctx)
1027	mov	$D,12($ctx)
1028	mov	$E,16($ctx)
1029	jmp	.Loop_avx
1030
1031.align	16
1032.Ldone_avx:
1033___
				# Rewind to the snapshot taken after Xuplast_avx_80.
1034				$j=$saved_j; @V=@saved_V;
1035
1036	&Xtail_avx(\&body_20_39);
1037	&Xtail_avx(\&body_20_39);
1038	&Xtail_avx(\&body_20_39);
1039
# Final context update after the last block.
#
# On Win64 %xmm6-%xmm15 are callee-saved but only %xmm6-%xmm10 are saved
# and restored by this function, so vzeroall (which clears every vector
# register) would return with the caller's %xmm11-%xmm15 destroyed.  Win64
# builds therefore emit vzeroupper; other targets keep vzeroall.
1040$code.=<<___;
1041	`$win64?"vzeroupper":"vzeroall"`
1042
1043	add	0($ctx),$A			# update context
1044	add	4($ctx),@T[0]
1045	add	8($ctx),$C
1046	mov	$A,0($ctx)
1047	add	12($ctx),$D
1048	mov	@T[0],4($ctx)
1049	add	16($ctx),$E
1050	mov	$C,8($ctx)
1051	mov	$D,12($ctx)
1052	mov	$E,16($ctx)
1053___
# Win64 only: restore the XMM registers saved in the prologue.
1054$code.=<<___ if ($win64);
1055	movaps	64+0(%rsp),%xmm6
1056	movaps	64+16(%rsp),%xmm7
1057	movaps	64+32(%rsp),%xmm8
1058	movaps	64+48(%rsp),%xmm9
1059	movaps	64+64(%rsp),%xmm10
1060___
# Epilogue: release the stack frame and restore %r12/%rbp/%rbx.
1061$code.=<<___;
1062	lea	`64+($win64?5*16:0)`(%rsp),%rsi
1063	mov	0(%rsi),%r12
1064	mov	8(%rsi),%rbp
1065	mov	16(%rsi),%rbx
1066	lea	24(%rsi),%rsp
1067.Lepilogue_avx:
1068	ret
1069.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
1070___
1071}
# Constant table shared by the SIMD code paths: the four round constants,
# each replicated across a 16-byte vector (entry n at offset 16*n), followed
# at offset 64 by the shuffle mask the code uses to byte-swap input words.
1072$code.=<<___;
1073.align	64
1074K_XX_XX:
1075.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1076.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1077.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1078.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1079.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1080___
1081}}}
# Identification string embedded in the generated object, followed by
# alignment padding.
1082$code.=<<___;
1083.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1084.align	64
1085___
1086
# Win64 structured exception handling support, emitted only for Win64
# targets.  Two language-specific handlers are generated:
#   se_handler    - for the integer-only function; uses the fixed labels
#                   .Lprologue/.Lepilogue to decide whether the frame is set
#                   up, then recovers the saved stack pointer and
#                   %rbx/%rbp/%r12/%r13.
#   ssse3_handler - shared by the SSSE3 and AVX functions; takes the
#                   prologue/epilogue label addresses from HandlerData[],
#                   copies the five saved XMM registers back into the
#                   context and recovers %rbx/%rbp/%r12.
# Both end in .Lcommon_seh_tail, which updates Rsp/Rsi/Rdi in the context,
# copies the context to disp->ContextRecord, calls RtlVirtualUnwind and
# returns ExceptionContinueSearch.  The .pdata/.xdata sections that follow
# register the handlers for each generated function.
1087# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1088#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1089if ($win64) {
# Registers in which the four handler arguments arrive.
1090$rec="%rcx";
1091$frame="%rdx";
1092$context="%r8";
1093$disp="%r9";
1094
1095$code.=<<___;
1096.extern	__imp_RtlVirtualUnwind
1097.type	se_handler,\@abi-omnipotent
1098.align	16
1099se_handler:
1100	push	%rsi
1101	push	%rdi
1102	push	%rbx
1103	push	%rbp
1104	push	%r12
1105	push	%r13
1106	push	%r14
1107	push	%r15
1108	pushfq
1109	sub	\$64,%rsp
1110
1111	mov	120($context),%rax	# pull context->Rax
1112	mov	248($context),%rbx	# pull context->Rip
1113
1114	lea	.Lprologue(%rip),%r10
1115	cmp	%r10,%rbx		# context->Rip<.Lprologue
1116	jb	.Lcommon_seh_tail
1117
1118	mov	152($context),%rax	# pull context->Rsp
1119
1120	lea	.Lepilogue(%rip),%r10
1121	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1122	jae	.Lcommon_seh_tail
1123
1124	mov	`16*4`(%rax),%rax	# pull saved stack pointer
1125	lea	32(%rax),%rax
1126
1127	mov	-8(%rax),%rbx
1128	mov	-16(%rax),%rbp
1129	mov	-24(%rax),%r12
1130	mov	-32(%rax),%r13
1131	mov	%rbx,144($context)	# restore context->Rbx
1132	mov	%rbp,160($context)	# restore context->Rbp
1133	mov	%r12,216($context)	# restore context->R12
1134	mov	%r13,224($context)	# restore context->R13
1135
1136	jmp	.Lcommon_seh_tail
1137.size	se_handler,.-se_handler
1138
1139.type	ssse3_handler,\@abi-omnipotent
1140.align	16
1141ssse3_handler:
1142	push	%rsi
1143	push	%rdi
1144	push	%rbx
1145	push	%rbp
1146	push	%r12
1147	push	%r13
1148	push	%r14
1149	push	%r15
1150	pushfq
1151	sub	\$64,%rsp
1152
1153	mov	120($context),%rax	# pull context->Rax
1154	mov	248($context),%rbx	# pull context->Rip
1155
1156	mov	8($disp),%rsi		# disp->ImageBase
1157	mov	56($disp),%r11		# disp->HandlerData
1158
1159	mov	0(%r11),%r10d		# HandlerData[0]
1160	lea	(%rsi,%r10),%r10	# prologue label
1161	cmp	%r10,%rbx		# context->Rip<prologue label
1162	jb	.Lcommon_seh_tail
1163
1164	mov	152($context),%rax	# pull context->Rsp
1165
1166	mov	4(%r11),%r10d		# HandlerData[1]
1167	lea	(%rsi,%r10),%r10	# epilogue label
1168	cmp	%r10,%rbx		# context->Rip>=epilogue label
1169	jae	.Lcommon_seh_tail
1170
1171	lea	64(%rax),%rsi
1172	lea	512($context),%rdi	# &context.Xmm6
1173	mov	\$10,%ecx
1174	.long	0xa548f3fc		# cld; rep movsq
1175	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer
1176
1177	mov	-8(%rax),%rbx
1178	mov	-16(%rax),%rbp
1179	mov	-24(%rax),%r12
1180	mov	%rbx,144($context)	# restore context->Rbx
1181	mov	%rbp,160($context)	# restore context->Rbp
1182	mov	%r12,216($context)	# restore cotnext->R12
1183
1184.Lcommon_seh_tail:
1185	mov	8(%rax),%rdi
1186	mov	16(%rax),%rsi
1187	mov	%rax,152($context)	# restore context->Rsp
1188	mov	%rsi,168($context)	# restore context->Rsi
1189	mov	%rdi,176($context)	# restore context->Rdi
1190
1191	mov	40($disp),%rdi		# disp->ContextRecord
1192	mov	$context,%rsi		# context
1193	mov	\$154,%ecx		# sizeof(CONTEXT)
1194	.long	0xa548f3fc		# cld; rep movsq
1195
1196	mov	$disp,%rsi
1197	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1198	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1199	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1200	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1201	mov	40(%rsi),%r10		# disp->ContextRecord
1202	lea	56(%rsi),%r11		# &disp->HandlerData
1203	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1204	mov	%r10,32(%rsp)		# arg5
1205	mov	%r11,40(%rsp)		# arg6
1206	mov	%r12,48(%rsp)		# arg7
1207	mov	%rcx,56(%rsp)		# arg8, (NULL)
1208	call	*__imp_RtlVirtualUnwind(%rip)
1209
1210	mov	\$1,%eax		# ExceptionContinueSearch
1211	add	\$64,%rsp
1212	popfq
1213	pop	%r15
1214	pop	%r14
1215	pop	%r13
1216	pop	%r12
1217	pop	%rbp
1218	pop	%rbx
1219	pop	%rdi
1220	pop	%rsi
1221	ret
1222.size	ssse3_handler,.-ssse3_handler
1223
1224.section	.pdata
1225.align	4
1226	.rva	.LSEH_begin_sha1_block_data_order
1227	.rva	.LSEH_end_sha1_block_data_order
1228	.rva	.LSEH_info_sha1_block_data_order
1229	.rva	.LSEH_begin_sha1_block_data_order_ssse3
1230	.rva	.LSEH_end_sha1_block_data_order_ssse3
1231	.rva	.LSEH_info_sha1_block_data_order_ssse3
1232___
# The AVX function gets a .pdata entry only when it was generated.
1233$code.=<<___ if ($avx);
1234	.rva	.LSEH_begin_sha1_block_data_order_avx
1235	.rva	.LSEH_end_sha1_block_data_order_avx
1236	.rva	.LSEH_info_sha1_block_data_order_avx
1237___
# Unwind info records: each names its handler; the SIMD variants also carry
# the prologue/epilogue label pair read by ssse3_handler as HandlerData[].
1238$code.=<<___;
1239.section	.xdata
1240.align	8
1241.LSEH_info_sha1_block_data_order:
1242	.byte	9,0,0,0
1243	.rva	se_handler
1244.LSEH_info_sha1_block_data_order_ssse3:
1245	.byte	9,0,0,0
1246	.rva	ssse3_handler
1247	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
1248___
1249$code.=<<___ if ($avx);
1250.LSEH_info_sha1_block_data_order_avx:
1251	.byte	9,0,0,0
1252	.rva	ssse3_handler
1253	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1254___
1255}
1256
1257####################################################################
1258
# Expand every `...` span in the accumulated assembly by evaluating its
# contents as Perl (used for constant arithmetic in operands), then write
# the result to STDOUT, which is the pipe into the translator.
1259$code =~ s/\`([^\`]*)\`/eval $1/gem;
1260print $code;
# Closing a pipe waits for the child and reports its failure, so check the
# result: an unnoticed translator error would leave a broken output file.
1261close STDOUT or die "error closing STDOUT: $!";
1262