1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# sha256/512_block procedure for x86_64.
10#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just straight implementation... I really wonder why
# gcc [being armed with inline assembler] fails to generate code this
# fast. The only thing which is cool about this module is that the
# very same instruction sequence is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands,
# while in the latter on 64-bit ones. All I had to do was get one
# flavor right, the other one passed the test right away:-)
20#
21# sha256_block runs in ~1005 cycles on Opteron, which gives you
22# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23# frequency in GHz. sha512_block runs in ~1275 cycles, which results
24# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], approaches 4 instructions per CPU
# clock cycle and runs in 1003 cycles, 1275 is a very good result for
# the 3-way issue Opteron pipeline with X[16] maintained in memory. So
# *if* there is a way to improve it, *then* the only way would be to
# try to offload the X[16] updates to the SSE unit, but that would
# require a "deeper" loop unroll, which in turn would naturally cause
# size blow-up, not to mention increased complexity! And once again,
# only *if* it's actually possible to noticeably improve overall ILP,
# instruction-level parallelism, on the given CPU implementation.
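# [Spelling the arithmetic above out: cycles per processed byte is
#  simply cycles/block over bytes/block, i.e. 1005/64=~15.7 cpb for
#  sha256_block and 1275/128=~10.0 cpb for sha512_block, and the MBps
#  figure is its reciprocal scaled by clock frequency, so e.g. a
#  hypothetical 2.5GHz Opteron would process SHA-512 input at roughly
#  100*2.5=~250MBps.]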
35#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs apparently are far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are not atomic instructions, but are implemented in
# microcode.
42#
43# May 2012.
44#
# Optimization, including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in a >=5% improvement on most CPUs, +20% for SHA256 and
# unfortunately -2% for SHA512 on P4 [which nobody should care about
# that much].
49#
50# June 2012.
51#
# Add SIMD code paths, see below for improvement coefficients. The SSSE3
# code path was not attempted for SHA512, because the improvement is not
# estimated to be high enough, noticeably less than 9%, to justify the
# effort, at least not on pre-AVX processors. [Obviously with the
# exclusion of VIA Nano, but it has a SHA512 instruction that is faster
# and should be used instead.] For reference, the corresponding
# estimated upper limit for improvement for SSSE3 SHA256 is 28%. The
# fact that higher coefficients are observed on VIA Nano and Bulldozer
# has more to do with the specifics of their architecture [which is a
# topic for separate discussion].
62#
63# November 2012.
64#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm registers as operands.
# A side effect is an increased stack frame, 448 additional bytes for
# SHA256 and 1152 for SHA512, plus a 1.2KB code size increase.
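# [Schematically, for two consecutive blocks b0 and b1 (b1 sitting at
#  $inp+16*$SZ):
#
#	%ymm_i = { b1 bytes 16*i..16*i+15 | b0 bytes 16*i..16*i+15 }
#		    upper 128-bit lane      lower 128-bit lane
#
#  which is also why every row of the K256/K512 tables below is stored
#  twice: a single vpaddd/vpaddq then adds the round constants for both
#  blocks at once.]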
72#
73# March 2014.
74#
75# Add support for Intel SHA Extensions.
76
77######################################################################
78# Current performance in cycles per processed byte (less is better):
79#
80#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
81#
82# AMD K8	14.9	-	    -		    9.57    -
83# P4		17.3	-	    -		    30.8    -
84# Core 2	15.6	13.8(+13%)  -		    9.97    -
85# Westmere	14.8	12.3(+19%)  -		    9.58    -
86# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
87# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
88# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
89# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
90# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
91# Atom		23.0	18.9(+22%)  -		    14.7    -
92#
# (*)	whichever is best applicable;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below a certain limit makes no difference/sense; to conserve
#	space the SHA256 XOP code path is therefore omitted;
99
100$flavour = shift;
101$output  = shift;
102if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
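# The generated flavour is selected by the output file name: anything
# matching /512/ below yields sha512_block_data_order, everything else
# sha256_block_data_order. A typical [hypothetical] invocation would be
# "perl sha512-x86_64.pl elf sha512-x86_64.s".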
103
104$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
105
106$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
107( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
108( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
109die "can't locate x86_64-xlate.pl";
110
111if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
112		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
113	$avx = ($1>=2.19) + ($1>=2.22);
114}
115
116if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
117	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
118	$avx = ($1>=2.09) + ($1>=2.10);
119}
120
121if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
122	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
123	$avx = ($1>=10) + ($1>=11);
124}
125
126if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
127	$avx = ($2>=3.0) + ($2>3.0);
128}
129
130$shaext=0;	### set to zero if compiling for 1.0.1
131$avx=1		if (!$shaext && $avx);
132
133open OUT,"| \"$^X\" $xlate $flavour";
134*STDOUT=*OUT;
135
136if ($output =~ /512/) {
137	$func="sha512_block_data_order";
138	$TABLE="K512";
139	$SZ=8;
140	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
141					"%r8", "%r9", "%r10","%r11");
142	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
143	@Sigma0=(28,34,39);
144	@Sigma1=(14,18,41);
145	@sigma0=(1,  8, 7);
146	@sigma1=(19,61, 6);
147	$rounds=80;
148} else {
149	$func="sha256_block_data_order";
150	$TABLE="K256";
151	$SZ=4;
152	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
153					"%r8d","%r9d","%r10d","%r11d");
154	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
155	@Sigma0=( 2,13,22);
156	@Sigma1=( 6,11,25);
157	@sigma0=( 7,18, 3);
158	@sigma1=(17,19,10);
159	$rounds=64;
160}
161
162$ctx="%rdi";	# 1st arg, zapped by $a3
163$inp="%rsi";	# 2nd arg
164$Tbl="%rbp";
165
166$_ctx="16*$SZ+0*8(%rsp)";
167$_inp="16*$SZ+1*8(%rsp)";
168$_end="16*$SZ+2*8(%rsp)";
169$_rsp="16*$SZ+3*8(%rsp)";
170$framesz="16*$SZ+4*8";
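# The integer code path keeps the following on the stack (a sketch;
# offsets are relative to the aligned %rsp set up in the prologue):
#
#	0		X[0..15], 16*$SZ bytes of message schedule
#	16*$SZ+0*8	saved $ctx (1st arg)
#	16*$SZ+1*8	saved $inp (2nd arg)
#	16*$SZ+2*8	end-of-input pointer
#	16*$SZ+3*8	saved %rsp (as copied to %r11 after the pushes)
#
# (the SIMD paths below extend this with %xmm save areas on Win64)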
171
172
173sub ROUND_00_15()
174{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
175  my $STRIDE=$SZ;
176     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
177
178$code.=<<___;
179	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
180	mov	$f,$a2
181
182	xor	$e,$a0
183	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
184	xor	$g,$a2			# f^g
185
186	mov	$T1,`$SZ*($i&0xf)`(%rsp)
187	xor	$a,$a1
188	and	$e,$a2			# (f^g)&e
189
190	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
191	add	$h,$T1			# T1+=h
192	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
193
194	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
195	xor	$e,$a0
196	add	$a2,$T1			# T1+=Ch(e,f,g)
197
198	mov	$a,$a2
199	add	($Tbl),$T1		# T1+=K[round]
200	xor	$a,$a1
201
202	xor	$b,$a2			# a^b, b^c in next round
203	ror	\$$Sigma1[0],$a0	# Sigma1(e)
204	mov	$b,$h
205
206	and	$a2,$a3
207	ror	\$$Sigma0[0],$a1	# Sigma0(a)
208	add	$a0,$T1			# T1+=Sigma1(e)
209
210	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
211	add	$T1,$d			# d+=T1
212	add	$T1,$h			# h+=T1
213
214	lea	$STRIDE($Tbl),$Tbl	# round++
215___
216$code.=<<___ if ($i<15);
217	add	$a1,$h			# h+=Sigma0(a)
218___
219	($a2,$a3) = ($a3,$a2);
220}
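# The boolean identities used above are easy to lose track of in the
# interleaved schedule. Below is a minimal pure-Perl sketch of them,
# for illustration only: nothing in this script calls these, and the
# _ref_* names are made up.
sub _ref_Ch  { my ($e,$f,$g)=@_; (($f^$g)&$e)^$g }	# == (e&f)^(~e&g)
sub _ref_Maj { my ($a,$b,$c)=@_; (($a^$b)&($b^$c))^$b }	# == (a&b)^(a&c)^(b&c), i.e. Ch(a^b,c,b)
# Computing Maj as Ch(a^b,c,b) is what allows the a^b/b^c value to be
# carried over to the next round in $a2/$a3.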
221
222sub ROUND_16_XX()
223{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
224
225$code.=<<___;
226	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
227	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
228
229	mov	$a0,$T1
230	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
231	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
232	mov	$a2,$a1
233	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
234
235	xor	$T1,$a0
236	shr	\$$sigma0[2],$T1
237	ror	\$$sigma0[0],$a0
238	xor	$a1,$a2
239	shr	\$$sigma1[2],$a1
240
241	ror	\$$sigma1[0],$a2
242	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
243	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
244	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
245
246	add	`$SZ*($i&0xf)`(%rsp),$T1
247	mov	$e,$a0
248	add	$a2,$T1
249	mov	$a,$a1
250___
251	&ROUND_00_15(@_);
252}
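# For reference, the message-schedule step implemented above, written as
# straight Perl for the 32-bit ($SZ==4) case. Again illustration only:
# nothing calls these, the _ref_* names are made up, and they rely on
# the @sigma0/@sigma1 constants selected above.
sub _ref_rotr32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
sub _ref_sigma0 { my ($x)=@_; _ref_rotr32($x,$sigma0[0]) ^ _ref_rotr32($x,$sigma0[1]) ^ ($x>>$sigma0[2]) }
sub _ref_sigma1 { my ($x)=@_; _ref_rotr32($x,$sigma1[0]) ^ _ref_rotr32($x,$sigma1[1]) ^ ($x>>$sigma1[2]) }
# so that each round 16 and up effectively does
#	X[i&15] = (X[i&15] + _ref_sigma0(X[(i+1)&15])
#			   + _ref_sigma1(X[(i+14)&15])
#			   + X[(i+9)&15]) & 0xffffffff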
253
254$code=<<___;
255.text
256
257.extern	OPENSSL_ia32cap_P
258.globl	$func
259.type	$func,\@function,3
260.align	16
261$func:
262___
263$code.=<<___ if ($SZ==4 || $avx);
264	lea	OPENSSL_ia32cap_P(%rip),%r11
265	mov	0(%r11),%r9d
266	mov	4(%r11),%r10d
267	mov	8(%r11),%r11d
268___
269$code.=<<___ if ($SZ==4 && $shaext);
270	test	\$`1<<29`,%r11d		# check for SHA
271	jnz	_shaext_shortcut
272___
273$code.=<<___ if ($avx && $SZ==8);
274	test	\$`1<<11`,%r10d		# check for XOP
275	jnz	.Lxop_shortcut
276___
277$code.=<<___ if ($avx>1);
278	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
279	cmp	\$`1<<8|1<<5|1<<3`,%r11d
280	je	.Lavx2_shortcut
281___
282$code.=<<___ if ($avx);
283	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
284	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
285	or	%r9d,%r10d
286	cmp	\$`1<<28|1<<9|1<<30`,%r10d
287	je	.Lavx_shortcut
288___
289$code.=<<___ if ($SZ==4);
290	test	\$`1<<9`,%r10d
291	jnz	.Lssse3_shortcut
292___
293$code.=<<___;
294	push	%rbx
295	push	%rbp
296	push	%r12
297	push	%r13
298	push	%r14
299	push	%r15
300	mov	%rsp,%r11		# copy %rsp
301	shl	\$4,%rdx		# num*16
302	sub	\$$framesz,%rsp
303	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
304	and	\$-64,%rsp		# align stack frame
305	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
307	mov	%rdx,$_end		# save end pointer, "3rd" arg
308	mov	%r11,$_rsp		# save copy of %rsp
309.Lprologue:
310
311	mov	$SZ*0($ctx),$A
312	mov	$SZ*1($ctx),$B
313	mov	$SZ*2($ctx),$C
314	mov	$SZ*3($ctx),$D
315	mov	$SZ*4($ctx),$E
316	mov	$SZ*5($ctx),$F
317	mov	$SZ*6($ctx),$G
318	mov	$SZ*7($ctx),$H
319	jmp	.Lloop
320
321.align	16
322.Lloop:
323	mov	$B,$a3
324	lea	$TABLE(%rip),$Tbl
325	xor	$C,$a3			# magic
326___
327	for($i=0;$i<16;$i++) {
328		$code.="	mov	$SZ*$i($inp),$T1\n";
329		$code.="	mov	@ROT[4],$a0\n";
330		$code.="	mov	@ROT[0],$a1\n";
331		$code.="	bswap	$T1\n";
332		&ROUND_00_15($i,@ROT);
333		unshift(@ROT,pop(@ROT));
334	}
335$code.=<<___;
336	jmp	.Lrounds_16_xx
337.align	16
338.Lrounds_16_xx:
339___
340	for(;$i<32;$i++) {
341		&ROUND_16_XX($i,@ROT);
342		unshift(@ROT,pop(@ROT));
343	}
344
345$code.=<<___;
346	cmpb	\$0,`$SZ-1`($Tbl)
347	jnz	.Lrounds_16_xx
348
349	mov	$_ctx,$ctx
350	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
351	lea	16*$SZ($inp),$inp
352
353	add	$SZ*0($ctx),$A
354	add	$SZ*1($ctx),$B
355	add	$SZ*2($ctx),$C
356	add	$SZ*3($ctx),$D
357	add	$SZ*4($ctx),$E
358	add	$SZ*5($ctx),$F
359	add	$SZ*6($ctx),$G
360	add	$SZ*7($ctx),$H
361
362	cmp	$_end,$inp
363
364	mov	$A,$SZ*0($ctx)
365	mov	$B,$SZ*1($ctx)
366	mov	$C,$SZ*2($ctx)
367	mov	$D,$SZ*3($ctx)
368	mov	$E,$SZ*4($ctx)
369	mov	$F,$SZ*5($ctx)
370	mov	$G,$SZ*6($ctx)
371	mov	$H,$SZ*7($ctx)
372	jb	.Lloop
373
374	mov	$_rsp,%rsi
375	mov	(%rsi),%r15
376	mov	8(%rsi),%r14
377	mov	16(%rsi),%r13
378	mov	24(%rsi),%r12
379	mov	32(%rsi),%rbp
380	mov	40(%rsi),%rbx
381	lea	48(%rsi),%rsp
382.Lepilogue:
383	ret
384.size	$func,.-$func
385___
386
387if ($SZ==4) {
388$code.=<<___;
389.align	64
390.type	$TABLE,\@object
391$TABLE:
392	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
393	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
394	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
395	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
396	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
397	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
398	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
399	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
400	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
401	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
402	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
403	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
404	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
405	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
406	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
407	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
408	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
409	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
410	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
411	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
412	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
413	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
414	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
415	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
416	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
417	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
418	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
419	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
420	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
421	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
422	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
423	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
424
425	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
426	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
427	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
428	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
429	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
430	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
431	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
432___
433} else {
434$code.=<<___;
435.align	64
436.type	$TABLE,\@object
437$TABLE:
438	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
439	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
440	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
441	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
442	.quad	0x3956c25bf348b538,0x59f111f1b605d019
443	.quad	0x3956c25bf348b538,0x59f111f1b605d019
444	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
445	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
446	.quad	0xd807aa98a3030242,0x12835b0145706fbe
447	.quad	0xd807aa98a3030242,0x12835b0145706fbe
448	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
449	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
450	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
451	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
452	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
453	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
454	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
455	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
456	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
457	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
458	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
459	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
460	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
461	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
462	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
463	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
464	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
465	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
466	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
467	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
468	.quad	0x06ca6351e003826f,0x142929670a0e6e70
469	.quad	0x06ca6351e003826f,0x142929670a0e6e70
470	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
471	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
472	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
473	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
474	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
475	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
476	.quad	0x81c2c92e47edaee6,0x92722c851482353b
477	.quad	0x81c2c92e47edaee6,0x92722c851482353b
478	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
479	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
480	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
481	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
482	.quad	0xd192e819d6ef5218,0xd69906245565a910
483	.quad	0xd192e819d6ef5218,0xd69906245565a910
484	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
485	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
486	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
487	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
488	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
489	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
490	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
491	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
492	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
493	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
494	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
495	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
496	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
497	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
498	.quad	0x90befffa23631e28,0xa4506cebde82bde9
499	.quad	0x90befffa23631e28,0xa4506cebde82bde9
500	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
501	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
502	.quad	0xca273eceea26619c,0xd186b8c721c0c207
503	.quad	0xca273eceea26619c,0xd186b8c721c0c207
504	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
505	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
506	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
507	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
508	.quad	0x113f9804bef90dae,0x1b710b35131c471b
509	.quad	0x113f9804bef90dae,0x1b710b35131c471b
510	.quad	0x28db77f523047d84,0x32caab7b40c72493
511	.quad	0x28db77f523047d84,0x32caab7b40c72493
512	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
513	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
514	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
515	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
516	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
517	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
518
519	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
520	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
521	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
522___
523}
524
525######################################################################
526# SIMD code paths
527#
528if ($SZ==4 && $shaext) {{{
529######################################################################
530# Intel SHA Extensions implementation of SHA256 update function.
531#
532my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
533
534my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
535my @MSG=map("%xmm$_",(3..6));
536
537$code.=<<___;
538.type	sha256_block_data_order_shaext,\@function,3
539.align	64
540sha256_block_data_order_shaext:
541_shaext_shortcut:
542___
543$code.=<<___ if ($win64);
544	lea	`-8-5*16`(%rsp),%rsp
545	movaps	%xmm6,-8-5*16(%rax)
546	movaps	%xmm7,-8-4*16(%rax)
547	movaps	%xmm8,-8-3*16(%rax)
548	movaps	%xmm9,-8-2*16(%rax)
549	movaps	%xmm10,-8-1*16(%rax)
550.Lprologue_shaext:
551___
552$code.=<<___;
553	lea		K256+0x80(%rip),$Tbl
554	movdqu		($ctx),$ABEF		# DCBA
555	movdqu		16($ctx),$CDGH		# HGFE
556	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
557
558	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
559	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
560	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
561	movdqa		$TMP,$BSWAP		# offload
562	palignr		\$8,$CDGH,$ABEF		# ABEF
563	punpcklqdq	$Wi,$CDGH		# CDGH
564	jmp		.Loop_shaext
565
566.align	16
567.Loop_shaext:
568	movdqu		($inp),@MSG[0]
569	movdqu		0x10($inp),@MSG[1]
570	movdqu		0x20($inp),@MSG[2]
571	pshufb		$TMP,@MSG[0]
572	movdqu		0x30($inp),@MSG[3]
573
574	movdqa		0*32-0x80($Tbl),$Wi
575	paddd		@MSG[0],$Wi
576	pshufb		$TMP,@MSG[1]
577	movdqa		$CDGH,$CDGH_SAVE	# offload
578	sha256rnds2	$ABEF,$CDGH		# 0-3
579	pshufd		\$0x0e,$Wi,$Wi
580	nop
581	movdqa		$ABEF,$ABEF_SAVE	# offload
582	sha256rnds2	$CDGH,$ABEF
583
584	movdqa		1*32-0x80($Tbl),$Wi
585	paddd		@MSG[1],$Wi
586	pshufb		$TMP,@MSG[2]
587	sha256rnds2	$ABEF,$CDGH		# 4-7
588	pshufd		\$0x0e,$Wi,$Wi
589	lea		0x40($inp),$inp
590	sha256msg1	@MSG[1],@MSG[0]
591	sha256rnds2	$CDGH,$ABEF
592
593	movdqa		2*32-0x80($Tbl),$Wi
594	paddd		@MSG[2],$Wi
595	pshufb		$TMP,@MSG[3]
596	sha256rnds2	$ABEF,$CDGH		# 8-11
597	pshufd		\$0x0e,$Wi,$Wi
598	movdqa		@MSG[3],$TMP
599	palignr		\$4,@MSG[2],$TMP
600	nop
601	paddd		$TMP,@MSG[0]
602	sha256msg1	@MSG[2],@MSG[1]
603	sha256rnds2	$CDGH,$ABEF
604
605	movdqa		3*32-0x80($Tbl),$Wi
606	paddd		@MSG[3],$Wi
607	sha256msg2	@MSG[3],@MSG[0]
608	sha256rnds2	$ABEF,$CDGH		# 12-15
609	pshufd		\$0x0e,$Wi,$Wi
610	movdqa		@MSG[0],$TMP
611	palignr		\$4,@MSG[3],$TMP
612	nop
613	paddd		$TMP,@MSG[1]
614	sha256msg1	@MSG[3],@MSG[2]
615	sha256rnds2	$CDGH,$ABEF
616___
617for($i=4;$i<16-3;$i++) {
618$code.=<<___;
619	movdqa		$i*32-0x80($Tbl),$Wi
620	paddd		@MSG[0],$Wi
621	sha256msg2	@MSG[0],@MSG[1]
622	sha256rnds2	$ABEF,$CDGH		# 16-19...
623	pshufd		\$0x0e,$Wi,$Wi
624	movdqa		@MSG[1],$TMP
625	palignr		\$4,@MSG[0],$TMP
626	nop
627	paddd		$TMP,@MSG[2]
628	sha256msg1	@MSG[0],@MSG[3]
629	sha256rnds2	$CDGH,$ABEF
630___
631	push(@MSG,shift(@MSG));
632}
633$code.=<<___;
634	movdqa		13*32-0x80($Tbl),$Wi
635	paddd		@MSG[0],$Wi
636	sha256msg2	@MSG[0],@MSG[1]
637	sha256rnds2	$ABEF,$CDGH		# 52-55
638	pshufd		\$0x0e,$Wi,$Wi
639	movdqa		@MSG[1],$TMP
640	palignr		\$4,@MSG[0],$TMP
641	sha256rnds2	$CDGH,$ABEF
642	paddd		$TMP,@MSG[2]
643
644	movdqa		14*32-0x80($Tbl),$Wi
645	paddd		@MSG[1],$Wi
646	sha256rnds2	$ABEF,$CDGH		# 56-59
647	pshufd		\$0x0e,$Wi,$Wi
648	sha256msg2	@MSG[1],@MSG[2]
649	movdqa		$BSWAP,$TMP
650	sha256rnds2	$CDGH,$ABEF
651
652	movdqa		15*32-0x80($Tbl),$Wi
653	paddd		@MSG[2],$Wi
654	nop
655	sha256rnds2	$ABEF,$CDGH		# 60-63
656	pshufd		\$0x0e,$Wi,$Wi
657	dec		$num
658	nop
659	sha256rnds2	$CDGH,$ABEF
660
661	paddd		$CDGH_SAVE,$CDGH
662	paddd		$ABEF_SAVE,$ABEF
663	jnz		.Loop_shaext
664
665	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
666	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
667	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
668	punpckhqdq	$CDGH,$ABEF		# DCBA
669	palignr		\$8,$TMP,$CDGH		# HGFE
670
671	movdqu	$ABEF,($ctx)
672	movdqu	$CDGH,16($ctx)
673___
674$code.=<<___ if ($win64);
675	movaps	-8-5*16(%rax),%xmm6
676	movaps	-8-4*16(%rax),%xmm7
677	movaps	-8-3*16(%rax),%xmm8
678	movaps	-8-2*16(%rax),%xmm9
679	movaps	-8-1*16(%rax),%xmm10
680	mov	%rax,%rsp
681.Lepilogue_shaext:
682___
683$code.=<<___;
684	ret
685.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
686___
687}}}
688{{{
689
690my $a4=$T1;
691my ($a,$b,$c,$d,$e,$f,$g,$h);
692
693sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
694{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
695  my $arg = pop;
696    $arg = "\$$arg" if ($arg*1 eq $arg);
697    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
698}
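# For instance [assuming the SHA-256 flavour, where $a0 is "%r13d"],
# a call like &ror($a0,5) is caught by AUTOLOAD above and appends
# "\tror\t\$5,%r13d\n" to $code: the last Perl argument becomes the
# first (AT&T source) operand, so the perlasm-style &op(dst,...,src)
# order comes out reversed on output.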
699
700sub body_00_15 () {
701	(
702	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
703
704	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
705	'&mov	($a,$a1)',
706	'&mov	($a4,$f)',
707
708	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
709	'&xor	($a0,$e)',
710	'&xor	($a4,$g)',			# f^g
711
712	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
713	'&xor	($a1,$a)',
714	'&and	($a4,$e)',			# (f^g)&e
715
716	'&xor	($a0,$e)',
717	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
718	'&mov	($a2,$a)',
719
720	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
721	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
722	'&xor	($a2,$b)',			# a^b, b^c in next round
723
724	'&add	($h,$a4)',			# h+=Ch(e,f,g)
725	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
726	'&and	($a3,$a2)',			# (b^c)&(a^b)
727
728	'&xor	($a1,$a)',
729	'&add	($h,$a0)',			# h+=Sigma1(e)
730	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
731
732	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
733	'&add	($d,$h)',			# d+=h
734	'&add	($h,$a3)',			# h+=Maj(a,b,c)
735
736	'&mov	($a0,$d)',
737	'&add	($a1,$h);'.			# h+=Sigma0(a)
738	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
739	);
740}
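# Note that body_00_15() returns a list of code *strings* rather than
# emitting anything itself; the SIMD drivers below eval() them a few at
# a time, which is how integer rounds get interleaved with the Xupdate
# vector instructions. Expanding plain rounds is then just
#
#	foreach (body_00_15()) { eval; }
#
# as done for the final 16 rounds of each code path further down.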
741
742######################################################################
743# SSSE3 code path
744#
745if ($SZ==4) {	# SHA256 only
746my @X = map("%xmm$_",(0..3));
747my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
748
749$code.=<<___;
750.type	${func}_ssse3,\@function,3
751.align	64
752${func}_ssse3:
753.Lssse3_shortcut:
754	push	%rbx
755	push	%rbp
756	push	%r12
757	push	%r13
758	push	%r14
759	push	%r15
760	mov	%rsp,%r11		# copy %rsp
761	shl	\$4,%rdx		# num*16
762	sub	\$`$framesz+$win64*16*4`,%rsp
763	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
764	and	\$-64,%rsp		# align stack frame
765	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
767	mov	%rdx,$_end		# save end pointer, "3rd" arg
768	mov	%r11,$_rsp		# save copy of %rsp
769___
770$code.=<<___ if ($win64);
771	movaps	%xmm6,16*$SZ+32(%rsp)
772	movaps	%xmm7,16*$SZ+48(%rsp)
773	movaps	%xmm8,16*$SZ+64(%rsp)
774	movaps	%xmm9,16*$SZ+80(%rsp)
775___
776$code.=<<___;
777.Lprologue_ssse3:
778
779	mov	$SZ*0($ctx),$A
780	mov	$SZ*1($ctx),$B
781	mov	$SZ*2($ctx),$C
782	mov	$SZ*3($ctx),$D
783	mov	$SZ*4($ctx),$E
784	mov	$SZ*5($ctx),$F
785	mov	$SZ*6($ctx),$G
786	mov	$SZ*7($ctx),$H
787___
788
789$code.=<<___;
790	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
791	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
792	jmp	.Lloop_ssse3
793.align	16
794.Lloop_ssse3:
795	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
796	movdqu	0x00($inp),@X[0]
797	movdqu	0x10($inp),@X[1]
798	movdqu	0x20($inp),@X[2]
799	pshufb	$t3,@X[0]
800	movdqu	0x30($inp),@X[3]
801	lea	$TABLE(%rip),$Tbl
802	pshufb	$t3,@X[1]
803	movdqa	0x00($Tbl),$t0
804	movdqa	0x20($Tbl),$t1
805	pshufb	$t3,@X[2]
806	paddd	@X[0],$t0
807	movdqa	0x40($Tbl),$t2
808	pshufb	$t3,@X[3]
809	movdqa	0x60($Tbl),$t3
810	paddd	@X[1],$t1
811	paddd	@X[2],$t2
812	paddd	@X[3],$t3
813	movdqa	$t0,0x00(%rsp)
814	mov	$A,$a1
815	movdqa	$t1,0x10(%rsp)
816	mov	$B,$a3
817	movdqa	$t2,0x20(%rsp)
818	xor	$C,$a3			# magic
819	movdqa	$t3,0x30(%rsp)
820	mov	$E,$a0
821	jmp	.Lssse3_00_47
822
823.align	16
824.Lssse3_00_47:
825	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
826___
827sub Xupdate_256_SSSE3 () {
828	(
829	'&movdqa	($t0,@X[1]);',
830	'&movdqa	($t3,@X[3])',
831	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
832	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
833	'&movdqa	($t1,$t0)',
834	'&movdqa	($t2,$t0);',
835	'&psrld		($t0,$sigma0[2])',
836	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
837	'&psrld		($t2,$sigma0[0])',
838	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
839	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
840	'&pxor		($t0,$t2)',
841	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
842	'&pxor		($t0,$t1)',
843	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
844	'&pxor		($t0,$t2);',
845	 '&movdqa	($t2,$t3)',
846	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
847	 '&psrld	($t3,$sigma1[2])',
848	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
849	 '&psrlq	($t2,$sigma1[0])',
850	 '&pxor		($t3,$t2);',
851	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
852	 '&pxor		($t3,$t2)',
853	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
854	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
855	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
856	 '&movdqa	($t2,$t3);',
857	 '&psrld	($t3,$sigma1[2])',
858	 '&psrlq	($t2,$sigma1[0])',
859	 '&pxor		($t3,$t2);',
860	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
861	 '&pxor		($t3,$t2);',
862	'&movdqa	($t2,16*2*$j."($Tbl)")',
863	 '&pshufb	($t3,$t5)',
864	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
865	);
866}
867
868sub SSSE3_256_00_47 () {
869my $j = shift;
870my $body = shift;
871my @X = @_;
872my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
873
874    if (0) {
875	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
876	    eval;
877	    eval(shift(@insns));
878	    eval(shift(@insns));
879	    eval(shift(@insns));
880	}
881    } else {			# squeeze extra 4% on Westmere and 19% on Atom
882	  eval(shift(@insns));	#@
883	&movdqa		($t0,@X[1]);
884	  eval(shift(@insns));
885	  eval(shift(@insns));
886	&movdqa		($t3,@X[3]);
887	  eval(shift(@insns));	#@
888	  eval(shift(@insns));
889	  eval(shift(@insns));
890	  eval(shift(@insns));	#@
891	  eval(shift(@insns));
892	&palignr	($t0,@X[0],$SZ);	# X[1..4]
893	  eval(shift(@insns));
894	  eval(shift(@insns));
895	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
896	  eval(shift(@insns));
897	  eval(shift(@insns));
898	  eval(shift(@insns));
899	  eval(shift(@insns));	#@
900	&movdqa		($t1,$t0);
901	  eval(shift(@insns));
902	  eval(shift(@insns));
903	&movdqa		($t2,$t0);
904	  eval(shift(@insns));	#@
905	  eval(shift(@insns));
906	&psrld		($t0,$sigma0[2]);
907	  eval(shift(@insns));
908	  eval(shift(@insns));
909	  eval(shift(@insns));
910	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
911	  eval(shift(@insns));	#@
912	  eval(shift(@insns));
913	&psrld		($t2,$sigma0[0]);
914	  eval(shift(@insns));
915	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
917	  eval(shift(@insns));
918	  eval(shift(@insns));	#@
919	&pslld		($t1,8*$SZ-$sigma0[1]);
920	  eval(shift(@insns));
921	  eval(shift(@insns));
922	&pxor		($t0,$t2);
923	  eval(shift(@insns));	#@
924	  eval(shift(@insns));
925	  eval(shift(@insns));
926	  eval(shift(@insns));	#@
927	&psrld		($t2,$sigma0[1]-$sigma0[0]);
928	  eval(shift(@insns));
929	&pxor		($t0,$t1);
930	  eval(shift(@insns));
931	  eval(shift(@insns));
932	&pslld		($t1,$sigma0[1]-$sigma0[0]);
933	  eval(shift(@insns));
934	  eval(shift(@insns));
935	&pxor		($t0,$t2);
936	  eval(shift(@insns));
937	  eval(shift(@insns));	#@
938	 &movdqa	($t2,$t3);
939	  eval(shift(@insns));
940	  eval(shift(@insns));
941	&pxor		($t0,$t1);		# sigma0(X[1..4])
942	  eval(shift(@insns));	#@
943	  eval(shift(@insns));
944	  eval(shift(@insns));
945	 &psrld		($t3,$sigma1[2]);
946	  eval(shift(@insns));
947	  eval(shift(@insns));
948	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
949	  eval(shift(@insns));	#@
950	  eval(shift(@insns));
951	 &psrlq		($t2,$sigma1[0]);
952	  eval(shift(@insns));
953	  eval(shift(@insns));
954	  eval(shift(@insns));
955	 &pxor		($t3,$t2);
956	  eval(shift(@insns));	#@
957	  eval(shift(@insns));
958	  eval(shift(@insns));
959	  eval(shift(@insns));	#@
960	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
961	  eval(shift(@insns));
962	  eval(shift(@insns));
963	 &pxor		($t3,$t2);
964	  eval(shift(@insns));	#@
965	  eval(shift(@insns));
966	  eval(shift(@insns));
967	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
968	 &pshufd	($t3,$t3,0b10000000);
969	  eval(shift(@insns));
970	  eval(shift(@insns));
971	  eval(shift(@insns));
972	 &psrldq	($t3,8);
973	  eval(shift(@insns));
974	  eval(shift(@insns));	#@
975	  eval(shift(@insns));
976	  eval(shift(@insns));
977	  eval(shift(@insns));	#@
978	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
979	  eval(shift(@insns));
980	  eval(shift(@insns));
981	  eval(shift(@insns));
982	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
983	  eval(shift(@insns));
984	  eval(shift(@insns));	#@
985	  eval(shift(@insns));
986	 &movdqa	($t2,$t3);
987	  eval(shift(@insns));
988	  eval(shift(@insns));
989	 &psrld		($t3,$sigma1[2]);
990	  eval(shift(@insns));
991	  eval(shift(@insns));	#@
992	 &psrlq		($t2,$sigma1[0]);
993	  eval(shift(@insns));
994	  eval(shift(@insns));
995	 &pxor		($t3,$t2);
996	  eval(shift(@insns));	#@
997	  eval(shift(@insns));
998	  eval(shift(@insns));
999	  eval(shift(@insns));	#@
1000	  eval(shift(@insns));
1001	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1002	  eval(shift(@insns));
1003	  eval(shift(@insns));
1004	  eval(shift(@insns));
1005	 &pxor		($t3,$t2);
1006	  eval(shift(@insns));
1007	  eval(shift(@insns));
1008	  eval(shift(@insns));	#@
1009	 #&pshufb	($t3,$t5);
1010	 &pshufd	($t3,$t3,0b00001000);
1011	  eval(shift(@insns));
1012	  eval(shift(@insns));
1013	&movdqa		($t2,16*2*$j."($Tbl)");
1014	  eval(shift(@insns));	#@
1015	  eval(shift(@insns));
1016	 &pslldq	($t3,8);
1017	  eval(shift(@insns));
1018	  eval(shift(@insns));
1019	  eval(shift(@insns));
1020	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1021	  eval(shift(@insns));	#@
1022	  eval(shift(@insns));
1023	  eval(shift(@insns));
1024    }
1025	&paddd		($t2,@X[0]);
1026	  foreach (@insns) { eval; }		# remaining instructions
1027	&movdqa		(16*$j."(%rsp)",$t2);
1028}
1029
1030    for ($i=0,$j=0; $j<4; $j++) {
1031	&SSSE3_256_00_47($j,\&body_00_15,@X);
1032	push(@X,shift(@X));			# rotate(@X)
1033    }
1034	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1035	&jne	(".Lssse3_00_47");
1036
1037    for ($i=0; $i<16; ) {
1038	foreach(body_00_15()) { eval; }
1039    }
1040$code.=<<___;
1041	mov	$_ctx,$ctx
1042	mov	$a1,$A
1043
1044	add	$SZ*0($ctx),$A
1045	lea	16*$SZ($inp),$inp
1046	add	$SZ*1($ctx),$B
1047	add	$SZ*2($ctx),$C
1048	add	$SZ*3($ctx),$D
1049	add	$SZ*4($ctx),$E
1050	add	$SZ*5($ctx),$F
1051	add	$SZ*6($ctx),$G
1052	add	$SZ*7($ctx),$H
1053
1054	cmp	$_end,$inp
1055
1056	mov	$A,$SZ*0($ctx)
1057	mov	$B,$SZ*1($ctx)
1058	mov	$C,$SZ*2($ctx)
1059	mov	$D,$SZ*3($ctx)
1060	mov	$E,$SZ*4($ctx)
1061	mov	$F,$SZ*5($ctx)
1062	mov	$G,$SZ*6($ctx)
1063	mov	$H,$SZ*7($ctx)
1064	jb	.Lloop_ssse3
1065
1066	mov	$_rsp,%rsi
1067___
1068$code.=<<___ if ($win64);
1069	movaps	16*$SZ+32(%rsp),%xmm6
1070	movaps	16*$SZ+48(%rsp),%xmm7
1071	movaps	16*$SZ+64(%rsp),%xmm8
1072	movaps	16*$SZ+80(%rsp),%xmm9
1073___
1074$code.=<<___;
1075	mov	(%rsi),%r15
1076	mov	8(%rsi),%r14
1077	mov	16(%rsi),%r13
1078	mov	24(%rsi),%r12
1079	mov	32(%rsi),%rbp
1080	mov	40(%rsi),%rbx
1081	lea	48(%rsi),%rsp
1082.Lepilogue_ssse3:
1083	ret
1084.size	${func}_ssse3,.-${func}_ssse3
1085___
1086}
1087
1088if ($avx) {{
1089######################################################################
1090# XOP code path
1091#
1092if ($SZ==8) {	# SHA512 only
1093$code.=<<___;
1094.type	${func}_xop,\@function,3
1095.align	64
1096${func}_xop:
1097.Lxop_shortcut:
1098	push	%rbx
1099	push	%rbp
1100	push	%r12
1101	push	%r13
1102	push	%r14
1103	push	%r15
1104	mov	%rsp,%r11		# copy %rsp
1105	shl	\$4,%rdx		# num*16
1106	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1107	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1108	and	\$-64,%rsp		# align stack frame
1109	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1111	mov	%rdx,$_end		# save end pointer, "3rd" arg
1112	mov	%r11,$_rsp		# save copy of %rsp
1113___
1114$code.=<<___ if ($win64);
1115	movaps	%xmm6,16*$SZ+32(%rsp)
1116	movaps	%xmm7,16*$SZ+48(%rsp)
1117	movaps	%xmm8,16*$SZ+64(%rsp)
1118	movaps	%xmm9,16*$SZ+80(%rsp)
1119___
1120$code.=<<___ if ($win64 && $SZ>4);
1121	movaps	%xmm10,16*$SZ+96(%rsp)
1122	movaps	%xmm11,16*$SZ+112(%rsp)
1123___
1124$code.=<<___;
1125.Lprologue_xop:
1126
1127	vzeroupper
1128	mov	$SZ*0($ctx),$A
1129	mov	$SZ*1($ctx),$B
1130	mov	$SZ*2($ctx),$C
1131	mov	$SZ*3($ctx),$D
1132	mov	$SZ*4($ctx),$E
1133	mov	$SZ*5($ctx),$F
1134	mov	$SZ*6($ctx),$G
1135	mov	$SZ*7($ctx),$H
1136	jmp	.Lloop_xop
1137___
1138					if ($SZ==4) {	# SHA256
1139    my @X = map("%xmm$_",(0..3));
1140    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1141
1142$code.=<<___;
1143.align	16
1144.Lloop_xop:
1145	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1146	vmovdqu	0x00($inp),@X[0]
1147	vmovdqu	0x10($inp),@X[1]
1148	vmovdqu	0x20($inp),@X[2]
1149	vmovdqu	0x30($inp),@X[3]
1150	vpshufb	$t3,@X[0],@X[0]
1151	lea	$TABLE(%rip),$Tbl
1152	vpshufb	$t3,@X[1],@X[1]
1153	vpshufb	$t3,@X[2],@X[2]
1154	vpaddd	0x00($Tbl),@X[0],$t0
1155	vpshufb	$t3,@X[3],@X[3]
1156	vpaddd	0x20($Tbl),@X[1],$t1
1157	vpaddd	0x40($Tbl),@X[2],$t2
1158	vpaddd	0x60($Tbl),@X[3],$t3
1159	vmovdqa	$t0,0x00(%rsp)
1160	mov	$A,$a1
1161	vmovdqa	$t1,0x10(%rsp)
1162	mov	$B,$a3
1163	vmovdqa	$t2,0x20(%rsp)
1164	xor	$C,$a3			# magic
1165	vmovdqa	$t3,0x30(%rsp)
1166	mov	$E,$a0
1167	jmp	.Lxop_00_47
1168
1169.align	16
1170.Lxop_00_47:
1171	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1172___
1173sub XOP_256_00_47 () {
1174my $j = shift;
1175my $body = shift;
1176my @X = @_;
1177my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1178
1179	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
1180	  eval(shift(@insns));
1181	  eval(shift(@insns));
1182	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
1183	  eval(shift(@insns));
1184	  eval(shift(@insns));
1185	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
1186	  eval(shift(@insns));
1187	  eval(shift(@insns));
1188	&vpsrld		($t0,$t0,$sigma0[2]);
1189	  eval(shift(@insns));
1190	  eval(shift(@insns));
1191	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
1192	  eval(shift(@insns));
1193	  eval(shift(@insns));
1194	  eval(shift(@insns));
1195	  eval(shift(@insns));
1196	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
1197	  eval(shift(@insns));
1198	  eval(shift(@insns));
1199	&vpxor		($t0,$t0,$t1);
1200	  eval(shift(@insns));
1201	  eval(shift(@insns));
1202	  eval(shift(@insns));
1203	  eval(shift(@insns));
1204	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
1205	  eval(shift(@insns));
1206	  eval(shift(@insns));
1207	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
1208	  eval(shift(@insns));
1209	  eval(shift(@insns));
1210	 &vpsrld	($t2,@X[3],$sigma1[2]);
1211	  eval(shift(@insns));
1212	  eval(shift(@insns));
1213	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
1214	  eval(shift(@insns));
1215	  eval(shift(@insns));
1216	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1217	  eval(shift(@insns));
1218	  eval(shift(@insns));
1219	 &vpxor		($t3,$t3,$t2);
1220	  eval(shift(@insns));
1221	  eval(shift(@insns));
1222	  eval(shift(@insns));
1223	  eval(shift(@insns));
1224	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1225	  eval(shift(@insns));
1226	  eval(shift(@insns));
1227	  eval(shift(@insns));
1228	  eval(shift(@insns));
1229	&vpsrldq	($t3,$t3,8);
1230	  eval(shift(@insns));
1231	  eval(shift(@insns));
1232	  eval(shift(@insns));
1233	  eval(shift(@insns));
1234	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1235	  eval(shift(@insns));
1236	  eval(shift(@insns));
1237	  eval(shift(@insns));
1238	  eval(shift(@insns));
1239	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
1240	  eval(shift(@insns));
1241	  eval(shift(@insns));
1242	 &vpsrld	($t2,@X[0],$sigma1[2]);
1243	  eval(shift(@insns));
1244	  eval(shift(@insns));
1245	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1246	  eval(shift(@insns));
1247	  eval(shift(@insns));
1248	 &vpxor		($t3,$t3,$t2);
1249	  eval(shift(@insns));
1250	  eval(shift(@insns));
1251	  eval(shift(@insns));
1252	  eval(shift(@insns));
1253	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
1254	  eval(shift(@insns));
1255	  eval(shift(@insns));
1256	  eval(shift(@insns));
1257	  eval(shift(@insns));
1258	&vpslldq	($t3,$t3,8);		# 22 instructions
1259	  eval(shift(@insns));
1260	  eval(shift(@insns));
1261	  eval(shift(@insns));
1262	  eval(shift(@insns));
1263	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
1264	  eval(shift(@insns));
1265	  eval(shift(@insns));
1266	  eval(shift(@insns));
1267	  eval(shift(@insns));
1268	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1269	  foreach (@insns) { eval; }		# remaining instructions
1270	&vmovdqa	(16*$j."(%rsp)",$t2);
1271}
1272
1273    for ($i=0,$j=0; $j<4; $j++) {
1274	&XOP_256_00_47($j,\&body_00_15,@X);
1275	push(@X,shift(@X));			# rotate(@X)
1276    }
1277	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1278	&jne	(".Lxop_00_47");
1279
1280    for ($i=0; $i<16; ) {
1281	foreach(body_00_15()) { eval; }
1282    }
1283
1284					} else {	# SHA512
1285    my @X = map("%xmm$_",(0..7));
1286    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1287
1288$code.=<<___;
1289.align	16
1290.Lloop_xop:
1291	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1292	vmovdqu	0x00($inp),@X[0]
1293	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1294	vmovdqu	0x10($inp),@X[1]
1295	vmovdqu	0x20($inp),@X[2]
1296	vpshufb	$t3,@X[0],@X[0]
1297	vmovdqu	0x30($inp),@X[3]
1298	vpshufb	$t3,@X[1],@X[1]
1299	vmovdqu	0x40($inp),@X[4]
1300	vpshufb	$t3,@X[2],@X[2]
1301	vmovdqu	0x50($inp),@X[5]
1302	vpshufb	$t3,@X[3],@X[3]
1303	vmovdqu	0x60($inp),@X[6]
1304	vpshufb	$t3,@X[4],@X[4]
1305	vmovdqu	0x70($inp),@X[7]
1306	vpshufb	$t3,@X[5],@X[5]
1307	vpaddq	-0x80($Tbl),@X[0],$t0
1308	vpshufb	$t3,@X[6],@X[6]
1309	vpaddq	-0x60($Tbl),@X[1],$t1
1310	vpshufb	$t3,@X[7],@X[7]
1311	vpaddq	-0x40($Tbl),@X[2],$t2
1312	vpaddq	-0x20($Tbl),@X[3],$t3
1313	vmovdqa	$t0,0x00(%rsp)
1314	vpaddq	0x00($Tbl),@X[4],$t0
1315	vmovdqa	$t1,0x10(%rsp)
1316	vpaddq	0x20($Tbl),@X[5],$t1
1317	vmovdqa	$t2,0x20(%rsp)
1318	vpaddq	0x40($Tbl),@X[6],$t2
1319	vmovdqa	$t3,0x30(%rsp)
1320	vpaddq	0x60($Tbl),@X[7],$t3
1321	vmovdqa	$t0,0x40(%rsp)
1322	mov	$A,$a1
1323	vmovdqa	$t1,0x50(%rsp)
1324	mov	$B,$a3
1325	vmovdqa	$t2,0x60(%rsp)
1326	xor	$C,$a3			# magic
1327	vmovdqa	$t3,0x70(%rsp)
1328	mov	$E,$a0
1329	jmp	.Lxop_00_47
1330
1331.align	16
1332.Lxop_00_47:
1333	add	\$`16*2*$SZ`,$Tbl
1334___
1335sub XOP_512_00_47 () {
1336my $j = shift;
1337my $body = shift;
1338my @X = @_;
1339my @insns = (&$body,&$body);			# 52 instructions
1340
1341	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
1342	  eval(shift(@insns));
1343	  eval(shift(@insns));
1344	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
1345	  eval(shift(@insns));
1346	  eval(shift(@insns));
1347	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
1348	  eval(shift(@insns));
1349	  eval(shift(@insns));
1350	&vpsrlq		($t0,$t0,$sigma0[2]);
1351	  eval(shift(@insns));
1352	  eval(shift(@insns));
1353	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
1354	  eval(shift(@insns));
1355	  eval(shift(@insns));
1356	  eval(shift(@insns));
1357	  eval(shift(@insns));
1358	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
1359	  eval(shift(@insns));
1360	  eval(shift(@insns));
1361	&vpxor		($t0,$t0,$t1);
1362	  eval(shift(@insns));
1363	  eval(shift(@insns));
1364	  eval(shift(@insns));
1365	  eval(shift(@insns));
1366	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
1367	  eval(shift(@insns));
1368	  eval(shift(@insns));
1369	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
1370	  eval(shift(@insns));
1371	  eval(shift(@insns));
1372	 &vpsrlq	($t2,@X[7],$sigma1[2]);
1373	  eval(shift(@insns));
1374	  eval(shift(@insns));
1375	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
1376	  eval(shift(@insns));
1377	  eval(shift(@insns));
1378	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
1379	  eval(shift(@insns));
1380	  eval(shift(@insns));
1381	 &vpxor		($t3,$t3,$t2);
1382	  eval(shift(@insns));
1383	  eval(shift(@insns));
1384	  eval(shift(@insns));
1385	  eval(shift(@insns));
1386	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1387	  eval(shift(@insns));
1388	  eval(shift(@insns));
1389	  eval(shift(@insns));
1390	  eval(shift(@insns));
1391	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1392	  eval(shift(@insns));
1393	  eval(shift(@insns));
1394	  eval(shift(@insns));
1395	  eval(shift(@insns));
1396	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1397	  foreach (@insns) { eval; }		# remaining instructions
1398	&vmovdqa	(16*$j."(%rsp)",$t2);
1399}
1400
1401    for ($i=0,$j=0; $j<8; $j++) {
1402	&XOP_512_00_47($j,\&body_00_15,@X);
1403	push(@X,shift(@X));			# rotate(@X)
1404    }
1405	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1406	&jne	(".Lxop_00_47");
1407
1408    for ($i=0; $i<16; ) {
1409	foreach(body_00_15()) { eval; }
1410    }
1411}
1412$code.=<<___;
1413	mov	$_ctx,$ctx
1414	mov	$a1,$A
1415
1416	add	$SZ*0($ctx),$A
1417	lea	16*$SZ($inp),$inp
1418	add	$SZ*1($ctx),$B
1419	add	$SZ*2($ctx),$C
1420	add	$SZ*3($ctx),$D
1421	add	$SZ*4($ctx),$E
1422	add	$SZ*5($ctx),$F
1423	add	$SZ*6($ctx),$G
1424	add	$SZ*7($ctx),$H
1425
1426	cmp	$_end,$inp
1427
1428	mov	$A,$SZ*0($ctx)
1429	mov	$B,$SZ*1($ctx)
1430	mov	$C,$SZ*2($ctx)
1431	mov	$D,$SZ*3($ctx)
1432	mov	$E,$SZ*4($ctx)
1433	mov	$F,$SZ*5($ctx)
1434	mov	$G,$SZ*6($ctx)
1435	mov	$H,$SZ*7($ctx)
1436	jb	.Lloop_xop
1437
1438	mov	$_rsp,%rsi
1439	vzeroupper
1440___
1441$code.=<<___ if ($win64);
1442	movaps	16*$SZ+32(%rsp),%xmm6
1443	movaps	16*$SZ+48(%rsp),%xmm7
1444	movaps	16*$SZ+64(%rsp),%xmm8
1445	movaps	16*$SZ+80(%rsp),%xmm9
1446___
1447$code.=<<___ if ($win64 && $SZ>4);
1448	movaps	16*$SZ+96(%rsp),%xmm10
1449	movaps	16*$SZ+112(%rsp),%xmm11
1450___
1451$code.=<<___;
1452	mov	(%rsi),%r15
1453	mov	8(%rsi),%r14
1454	mov	16(%rsi),%r13
1455	mov	24(%rsi),%r12
1456	mov	32(%rsi),%rbp
1457	mov	40(%rsi),%rbx
1458	lea	48(%rsi),%rsp
1459.Lepilogue_xop:
1460	ret
1461.size	${func}_xop,.-${func}_xop
1462___
1463}
1464######################################################################
1465# AVX+shrd code path
1466#
1467local *ror = sub { &shrd(@_[0],@_) };
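# (Every "ror $imm,%reg" in the shared round code is emitted as the
#  equivalent "shrd $imm,%reg,%reg" on this path; that substitution is
#  the "switch from ror to shrd" credited with a fair share of the AVX
#  improvement in the table notes above.)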
1468
1469$code.=<<___;
1470.type	${func}_avx,\@function,3
1471.align	64
1472${func}_avx:
1473.Lavx_shortcut:
1474	push	%rbx
1475	push	%rbp
1476	push	%r12
1477	push	%r13
1478	push	%r14
1479	push	%r15
1480	mov	%rsp,%r11		# copy %rsp
1481	shl	\$4,%rdx		# num*16
1482	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1483	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1484	and	\$-64,%rsp		# align stack frame
1485	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1487	mov	%rdx,$_end		# save end pointer, "3rd" arg
1488	mov	%r11,$_rsp		# save copy of %rsp
1489___
1490$code.=<<___ if ($win64);
1491	movaps	%xmm6,16*$SZ+32(%rsp)
1492	movaps	%xmm7,16*$SZ+48(%rsp)
1493	movaps	%xmm8,16*$SZ+64(%rsp)
1494	movaps	%xmm9,16*$SZ+80(%rsp)
1495___
1496$code.=<<___ if ($win64 && $SZ>4);
1497	movaps	%xmm10,16*$SZ+96(%rsp)
1498	movaps	%xmm11,16*$SZ+112(%rsp)
1499___
1500$code.=<<___;
1501.Lprologue_avx:
1502
1503	vzeroupper
1504	mov	$SZ*0($ctx),$A
1505	mov	$SZ*1($ctx),$B
1506	mov	$SZ*2($ctx),$C
1507	mov	$SZ*3($ctx),$D
1508	mov	$SZ*4($ctx),$E
1509	mov	$SZ*5($ctx),$F
1510	mov	$SZ*6($ctx),$G
1511	mov	$SZ*7($ctx),$H
1512___
1513					if ($SZ==4) {	# SHA256
1514    my @X = map("%xmm$_",(0..3));
1515    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1516
1517$code.=<<___;
1518	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1519	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1520	jmp	.Lloop_avx
1521.align	16
1522.Lloop_avx:
1523	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1524	vmovdqu	0x00($inp),@X[0]
1525	vmovdqu	0x10($inp),@X[1]
1526	vmovdqu	0x20($inp),@X[2]
1527	vmovdqu	0x30($inp),@X[3]
1528	vpshufb	$t3,@X[0],@X[0]
1529	lea	$TABLE(%rip),$Tbl
1530	vpshufb	$t3,@X[1],@X[1]
1531	vpshufb	$t3,@X[2],@X[2]
1532	vpaddd	0x00($Tbl),@X[0],$t0
1533	vpshufb	$t3,@X[3],@X[3]
1534	vpaddd	0x20($Tbl),@X[1],$t1
1535	vpaddd	0x40($Tbl),@X[2],$t2
1536	vpaddd	0x60($Tbl),@X[3],$t3
1537	vmovdqa	$t0,0x00(%rsp)
1538	mov	$A,$a1
1539	vmovdqa	$t1,0x10(%rsp)
1540	mov	$B,$a3
1541	vmovdqa	$t2,0x20(%rsp)
1542	xor	$C,$a3			# magic
1543	vmovdqa	$t3,0x30(%rsp)
1544	mov	$E,$a0
1545	jmp	.Lavx_00_47
1546
1547.align	16
1548.Lavx_00_47:
1549	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1550___
1551sub Xupdate_256_AVX () {
1552	(
1553	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1554	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1555	'&vpsrld	($t2,$t0,$sigma0[0]);',
1556	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1557	'&vpsrld	($t3,$t0,$sigma0[2])',
1558	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1559	'&vpxor		($t0,$t3,$t2)',
1560	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1561	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1562	'&vpxor		($t0,$t0,$t1)',
1563	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1564	'&vpxor		($t0,$t0,$t2)',
1565	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1566	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1567	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1568	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1569	 '&vpxor	($t2,$t2,$t3);',
1570	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1571	 '&vpxor	($t2,$t2,$t3)',
1572	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1573	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1574	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1575	 '&vpsrld	($t2,$t3,$sigma1[2])',
1576	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1577	 '&vpxor	($t2,$t2,$t3);',
1578	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1579	 '&vpxor	($t2,$t2,$t3)',
1580	 '&vpshufb	($t2,$t2,$t5)',
1581	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1582	);
1583}
1584
1585sub AVX_256_00_47 () {
1586my $j = shift;
1587my $body = shift;
1588my @X = @_;
1589my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1590
1591	foreach (Xupdate_256_AVX()) {		# 29 instructions
1592	    eval;
1593	    eval(shift(@insns));
1594	    eval(shift(@insns));
1595	    eval(shift(@insns));
1596	}
1597	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1598	  foreach (@insns) { eval; }		# remaining instructions
1599	&vmovdqa	(16*$j."(%rsp)",$t2);
1600}
1601
1602    for ($i=0,$j=0; $j<4; $j++) {
1603	&AVX_256_00_47($j,\&body_00_15,@X);
1604	push(@X,shift(@X));			# rotate(@X)
1605    }
1606	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1607	&jne	(".Lavx_00_47");
1608
1609    for ($i=0; $i<16; ) {
1610	foreach(body_00_15()) { eval; }
1611    }
1612
1613					} else {	# SHA512
1614    my @X = map("%xmm$_",(0..7));
1615    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1616
1617$code.=<<___;
1618	jmp	.Lloop_avx
1619.align	16
1620.Lloop_avx:
1621	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1622	vmovdqu	0x00($inp),@X[0]
1623	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1624	vmovdqu	0x10($inp),@X[1]
1625	vmovdqu	0x20($inp),@X[2]
1626	vpshufb	$t3,@X[0],@X[0]
1627	vmovdqu	0x30($inp),@X[3]
1628	vpshufb	$t3,@X[1],@X[1]
1629	vmovdqu	0x40($inp),@X[4]
1630	vpshufb	$t3,@X[2],@X[2]
1631	vmovdqu	0x50($inp),@X[5]
1632	vpshufb	$t3,@X[3],@X[3]
1633	vmovdqu	0x60($inp),@X[6]
1634	vpshufb	$t3,@X[4],@X[4]
1635	vmovdqu	0x70($inp),@X[7]
1636	vpshufb	$t3,@X[5],@X[5]
1637	vpaddq	-0x80($Tbl),@X[0],$t0
1638	vpshufb	$t3,@X[6],@X[6]
1639	vpaddq	-0x60($Tbl),@X[1],$t1
1640	vpshufb	$t3,@X[7],@X[7]
1641	vpaddq	-0x40($Tbl),@X[2],$t2
1642	vpaddq	-0x20($Tbl),@X[3],$t3
1643	vmovdqa	$t0,0x00(%rsp)
1644	vpaddq	0x00($Tbl),@X[4],$t0
1645	vmovdqa	$t1,0x10(%rsp)
1646	vpaddq	0x20($Tbl),@X[5],$t1
1647	vmovdqa	$t2,0x20(%rsp)
1648	vpaddq	0x40($Tbl),@X[6],$t2
1649	vmovdqa	$t3,0x30(%rsp)
1650	vpaddq	0x60($Tbl),@X[7],$t3
1651	vmovdqa	$t0,0x40(%rsp)
1652	mov	$A,$a1
1653	vmovdqa	$t1,0x50(%rsp)
1654	mov	$B,$a3
1655	vmovdqa	$t2,0x60(%rsp)
1656	xor	$C,$a3			# magic
1657	vmovdqa	$t3,0x70(%rsp)
1658	mov	$E,$a0
1659	jmp	.Lavx_00_47
1660
1661.align	16
1662.Lavx_00_47:
1663	add	\$`16*2*$SZ`,$Tbl
1664___
1665sub Xupdate_512_AVX () {
1666	(
1667	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1668	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1669	'&vpsrlq	($t2,$t0,$sigma0[0])',
1670	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1671	'&vpsrlq	($t3,$t0,$sigma0[2])',
1672	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1673	 '&vpxor	($t0,$t3,$t2)',
1674	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1675	 '&vpxor	($t0,$t0,$t1)',
1676	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1677	 '&vpxor	($t0,$t0,$t2)',
1678	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1679	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1680	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1681	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1682	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1683	 '&vpxor	($t3,$t3,$t2)',
1684	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1685	 '&vpxor	($t3,$t3,$t1)',
1686	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1687	 '&vpxor	($t3,$t3,$t2)',
1688	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1689	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1690	);
1691}
1692
1693sub AVX_512_00_47 () {
1694my $j = shift;
1695my $body = shift;
1696my @X = @_;
1697my @insns = (&$body,&$body);			# 52 instructions
1698
1699	foreach (Xupdate_512_AVX()) {		# 23 instructions
1700	    eval;
1701	    eval(shift(@insns));
1702	    eval(shift(@insns));
1703	}
1704	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1705	  foreach (@insns) { eval; }		# remaining instructions
1706	&vmovdqa	(16*$j."(%rsp)",$t2);
1707}
1708
1709    for ($i=0,$j=0; $j<8; $j++) {
1710	&AVX_512_00_47($j,\&body_00_15,@X);
1711	push(@X,shift(@X));			# rotate(@X)
1712    }
1713	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1714	&jne	(".Lavx_00_47");
1715
1716    for ($i=0; $i<16; ) {
1717	foreach(body_00_15()) { eval; }
1718    }
1719}
1720$code.=<<___;
1721	mov	$_ctx,$ctx
1722	mov	$a1,$A
1723
1724	add	$SZ*0($ctx),$A
1725	lea	16*$SZ($inp),$inp
1726	add	$SZ*1($ctx),$B
1727	add	$SZ*2($ctx),$C
1728	add	$SZ*3($ctx),$D
1729	add	$SZ*4($ctx),$E
1730	add	$SZ*5($ctx),$F
1731	add	$SZ*6($ctx),$G
1732	add	$SZ*7($ctx),$H
1733
1734	cmp	$_end,$inp
1735
1736	mov	$A,$SZ*0($ctx)
1737	mov	$B,$SZ*1($ctx)
1738	mov	$C,$SZ*2($ctx)
1739	mov	$D,$SZ*3($ctx)
1740	mov	$E,$SZ*4($ctx)
1741	mov	$F,$SZ*5($ctx)
1742	mov	$G,$SZ*6($ctx)
1743	mov	$H,$SZ*7($ctx)
1744	jb	.Lloop_avx
1745
1746	mov	$_rsp,%rsi
1747	vzeroupper
1748___
1749$code.=<<___ if ($win64);
1750	movaps	16*$SZ+32(%rsp),%xmm6
1751	movaps	16*$SZ+48(%rsp),%xmm7
1752	movaps	16*$SZ+64(%rsp),%xmm8
1753	movaps	16*$SZ+80(%rsp),%xmm9
1754___
1755$code.=<<___ if ($win64 && $SZ>4);
1756	movaps	16*$SZ+96(%rsp),%xmm10
1757	movaps	16*$SZ+112(%rsp),%xmm11
1758___
1759$code.=<<___;
1760	mov	(%rsi),%r15
1761	mov	8(%rsi),%r14
1762	mov	16(%rsi),%r13
1763	mov	24(%rsi),%r12
1764	mov	32(%rsi),%rbp
1765	mov	40(%rsi),%rbx
1766	lea	48(%rsi),%rsp
1767.Lepilogue_avx:
1768	ret
1769.size	${func}_avx,.-${func}_avx
1770___
1771
1772if ($avx>1) {{
1773######################################################################
1774# AVX2+BMI code path
1775#
1776my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1777my $PUSH8=8*2*$SZ;
1778use integer;
1779
1780sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1782	(
1783	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1784
1785	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1786	'&and	($a4,$e)',		# f&e
1787	'&rorx	($a0,$e,$Sigma1[2])',
1788	'&rorx	($a2,$e,$Sigma1[1])',
1789
1790	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1791	'&lea	($h,"($h,$a4)")',
1792	'&andn	($a4,$e,$g)',		# ~e&g
1793	'&xor	($a0,$a2)',
1794
1795	'&rorx	($a1,$e,$Sigma1[0])',
1796	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1797	'&xor	($a0,$a1)',		# Sigma1(e)
1798	'&mov	($a2,$a)',
1799
1800	'&rorx	($a4,$a,$Sigma0[2])',
1801	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1802	'&xor	($a2,$b)',		# a^b, b^c in next round
1803	'&rorx	($a1,$a,$Sigma0[1])',
1804
1805	'&rorx	($a0,$a,$Sigma0[0])',
1806	'&lea	($d,"($d,$h)")',	# d+=h
1807	'&and	($a3,$a2)',		# (b^c)&(a^b)
1808	'&xor	($a1,$a4)',
1809
1810	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1811	'&xor	($a1,$a0)',		# Sigma0(a)
1812	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1813	'&mov	($a4,$e)',		# copy of f in future
1814
1815	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1816	);
	# and at the finish one still has to do $a+=$a1
1818}
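# Why the two lea additions above are safe: the (e&f) and (~e&g) terms
# of Ch can never have a bit set in the same position, so adding them
# produces no carries and equals the xor/or form of Ch, which is what
# lets lea fold the additions. A one-line Perl check of that identity
# (illustration only, never called; made-up name):
sub _ref_ch_add_eq_xor { my ($e,$f,$g)=@_; (($e&$f)+(~$e&$g)) == ((($f^$g)&$e)^$g) }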
1819
1820$code.=<<___;
1821.type	${func}_avx2,\@function,3
1822.align	64
1823${func}_avx2:
1824.Lavx2_shortcut:
1825	push	%rbx
1826	push	%rbp
1827	push	%r12
1828	push	%r13
1829	push	%r14
1830	push	%r15
1831	mov	%rsp,%r11		# copy %rsp
1832	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1833	shl	\$4,%rdx		# num*16
1834	and	\$-256*$SZ,%rsp		# align stack frame
1835	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1836	add	\$`2*$SZ*($rounds-8)`,%rsp
1837	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1839	mov	%rdx,$_end		# save end pointer, "3rd" arg
1840	mov	%r11,$_rsp		# save copy of %rsp
1841___
1842$code.=<<___ if ($win64);
1843	movaps	%xmm6,16*$SZ+32(%rsp)
1844	movaps	%xmm7,16*$SZ+48(%rsp)
1845	movaps	%xmm8,16*$SZ+64(%rsp)
1846	movaps	%xmm9,16*$SZ+80(%rsp)
1847___
1848$code.=<<___ if ($win64 && $SZ>4);
1849	movaps	%xmm10,16*$SZ+96(%rsp)
1850	movaps	%xmm11,16*$SZ+112(%rsp)
1851___
1852$code.=<<___;
1853.Lprologue_avx2:
1854
1855	vzeroupper
1856	sub	\$-16*$SZ,$inp		# inp++, size optimization
1857	mov	$SZ*0($ctx),$A
1858	mov	$inp,%r12		# borrow $T1
1859	mov	$SZ*1($ctx),$B
1860	cmp	%rdx,$inp		# $_end
1861	mov	$SZ*2($ctx),$C
1862	cmove	%rsp,%r12		# next block or random data
1863	mov	$SZ*3($ctx),$D
1864	mov	$SZ*4($ctx),$E
1865	mov	$SZ*5($ctx),$F
1866	mov	$SZ*6($ctx),$G
1867	mov	$SZ*7($ctx),$H
1868___
1869					if ($SZ==4) {	# SHA256
1870    my @X = map("%ymm$_",(0..3));
1871    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1872
1873$code.=<<___;
1874	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1875	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1876	jmp	.Loop_avx2
1877.align	16
1878.Loop_avx2:
1879	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1880	vmovdqu	-16*$SZ+0($inp),%xmm0
1881	vmovdqu	-16*$SZ+16($inp),%xmm1
1882	vmovdqu	-16*$SZ+32($inp),%xmm2
1883	vmovdqu	-16*$SZ+48($inp),%xmm3
1884	#mov		$inp,$_inp	# offload $inp
1885	vinserti128	\$1,(%r12),@X[0],@X[0]
1886	vinserti128	\$1,16(%r12),@X[1],@X[1]
1887	vpshufb		$t3,@X[0],@X[0]
1888	vinserti128	\$1,32(%r12),@X[2],@X[2]
1889	vpshufb		$t3,@X[1],@X[1]
1890	vinserti128	\$1,48(%r12),@X[3],@X[3]
1891
1892	lea	$TABLE(%rip),$Tbl
1893	vpshufb	$t3,@X[2],@X[2]
1894	vpaddd	0x00($Tbl),@X[0],$t0
1895	vpshufb	$t3,@X[3],@X[3]
1896	vpaddd	0x20($Tbl),@X[1],$t1
1897	vpaddd	0x40($Tbl),@X[2],$t2
1898	vpaddd	0x60($Tbl),@X[3],$t3
1899	vmovdqa	$t0,0x00(%rsp)
1900	xor	$a1,$a1
1901	vmovdqa	$t1,0x20(%rsp)
1902	lea	-$PUSH8(%rsp),%rsp
1903	mov	$B,$a3
1904	vmovdqa	$t2,0x00(%rsp)
1905	xor	$C,$a3			# magic
1906	vmovdqa	$t3,0x20(%rsp)
1907	mov	$F,$a4
1908	sub	\$-16*2*$SZ,$Tbl	# size optimization
1909	jmp	.Lavx2_00_47
1910
1911.align	16
1912.Lavx2_00_47:
1913___
1914
1915sub AVX2_256_00_47 () {
1916my $j = shift;
1917my $body = shift;
1918my @X = @_;
1919my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1920my $base = "+2*$PUSH8(%rsp)";
1921
1922	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1923	foreach (Xupdate_256_AVX()) {		# 29 instructions
1924	    eval;
1925	    eval(shift(@insns));
1926	    eval(shift(@insns));
1927	    eval(shift(@insns));
1928	}
1929	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1930	  foreach (@insns) { eval; }		# remaining instructions
1931	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1932}
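# Each call above interleaves one 29-instruction Xupdate_256_AVX schedule
# update (operating on both lanes at once) with four scalar round bodies,
# roughly three scalar instructions per vector one; the trailing vpaddd
# adds the round constants and parks X[i]+K[i] for the freshly scheduled
# quadruple of rounds in the rotating stack slot the scalar code reads
# via $base.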
1933
1934    for ($i=0,$j=0; $j<4; $j++) {
1935	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1936	push(@X,shift(@X));			# rotate(@X)
1937    }
1938	&lea	($Tbl,16*2*$SZ."($Tbl)");
1939	&cmpb	(($SZ-1)."($Tbl)",0);
1940	&jne	(".Lavx2_00_47");
1941
1942    for ($i=0; $i<16; ) {
1943	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1944	foreach(bodyx_00_15()) { eval; }
1945    }
1946					} else {	# SHA512
1947    my @X = map("%ymm$_",(0..7));
1948    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1949
1950$code.=<<___;
1951	jmp	.Loop_avx2
1952.align	16
1953.Loop_avx2:
1954	vmovdqu	-16*$SZ($inp),%xmm0
1955	vmovdqu	-16*$SZ+16($inp),%xmm1
1956	vmovdqu	-16*$SZ+32($inp),%xmm2
1957	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1958	vmovdqu	-16*$SZ+48($inp),%xmm3
1959	vmovdqu	-16*$SZ+64($inp),%xmm4
1960	vmovdqu	-16*$SZ+80($inp),%xmm5
1961	vmovdqu	-16*$SZ+96($inp),%xmm6
1962	vmovdqu	-16*$SZ+112($inp),%xmm7
1963	#mov	$inp,$_inp	# offload $inp
1964	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
1965	vinserti128	\$1,(%r12),@X[0],@X[0]
1966	vinserti128	\$1,16(%r12),@X[1],@X[1]
1967	 vpshufb	$t2,@X[0],@X[0]
1968	vinserti128	\$1,32(%r12),@X[2],@X[2]
1969	 vpshufb	$t2,@X[1],@X[1]
1970	vinserti128	\$1,48(%r12),@X[3],@X[3]
1971	 vpshufb	$t2,@X[2],@X[2]
1972	vinserti128	\$1,64(%r12),@X[4],@X[4]
1973	 vpshufb	$t2,@X[3],@X[3]
1974	vinserti128	\$1,80(%r12),@X[5],@X[5]
1975	 vpshufb	$t2,@X[4],@X[4]
1976	vinserti128	\$1,96(%r12),@X[6],@X[6]
1977	 vpshufb	$t2,@X[5],@X[5]
1978	vinserti128	\$1,112(%r12),@X[7],@X[7]
1979
1980	vpaddq	-0x80($Tbl),@X[0],$t0
1981	vpshufb	$t2,@X[6],@X[6]
1982	vpaddq	-0x60($Tbl),@X[1],$t1
1983	vpshufb	$t2,@X[7],@X[7]
1984	vpaddq	-0x40($Tbl),@X[2],$t2
1985	vpaddq	-0x20($Tbl),@X[3],$t3
1986	vmovdqa	$t0,0x00(%rsp)
1987	vpaddq	0x00($Tbl),@X[4],$t0
1988	vmovdqa	$t1,0x20(%rsp)
1989	vpaddq	0x20($Tbl),@X[5],$t1
1990	vmovdqa	$t2,0x40(%rsp)
1991	vpaddq	0x40($Tbl),@X[6],$t2
1992	vmovdqa	$t3,0x60(%rsp)
1993	lea	-$PUSH8(%rsp),%rsp
1994	vpaddq	0x60($Tbl),@X[7],$t3
1995	vmovdqa	$t0,0x00(%rsp)
1996	xor	$a1,$a1
1997	vmovdqa	$t1,0x20(%rsp)
1998	mov	$B,$a3
1999	vmovdqa	$t2,0x40(%rsp)
2000	xor	$C,$a3			# magic: b^c for the Maj trick
2001	vmovdqa	$t3,0x60(%rsp)
2002	mov	$F,$a4
2003	add	\$16*2*$SZ,$Tbl
2004	jmp	.Lavx2_00_47
2005
2006.align	16
2007.Lavx2_00_47:
2008___
2009
2010sub AVX2_512_00_47 () {
2011my $j = shift;
2012my $body = shift;
2013my @X = @_;
2014my @insns = (&$body,&$body);			# 48 instructions
2015my $base = "+2*$PUSH8(%rsp)";
2016
2017	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
2018	foreach (Xupdate_512_AVX()) {		# 23 instructions
2019	    eval;
2020	    if ($_ !~ /\;$/) {
2021		eval(shift(@insns));
2022		eval(shift(@insns));
2023		eval(shift(@insns));
2024	    }
2025	}
2026	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
2027	  foreach (@insns) { eval; }		# remaining instructions
2028	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2029}
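# The SHA-512 flavour follows the same pattern with eight %ymm registers
# covering the 16-entry schedule of both blocks: each Xupdate_512_AVX
# (23 vector instructions) is interleaved with two scalar round bodies,
# and %rsp is stepped by $PUSH8 only every fourth $j, since each $j
# stores 32 bytes and $PUSH8 is twice as large as in the SHA-256 case.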
2030
2031    for ($i=0,$j=0; $j<8; $j++) {
2032	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
2033	push(@X,shift(@X));			# rotate(@X)
2034    }
2035	&lea	($Tbl,16*2*$SZ."($Tbl)");
2036	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
2037	&jne	(".Lavx2_00_47");
2038
2039    for ($i=0; $i<16; ) {
2040	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2041	foreach(bodyx_00_15()) { eval; }
2042    }
2043}
2044$code.=<<___;
2045	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2046	add	$a1,$A
2047	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2048	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
2049
2050	add	$SZ*0($ctx),$A
2051	add	$SZ*1($ctx),$B
2052	add	$SZ*2($ctx),$C
2053	add	$SZ*3($ctx),$D
2054	add	$SZ*4($ctx),$E
2055	add	$SZ*5($ctx),$F
2056	add	$SZ*6($ctx),$G
2057	add	$SZ*7($ctx),$H
2058
2059	mov	$A,$SZ*0($ctx)
2060	mov	$B,$SZ*1($ctx)
2061	mov	$C,$SZ*2($ctx)
2062	mov	$D,$SZ*3($ctx)
2063	mov	$E,$SZ*4($ctx)
2064	mov	$F,$SZ*5($ctx)
2065	mov	$G,$SZ*6($ctx)
2066	mov	$H,$SZ*7($ctx)
2067
2068	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
2069	je	.Ldone_avx2
2070
2071	xor	$a1,$a1
2072	mov	$B,$a3
2073	xor	$C,$a3			# magic: b^c for the Maj trick
2074	mov	$F,$a4
2075	jmp	.Lower_avx2
2076.align	16
2077.Lower_avx2:
2078___
2079    for ($i=0; $i<8; ) {
2080	my $base="+16($Tbl)";
2081	foreach(bodyx_00_15()) { eval; }
2082    }
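# Second pass: the rounds above ran the first block out of the low
# 128-bit halves of the saved schedule; .Lower_avx2 now replays the
# rounds for the second block by reading the high halves ("+16($Tbl)")
# while walking $Tbl back down the frame, $PUSH8 bytes per eight rounds,
# until it meets %rsp.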
2083$code.=<<___;
2084	lea	-$PUSH8($Tbl),$Tbl
2085	cmp	%rsp,$Tbl
2086	jae	.Lower_avx2
2087
2088	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2089	add	$a1,$A
2090	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2091	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
2092
2093	add	$SZ*0($ctx),$A
2094	add	$SZ*1($ctx),$B
2095	add	$SZ*2($ctx),$C
2096	add	$SZ*3($ctx),$D
2097	add	$SZ*4($ctx),$E
2098	add	$SZ*5($ctx),$F
2099	lea	`2*16*$SZ`($inp),$inp	# inp+=2
2100	add	$SZ*6($ctx),$G
2101	mov	$inp,%r12
2102	add	$SZ*7($ctx),$H
2103	cmp	$_end,$inp
2104
2105	mov	$A,$SZ*0($ctx)
2106	cmove	%rsp,%r12		# next block or stale data
2107	mov	$B,$SZ*1($ctx)
2108	mov	$C,$SZ*2($ctx)
2109	mov	$D,$SZ*3($ctx)
2110	mov	$E,$SZ*4($ctx)
2111	mov	$F,$SZ*5($ctx)
2112	mov	$G,$SZ*6($ctx)
2113	mov	$H,$SZ*7($ctx)
2114
2115	jbe	.Loop_avx2
2116	lea	(%rsp),$Tbl
2117
2118.Ldone_avx2:
2119	lea	($Tbl),%rsp
2120	mov	$_rsp,%rsi
2121	vzeroupper
2122___
2123$code.=<<___ if ($win64);
2124	movaps	16*$SZ+32(%rsp),%xmm6
2125	movaps	16*$SZ+48(%rsp),%xmm7
2126	movaps	16*$SZ+64(%rsp),%xmm8
2127	movaps	16*$SZ+80(%rsp),%xmm9
2128___
2129$code.=<<___ if ($win64 && $SZ>4);
2130	movaps	16*$SZ+96(%rsp),%xmm10
2131	movaps	16*$SZ+112(%rsp),%xmm11
2132___
2133$code.=<<___;
2134	mov	(%rsi),%r15
2135	mov	8(%rsi),%r14
2136	mov	16(%rsi),%r13
2137	mov	24(%rsi),%r12
2138	mov	32(%rsi),%rbp
2139	mov	40(%rsi),%rbx
2140	lea	48(%rsi),%rsp
2141.Lepilogue_avx2:
2142	ret
2143.size	${func}_avx2,.-${func}_avx2
2144___
2145}}
2146}}}}}
2147
2148# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2149#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2150if ($win64) {
2151$rec="%rcx";
2152$frame="%rdx";
2153$context="%r8";
2154$disp="%r9";
2155
2156$code.=<<___;
2157.extern	__imp_RtlVirtualUnwind
2158.type	se_handler,\@abi-omnipotent
2159.align	16
2160se_handler:
2161	push	%rsi
2162	push	%rdi
2163	push	%rbx
2164	push	%rbp
2165	push	%r12
2166	push	%r13
2167	push	%r14
2168	push	%r15
2169	pushfq
2170	sub	\$64,%rsp
2171
2172	mov	120($context),%rax	# pull context->Rax
2173	mov	248($context),%rbx	# pull context->Rip
2174
2175	mov	8($disp),%rsi		# disp->ImageBase
2176	mov	56($disp),%r11		# disp->HandlerData
2177
2178	mov	0(%r11),%r10d		# HandlerData[0]
2179	lea	(%rsi,%r10),%r10	# prologue label
2180	cmp	%r10,%rbx		# context->Rip<prologue label
2181	jb	.Lin_prologue
2182
2183	mov	152($context),%rax	# pull context->Rsp
2184
2185	mov	4(%r11),%r10d		# HandlerData[1]
2186	lea	(%rsi,%r10),%r10	# epilogue label
2187	cmp	%r10,%rbx		# context->Rip>=epilogue label
2188	jae	.Lin_prologue
2189___
2190$code.=<<___ if ($avx>1);
2191	lea	.Lavx2_shortcut(%rip),%r10
2192	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
2193	jb	.Lnot_in_avx2
2194
2195	and	\$-256*$SZ,%rax
2196	add	\$`2*$SZ*($rounds-8)`,%rax
2197.Lnot_in_avx2:
2198___
2199$code.=<<___;
2200	mov	%rax,%rsi		# put aside Rsp
2201	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
2202	lea	48(%rax),%rax
2203
2204	mov	-8(%rax),%rbx
2205	mov	-16(%rax),%rbp
2206	mov	-24(%rax),%r12
2207	mov	-32(%rax),%r13
2208	mov	-40(%rax),%r14
2209	mov	-48(%rax),%r15
2210	mov	%rbx,144($context)	# restore context->Rbx
2211	mov	%rbp,160($context)	# restore context->Rbp
2212	mov	%r12,216($context)	# restore context->R12
2213	mov	%r13,224($context)	# restore context->R13
2214	mov	%r14,232($context)	# restore context->R14
2215	mov	%r15,240($context)	# restore context->R15
2216
2217	lea	.Lepilogue(%rip),%r10
2218	cmp	%r10,%rbx
2219	jb	.Lin_prologue		# non-AVX code
2220
2221	lea	16*$SZ+4*8(%rsi),%rsi	# xmm save area (xmm6 and up)
2222	lea	512($context),%rdi	# &context.Xmm6
2223	mov	\$`$SZ==4?8:12`,%ecx
2224	.long	0xa548f3fc		# cld; rep movsq
2225
2226.Lin_prologue:
2227	mov	8(%rax),%rdi
2228	mov	16(%rax),%rsi
2229	mov	%rax,152($context)	# restore context->Rsp
2230	mov	%rsi,168($context)	# restore context->Rsi
2231	mov	%rdi,176($context)	# restore context->Rdi
2232
2233	mov	40($disp),%rdi		# disp->ContextRecord
2234	mov	$context,%rsi		# context
2235	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
2236	.long	0xa548f3fc		# cld; rep movsq
2237
2238	mov	$disp,%rsi
2239	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2240	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2241	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2242	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2243	mov	40(%rsi),%r10		# disp->ContextRecord
2244	lea	56(%rsi),%r11		# &disp->HandlerData
2245	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2246	mov	%r10,32(%rsp)		# arg5
2247	mov	%r11,40(%rsp)		# arg6
2248	mov	%r12,48(%rsp)		# arg7
2249	mov	%rcx,56(%rsp)		# arg8, (NULL)
2250	call	*__imp_RtlVirtualUnwind(%rip)
2251
2252	mov	\$1,%eax		# ExceptionContinueSearch
2253	add	\$64,%rsp
2254	popfq
2255	pop	%r15
2256	pop	%r14
2257	pop	%r13
2258	pop	%r12
2259	pop	%rbp
2260	pop	%rbx
2261	pop	%rdi
2262	pop	%rsi
2263	ret
2264.size	se_handler,.-se_handler
2265___
2266$code.=<<___ if ($SZ == 4 && $shaext);
2267.type	shaext_handler,\@abi-omnipotent
2268.align	16
2269shaext_handler:
2270	push	%rsi
2271	push	%rdi
2272	push	%rbx
2273	push	%rbp
2274	push	%r12
2275	push	%r13
2276	push	%r14
2277	push	%r15
2278	pushfq
2279	sub	\$64,%rsp
2280
2281	mov	120($context),%rax	# pull context->Rax
2282	mov	248($context),%rbx	# pull context->Rip
2283
2284	lea	.Lprologue_shaext(%rip),%r10
2285	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
2286	jb	.Lin_prologue
2287
2288	lea	.Lepilogue_shaext(%rip),%r10
2289	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
2290	jae	.Lin_prologue
2291
2292	lea	-8-5*16(%rax),%rsi
2293	lea	512($context),%rdi	# &context.Xmm6
2294	mov	\$10,%ecx
2295	.long	0xa548f3fc		# cld; rep movsq
2296
2297	jmp	.Lin_prologue
2298.size	shaext_handler,.-shaext_handler
2299___
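# shaext_handler is simpler: between its prologue and epilogue labels
# only %xmm6-%xmm10 were spilled (5*16 bytes, addressed -8-5*16 relative
# to the frame value pulled from context->Rax), so the handler copies
# those 10 quadwords back into context->Xmm6.. and then joins the common
# .Lin_prologue tail.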
2300$code.=<<___;
2301.section	.pdata
2302.align	4
2303	.rva	.LSEH_begin_$func
2304	.rva	.LSEH_end_$func
2305	.rva	.LSEH_info_$func
2306___
2307$code.=<<___ if ($SZ==4 && $shaext);
2308	.rva	.LSEH_begin_${func}_shaext
2309	.rva	.LSEH_end_${func}_shaext
2310	.rva	.LSEH_info_${func}_shaext
2311___
2312$code.=<<___ if ($SZ==4);
2313	.rva	.LSEH_begin_${func}_ssse3
2314	.rva	.LSEH_end_${func}_ssse3
2315	.rva	.LSEH_info_${func}_ssse3
2316___
2317$code.=<<___ if ($avx && $SZ==8);
2318	.rva	.LSEH_begin_${func}_xop
2319	.rva	.LSEH_end_${func}_xop
2320	.rva	.LSEH_info_${func}_xop
2321___
2322$code.=<<___ if ($avx);
2323	.rva	.LSEH_begin_${func}_avx
2324	.rva	.LSEH_end_${func}_avx
2325	.rva	.LSEH_info_${func}_avx
2326___
2327$code.=<<___ if ($avx>1);
2328	.rva	.LSEH_begin_${func}_avx2
2329	.rva	.LSEH_end_${func}_avx2
2330	.rva	.LSEH_info_${func}_avx2
2331___
2332$code.=<<___;
2333.section	.xdata
2334.align	8
2335.LSEH_info_$func:
2336	.byte	9,0,0,0
2337	.rva	se_handler
2338	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2339___
2340$code.=<<___ if ($SZ==4 && $shaext);
2341.LSEH_info_${func}_shaext:
2342	.byte	9,0,0,0
2343	.rva	shaext_handler
2344___
2345$code.=<<___ if ($SZ==4);
2346.LSEH_info_${func}_ssse3:
2347	.byte	9,0,0,0
2348	.rva	se_handler
2349	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2350___
2351$code.=<<___ if ($avx && $SZ==8);
2352.LSEH_info_${func}_xop:
2353	.byte	9,0,0,0
2354	.rva	se_handler
2355	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
2356___
2357$code.=<<___ if ($avx);
2358.LSEH_info_${func}_avx:
2359	.byte	9,0,0,0
2360	.rva	se_handler
2361	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2362___
2363$code.=<<___ if ($avx>1);
2364.LSEH_info_${func}_avx2:
2365	.byte	9,0,0,0
2366	.rva	se_handler
2367	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2368___
2369}
2370
2371sub sha256op38 {
2372    my $instr = shift;
2373    my %opcodelet = (
2374		"sha256rnds2" => 0xcb,
2375		"sha256msg1"  => 0xcc,
2376		"sha256msg2"  => 0xcd	);
2377
2378    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2379	my @opcode=(0x0f,0x38);
2380	push @opcode,$opcodelet{$instr};
2381	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2382	return ".byte\t".join(',',@opcode);
2383    } else {
2384	return $instr."\t".$_[0];
2385    }
2386}
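# Worked example of the encoding above (derived from the formula, for
# reference): "sha256rnds2 %xmm4,%xmm1" matches $1=4, $2=1, giving
# ModR/M 0xc0|4|(1<<3)=0xcc, so it is emitted as
# ".byte 0x0f,0x38,0xcb,0xcc".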
2387
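# Final pass over the generated text: back-ticked expressions are
# constant-folded with eval at generation time, and sha256* mnemonics
# are rewritten through sha256op38() into raw .byte sequences so the
# output still assembles on toolchains that predate the SHA extensions.
# The result goes to STDOUT, normally redirected through the perlasm
# translator by the usual build set-up.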
2388foreach (split("\n",$code)) {
2389	s/\`([^\`]*)\`/eval $1/geo;
2390
2391	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2392
2393	print $_,"\n";
2394}
2395close STDOUT or die "error closing STDOUT: $!";
2396