#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Bulldozer	9.11		5.95/+53%
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;

# Command line is "[flavour] output". When the first argument contains a
# dot it is taken to be the output file name and no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
# output file ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator, first next to this script and
# then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the toolchain. $avx ends up 0 (no AVX code emitted), 1 (AVX) or
# 2 (AVX and AVX2), depending on the assembler version that is found.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# The major version is matched with [0-9]+ so that two-digit clang
# releases (10.0 and later) are recognised; a single [2-9] digit would
# fail to match them and leave AVX disabled.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=0;	### set to zero if compiling for 1.0.1
# Without SHA extension support the AVX2 path is not emitted either.
$avx=1		if (!$shaext && $avx);

# Everything printed to STDOUT from here on is piped through the
# translator. Fail loudly if the pipe cannot be started; $xlate is quoted
# so that a script path containing spaces still works.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Registers in which the three function arguments arrive.
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Scratch registers used by the round bodies.
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
# Three registers holding message-schedule words; the round bodies
# rotate this list after every round.
@xi=("%edx","%ebp","%r14d");
# The five SHA-1 working variables.
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

# Working variables as a list; the round loops rotate it so that each
# round body sees them as ($a,$b,$c,$d,$e).
@V=($A,$B,$C,$D,$E);

# BODY_00_19($i,$a,$b,$c,$d,$e)
#
# Appends the instructions for round $i (called for 0..19) to $code.
# $a..$e are the register names currently playing the roles of the five
# working variables. The round adds rol($a,5), (($c^$d)&$b)^$d, the
# constant 0x5a827999 and the current schedule word ($xi[0]) into $e,
# and rotates $b left by 30.
#
# Schedule handling:
#   $i==0  loads and byte-swaps the first input word into $xi[0];
#   $i<15  loads and byte-swaps input word $i+1 into $xi[1];
#   $i>=15 builds word $i+1 in $xi[1] by xoring three entries of the
#          16-word window kept at (%rsp) into it and rotating left by 1.
# In both cases $xi[0] is stored to slot $i%16 of that window. On return
# @xi has been rotated by one so the next round finds its word in $xi[0].
#
# The back-ticked expressions inside the heredocs are emitted verbatim
# here; they are not evaluated by this sub.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
# Advance the schedule-register window for the next round.
push(@xi,shift(@xi));
}

# BODY_20_39($i,$a,$b,$c,$d,$e)
#
# Appends the instructions for round $i to $code; the round loops call
# it for rounds 20..39 and again for 60..79. The round adds rol($a,5),
# $b^$c^$d, the constant $K and the current schedule word ($xi[0]) into
# $e, and rotates $b left by 30. $K is 0x6ed9eba1 for $i<40 and
# 0xca62c1d6 otherwise (it interpolates into the output as a decimal
# number).
#
# For $i<79 the next schedule word is built in $xi[1] from three entries
# of the 16-word window at (%rsp) and rotated left by 1; the store of
# $xi[0] back to the window is emitted as a back-ticked Perl expression
# guarded by "$i<72", to be resolved when the back-ticked parts of $code
# are evaluated. Round 79 emits the round function only.
# On return @xi has been rotated by one.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
# Advance the schedule-register window for the next round.
push(@xi,shift(@xi));
}

# BODY_40_59($i,$a,$b,$c,$d,$e)
#
# Appends the instructions for round $i (called for 40..59) to $code.
# The round adds rol($a,5), ($c&$d), (($c^$d)&$b), the constant
# 0x8f1bbcdc and the current schedule word ($xi[0]) into $e, and rotates
# $b left by 30. Two scratch registers are needed here: $t0 holds $c&$d
# and $t1 holds ($c^$d)&$b.
#
# In parallel the next schedule word is built in $xi[1] from three
# entries of the 16-word window at (%rsp) and rotated left by 1, and
# $xi[0] is stored to slot $i%16 of that window.
# On return @xi has been rotated by one.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
# Advance the schedule-register window for the next round.
push(@xi,shift(@xi));
}

# Emit the public entry point sha1_block_data_order.
#
# The entry code loads three words of OPENSSL_ia32cap_P and dispatches:
#   - SSSE3 bit clear             -> integer-only code at .Lialu (below);
#   - SHA bit set (if $shaext)    -> _shaext_shortcut;
#   - AVX2+BMI1+BMI2 (if $avx>1)  -> _avx2_shortcut;
#   - AVX and "Intel CPU" bits    -> _avx_shortcut (if $avx);
#   - otherwise                   -> _ssse3_shortcut.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
# Integer-only path: save the original %rsp in %rax, push the five
# callee-saved registers used below, carve out a 64-byte-aligned frame
# holding the 16-word schedule window, and keep the original %rsp just
# above that window so the epilogue can find the saved registers.
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# Unroll all 80 rounds. @V is rotated after every round so that each
# round body receives the working variables in ($a,$b,$c,$d,$e) order;
# $i deliberately carries over from one loop to the next.
for($i=0;$i<20;$i++)	{ BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Fold the block result into the context, step to the next 64-byte
# block, loop while $num is non-zero, then restore the callee-saved
# registers and the stack pointer from the saved %rsp copy.
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
332006779a02c515e5e6c8406c4aea26950b37676e0Adam Langleyif ($shaext) {{{
333cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley######################################################################
334cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley# Intel SHA Extensions implementation of SHA1 update function.
335cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley#
336cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langleymy ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
337cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langleymy ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
338cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langleymy @MSG=map("%xmm$_",(4..7));
339cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
340cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley$code.=<<___;
341cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley.type	sha1_block_data_order_shaext,\@function,3
342cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley.align	32
343cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langleysha1_block_data_order_shaext:
344cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley_shaext_shortcut:
345cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley___
346cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley$code.=<<___ if ($win64);
347cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	lea	`-8-4*16`(%rsp),%rsp
348cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movaps	%xmm6,-8-4*16(%rax)
349cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movaps	%xmm7,-8-3*16(%rax)
350cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movaps	%xmm8,-8-2*16(%rax)
351cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movaps	%xmm9,-8-1*16(%rax)
352cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley.Lprologue_shaext:
353cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley___
354cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley$code.=<<___;
355cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu	($ctx),$ABCD
356cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movd	16($ctx),$E
357cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap
358cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
359cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu	($inp),@MSG[0]
360cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
361cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu	0x10($inp),@MSG[1]
362cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufd	\$0b00011011,$E,$E		# flip word order
363cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu	0x20($inp),@MSG[2]
364cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufb	$BSWAP,@MSG[0]
365cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu	0x30($inp),@MSG[3]
366cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufb	$BSWAP,@MSG[1]
367cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufb	$BSWAP,@MSG[2]
368cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa	$E,$E_SAVE			# offload $E
369cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufb	$BSWAP,@MSG[3]
370cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	jmp	.Loop_shaext
371cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
372cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley.align	16
373cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley.Loop_shaext:
374cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	dec		$num
375cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	lea		0x40($inp),%rax		# next input block
376cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	paddd		@MSG[0],$E
377cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	cmovne		%rax,$inp
378cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
379cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley___
380cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langleyfor($i=0;$i<20-4;$i+=2) {
381cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley$code.=<<___;
382cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1msg1	@MSG[1],@MSG[0]
383cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa		$ABCD,$E_
384cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
385cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1nexte	@MSG[1],$E_
386cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pxor		@MSG[2],@MSG[0]
387cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1msg1	@MSG[2],@MSG[1]
388cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1msg2	@MSG[3],@MSG[0]
389cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
390cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa		$ABCD,$E
391cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
392cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1nexte	@MSG[2],$E
393cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pxor		@MSG[3],@MSG[1]
394cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1msg2	@MSG[0],@MSG[1]
395cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley___
396cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
397cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley}
398cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley$code.=<<___;
399cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu		($inp),@MSG[0]
400cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa		$ABCD,$E_
401cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1rnds4	\$3,$E,$ABCD		# 64-67
402cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1nexte	@MSG[1],$E_
403cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu		0x10($inp),@MSG[1]
404cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufb		$BSWAP,@MSG[0]
405cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
406cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa		$ABCD,$E
407cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1rnds4	\$3,$E_,$ABCD		# 68-71
408cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1nexte	@MSG[2],$E
409cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu		0x20($inp),@MSG[2]
410cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufb		$BSWAP,@MSG[1]
411cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
412cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa		$ABCD,$E_
413cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1rnds4	\$3,$E,$ABCD		# 72-75
414cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1nexte	@MSG[3],$E_
415cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu		0x30($inp),@MSG[3]
416cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufb		$BSWAP,@MSG[2]
417cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
418cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa		$ABCD,$E
419cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1rnds4	\$3,$E_,$ABCD		# 76-79
420cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	sha1nexte	$E_SAVE,$E
421cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufb		$BSWAP,@MSG[3]
422cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
423cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	paddd		$ABCD_SAVE,$ABCD
424cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqa		$E,$E_SAVE		# offload $E
425cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
426cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	jnz		.Loop_shaext
427cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
428cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufd	\$0b00011011,$ABCD,$ABCD
429cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	pshufd	\$0b00011011,$E,$E
430cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movdqu	$ABCD,($ctx)
431cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movd	$E,16($ctx)
432cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley___
433cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley$code.=<<___ if ($win64);
434cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movaps	-8-4*16(%rax),%xmm6
435cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movaps	-8-3*16(%rax),%xmm7
436cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movaps	-8-2*16(%rax),%xmm8
437cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	movaps	-8-1*16(%rax),%xmm9
438cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	mov	%rax,%rsp
439cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley.Lepilogue_shaext:
440cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley___
441cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley$code.=<<___;
442cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley	ret
443cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
444cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley___
445cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley}}}
446cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley{{{
# Register allocation and generator-time state for the SSSE3 code path.
# $Xi is the index of the message-schedule vector currently being produced;
# the Xupdate_* subs increment it and Xuplast_ssse3_80 resets it to 0.
44795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $Xi=4;
# @X: eight %xmm registers used as a ring (rotated by the Xupdate_* subs);
# indices such as @X[-4&7] address entries relative to the current one.
44895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @X=map("%xmm$_",(4..7,0..3));
# @Tx: three scratch %xmm registers, also rotated by the Xupdate_* subs.
44995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @Tx=map("%xmm$_",(8..10));
# NOTE(review): $Kx is not referenced in the visible part of this section — confirm use.
45095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $Kx="%xmm11";
# @V holds the five working variables as 32-bit registers (also assigned to $A..$E);
# the body_* snippets rotate @V after every round.
45195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
# @T: two scratch integer registers, swapped after every round by the body_* snippets.
45295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @T=("%esi","%edi");
# $j counts rounds emitted (bumped by the evaluated snippets); $rx counts body_* calls
# and decides when body_00_19/body_20_39 hand over to the next round function.
45395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $j=0;
45495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $rx=0;
# Register that the prologue loads with the address K_XX_XX+64.
45595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $K_XX_XX="%r11";
45695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Rotate emitters called indirectly from the round snippets ('&$_rol', '&$_ror').
# Xupdate_ssse3_32_79 recognises rotate snippets by matching these names.
# NOTE(review): presumably indirect so another code path can substitute different
# rotate instructions — confirm against the rest of the file.
45795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $_rol=sub { &rol(@_) };
45895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $_ror=sub { &ror(@_) };
45995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# align32: append to $code a jump to a freshly numbered label that is placed
# right after a ".align 32" directive, so execution skips the alignment padding.
# $sn is a counter private to this enclosing block; it is incremented on every
# call so each emitted label (.Lalign32_<n>) is unique.
46095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ my $sn;
46195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub align32() {
46295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  ++$sn;
46395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
46495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
46595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	32
46695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Lalign32_$sn:
46795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
46895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
46995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
47095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Emit the entry of sha1_block_data_order_ssse3: keep the incoming %rsp in %rax,
# push %rbx/%rbp/%r12/%r13/%r14 and reserve the stack frame (64 bytes, plus
# 6*16 bytes on Win64 for the %xmm save area).
47195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
47295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.type	sha1_block_data_order_ssse3,\@function,3
47395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	16
47495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysha1_block_data_order_ssse3:
47595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley_ssse3_shortcut:
47695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rsp,%rax
47795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%rbx
47895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%rbp
47995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%r12
48095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%r13		# redundant, done to share Win64 SE handler
48195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%r14
48295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
48395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Win64 only: save %xmm6-%xmm11 below the pushed registers (addressed off %rax,
# the original %rsp) and emit the .Lprologue_ssse3 label.
48495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___ if ($win64);
48595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	%xmm6,-40-6*16(%rax)
48695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	%xmm7,-40-5*16(%rax)
48795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	%xmm8,-40-4*16(%rax)
48895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	%xmm9,-40-3*16(%rax)
48995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	%xmm10,-40-2*16(%rax)
49095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	%xmm11,-40-1*16(%rax)
49195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Lprologue_ssse3:
49295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Common setup before the main loop:
#  - remember the original %rsp in %r14 and align %rsp down to 64 bytes;
#  - copy the three arguments into $ctx/$inp/$num and turn $num into the
#    end-of-input pointer ($inp + $num*64);
#  - point $K_XX_XX at K_XX_XX+64 and load the five state words from $ctx;
#  - precompute @T[0] = ($C ^ $D) & $B, the value body_00_19 expects on entry;
#  - load the byte-swap mask and K_00_19, load and byte-swap the first 64-byte
#    block, and store X[]+K for the first three vectors at 0/16/32(%rsp).
49395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
49495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rax,%r14	# original %rsp
49595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	\$-64,%rsp
49695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rdi,$ctx	# reassigned argument
49795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rsi,$inp	# reassigned argument
49895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rdx,$num	# reassigned argument
49995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
50095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	shl	\$6,$num
50195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	$inp,$num
50295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	K_XX_XX+64(%rip),$K_XX_XX
50395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
50495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	0($ctx),$A		# load context
50595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	4($ctx),$B
50695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	8($ctx),$C
50795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	12($ctx),$D
50895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$B,@T[0]		# magic seed
50995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	16($ctx),$E
51095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$C,@T[1]
51195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	xor	$D,@T[1]
51295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	@T[1],@T[0]
51395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
51495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
51595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
51695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
51795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movdqu	16($inp),@X[-3&7]
51895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movdqu	32($inp),@X[-2&7]
51995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movdqu	48($inp),@X[-1&7]
52095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	pshufb	@X[2],@X[-4&7]		# byte swap
52195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	pshufb	@X[2],@X[-3&7]
52295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	pshufb	@X[2],@X[-2&7]
5235c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	add	\$64,$inp
52495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	paddd	@Tx[1],@X[-4&7]		# add K_00_19
5255c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	pshufb	@X[2],@X[-1&7]
52695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	paddd	@Tx[1],@X[-3&7]
52795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	paddd	@Tx[1],@X[-2&7]
52895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
52995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	psubd	@Tx[1],@X[-4&7]		# restore X[]
53095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movdqa	@X[-3&7],16(%rsp)
53195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	psubd	@Tx[1],@X[-3&7]
53295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movdqa	@X[-2&7],32(%rsp)
53395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	psubd	@Tx[1],@X[-2&7]
53495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	jmp	.Loop_ssse3
53595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
53695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# AUTOLOAD: catch-all for calls to subs that are not defined, e.g. &pshufd(...).
# The called name (package qualifier stripped) is used as the instruction
# mnemonic and the call's arguments as its operands. Arguments are written
# destination-first at the call site; the emitted line lists them in reverse
# order (last argument first), and one tab-separated line is appended to $code.
53795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
53895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
# Only the last argument is examined: if it is purely numeric it is treated
# as an immediate and gets a '$' prefix.
53995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my $arg = pop;
54095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley    $arg = "\$$arg" if ($arg*1 eq $arg);
54195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
54295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
54395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xupdate_ssse3_16_31($body): emit four rounds of scalar code interleaved with
# the SIMD instructions that compute the next message-schedule vector in @X[0]
# and store the previous vector plus K on the stack.
#   $body - code ref returning a list of Perl snippet strings for one round
#           (body_00_19 and friends); it is called four times up front.
# Each eval(shift(@insns)) emits one scalar snippet; the exact position of
# these evals between the SIMD emitters is the instruction schedule, so the
# statement order here must not be changed.
# On exit $Xi is incremented and @X and @Tx are rotated by one position.
54495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
54595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ use integer;
54695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my $body = shift;
54795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
# Lexicals that the evaluated snippets assign to ('($a,$b,$c,$d,$e)=@V;').
54895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my ($a,$b,$c,$d,$e);
54995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
5505c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# ror
5515c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
55295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
55395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqa	(@Tx[0],@X[-1&7]);
5545c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	  &paddd	(@Tx[1],@X[-1&7]);
55595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
55695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
55795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
5585c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
55995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
5605c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# rol
56195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
56295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
56395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
56495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
5655c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley
56695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
56795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
5685c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# ror
56995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
57095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
57195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
57295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
57395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
57495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
57595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
5765c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# rol
# Store the previous vector + K (accumulated in @Tx[1]) into stack slot (($Xi-1)&3).
57795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
57895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
57995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
58095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
58195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqa	(@Tx[2],@X[0]);
58295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
58395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
5845c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# ror
5855c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	&movdqa	(@Tx[0],@X[0]);
58695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
58795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
58895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
# Rotate-left-by-1 is built from paddd (x+x, i.e. <<1), psrld 31 and por.
58995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&paddd	(@X[0],@X[0]);
59095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
59195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
59295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
59395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&psrld	(@Tx[0],31);
59495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
5955c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# rol
59695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
59795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqa	(@Tx[1],@Tx[2]);
59895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
59995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
60095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
60195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&psrld	(@Tx[2],30);
60295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
6035c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# ror
6045c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
60595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
60695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
60795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
60895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
60995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pslld	(@Tx[1],2);
61095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pxor	(@X[0],@Tx[2]);
61195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
# Load the round constant for this vector: table offset 2*16*($Xi/5)-64
# (integer division, "use integer" is in effect).
61295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
6135c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# rol
61495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
61595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
61695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
61795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
# Only on the last use of this sub ($Xi==7): pre-load the value that
# Xupdate_ssse3_32_79 expects to find in its @Tx[0] after the rotation below.
6185c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
61995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
62095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 foreach (@insns) { eval; }	# remaining instructions [if any]
62195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
62295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
62395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley		push(@Tx,shift(@Tx));
62495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
62595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xupdate_ssse3_32_79($body): same role as Xupdate_ssse3_16_31, but using the
# schedule form that XORs "X[-32]", "X[-16]", "X[-28]" and "X[-6]" and rotates
# left by 2 (see the inline comments).
#   $body - code ref returning the snippet strings for one round; called four
#           times. The round functions return different snippet counts
#           (32 to 44 in total), so some evals below are conditional.
# The position of every eval between the SIMD emitters is the instruction
# schedule; do not reorder. On exit $Xi is incremented and @X/@Tx are rotated.
62695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Xupdate_ssse3_32_79()
62795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ use integer;
62895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my $body = shift;
62995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
# Lexicals that the evaluated snippets assign to ('($a,$b,$c,$d,$e)=@V;').
63095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my ($a,$b,$c,$d,$e);
63195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
6325c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns))		if ($Xi==8);
63395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
6345c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns))		if ($Xi==8);
6355c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# body_20_39
63695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
# Peek at the pending snippet text: emit up to two more snippets here when a
# '$_ror' call is next-but-one and/or next in the queue.
6375c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
6385c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
6395c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
64095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
64195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));		# rol
64295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
64395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
64495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
6455c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));
# Round constant for the next vector: copy the current one, or load a new one
# from the table whenever $Xi is a multiple of 5.
64695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	if ($Xi%5) {
64795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
64895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	} else {			# ... or load next one
64995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
65095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	}
65195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));		# ror
6525c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	  &paddd	(@Tx[1],@X[-1&7]);
65395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
65495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
65595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
65695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));		# body_20_39
65795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
65895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
65995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));		# rol
6605c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
66195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
66295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqa	(@Tx[0],@X[0]);
66395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
66495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
# Store the previous vector + K (accumulated in @Tx[1]) into stack slot (($Xi-1)&3).
6655c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
66695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));		# ror
66795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
6685c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# body_20_39
66995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Rotate-left-by-2 is built from pslld 2, psrld 30 and por.
67095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pslld	(@X[0],2);
67195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
67295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
6735c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	&psrld	(@Tx[0],30);
6745c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
67595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
67695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
67795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));		# ror
67895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
67995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
68095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
6815c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));		# body_20_39
6825c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
6835c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
# Pre-load the next call's @Tx[0] (after rotation); skipped once $Xi reaches 19.
6845c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
68595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
68695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));		# rol
68795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
68895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
68995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));		# rol
69095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
69195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
69295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 foreach (@insns) { eval; }	# remaining instructions
69395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
69495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
69595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley		push(@Tx,shift(@Tx));
69695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
69795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xuplast_ssse3_80($body): emit four scalar rounds together with the final
# X[]+K store of the current block, then the end-of-input test and the loads
# that start the next block.
#   $body - code ref returning the snippet strings for one round; called four times.
# Emitted after the rounds: compare $inp with $num and branch to .Ldone_ssse3
# when equal; otherwise reload the byte-swap mask and K_00_19, load the next
# 64 input bytes, byte-swap the first vector and advance $inp by 64.
# Generator state on exit: @Tx rotated back by one position, $Xi reset to 0.
69895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Xuplast_ssse3_80()
69995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ use integer;
70095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my $body = shift;
70195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
# Lexicals that the evaluated snippets assign to ('($a,$b,$c,$d,$e)=@V;').
70295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my ($a,$b,$c,$d,$e);
70395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
70495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
70595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
70695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
70795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
7085c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	  &paddd	(@Tx[1],@X[-1&7]);
7095c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));
71095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
71195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
71295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
71395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
71495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 foreach (@insns) { eval; }		# remaining instructions
71595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# $num holds the end-of-input pointer (set up in the prologue as $inp + $num*64).
71695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&cmp	($inp,$num);
71795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&je	(".Ldone_ssse3");
71895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Undo one step of the @Tx rotation applied by the Xupdate_* subs.
71995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	unshift(@Tx,pop(@Tx));
72095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
72195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
72295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
72395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqu	(@X[-4&7],"0($inp)");		# load input
72495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqu	(@X[-3&7],"16($inp)");
72595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqu	(@X[-2&7],"32($inp)");
72695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqu	(@X[-1&7],"48($inp)");
72795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pshufb	(@X[-4&7],@X[2]);		# byte swap
72895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&add	($inp,64);
72995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
73095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  $Xi=0;
73195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
73295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xloop_ssse3($body): emit four scalar rounds interleaved with preparation of
# freshly loaded input: byte-swap vector ($Xi-3), add K (in @Tx[1]) to vector
# ($Xi-4), store that sum to stack slot $Xi for the scalar rounds, then
# subtract K again so the register holds the plain message words.
#   $body - code ref returning the snippet strings for one round; called four times.
# Increments $Xi on exit; @X and @Tx are not rotated here.
73395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Xloop_ssse3()
73495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ use integer;
73595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my $body = shift;
73695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
# Lexicals that the evaluated snippets assign to ('($a,$b,$c,$d,$e)=@V;').
73795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my ($a,$b,$c,$d,$e);
73895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
73995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
74095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
7415c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));
# @X[2] holds the byte-swap mask loaded by Xuplast_ssse3_80.
74295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&pshufb	(@X[($Xi-3)&7],@X[2]);
74395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
74495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
74595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
7465c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));
74795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&paddd	(@X[($Xi-4)&7],@Tx[1]);
74895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
74995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
75095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
75195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
75295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
75395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
75495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
7555c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));
7565c6ca976c8f57d389d10dddc9ab5898ff5f03196Adam Langley	 eval(shift(@insns));
75795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&psubd	(@X[($Xi-4)&7],@Tx[1]);
75895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Emit whatever scalar snippets are left.
75995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	foreach (@insns) { eval; }
76095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  $Xi++;
76195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
76295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xtail_ssse3($body): emit four scalar rounds back to back, with no SIMD
# instructions interleaved.
#   $body - code ref returning the snippet strings for one round; called four times.
# Does not touch $Xi, @X or @Tx itself (the evaluated snippets still advance
# $j and rotate @V/@T).
76395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Xtail_ssse3()
76495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ use integer;
76595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my $body = shift;
76695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
# Lexicals that the evaluated snippets assign to ('($a,$b,$c,$d,$e)=@V;').
76795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my ($a,$b,$c,$d,$e);
76895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
76995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	foreach (@insns) { eval; }
77095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
77195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# body_00_19: return the list of ten Perl snippet strings that, when eval'ed in
# order by an X* sub, emit one scalar round using F = ((c^d)&b)^d.
# The strings are single-quoted, so $a..$e, @T, $j etc. are resolved only at
# eval time, in the caller's scope. The first snippet binds $a..$e to the
# current @V; the last one increments $j and rotates @V and @T for the next round.
# $rx counts calls: once it has reached 19 the call is forwarded to body_20_39
# (which has a dedicated $j==19 case).
77295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub body_00_19 () {	# ((c^d)&b)^d
77395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	# on start @T[0]=(c^d)&b
77495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	return &body_20_39() if ($rx==19); $rx++;
77595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	(
77695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'($a,$b,$c,$d,$e)=@V;'.
	# rotate count is 2 on the very first round ($j==0) and 7 afterwards
77795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&$_ror	($b,$j?7:2)',	# $b>>>2
77895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	(@T[0],$d)',
77995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&mov	(@T[1],$a)',	# $b for next round
78095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# X[]+K for round $j was stored on the stack by the SIMD code
78195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
78295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($b,$c)',	# $c^$d for next round
78395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
78495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&$_rol	($a,5)',
78595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,@T[0])',
78695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round
78795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
78895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($b,$c)',	# restore $b
78995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
79095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	);
79195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
79295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# body_20_39: return the list of eight Perl snippet strings that, when eval'ed
# in order by an X* sub, emit one scalar round using F = b^d^c.
# The strings are single-quoted and resolved at eval time in the caller's
# scope; conditions such as "if($j==19)" are therefore tested when the snippet
# is evaluated, not when this sub is called. The first snippet binds $a..$e to
# the current @V; the last one increments $j and rotates @V and @T.
# $rx counts calls: once it has reached 39 the call is forwarded to body_40_59.
79395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub body_20_39 () {	# b^d^c
79495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	# on entry @T[0]=b^d
79595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	return &body_40_59() if ($rx==39); $rx++;
79695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	(
79795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'($a,$b,$c,$d,$e)=@V;'.
79895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	# round 19 arrives here from body_00_19 and XORs with $d instead of $c
79995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	(@T[0],$d)	if($j==19);'.
80095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
80195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&mov	(@T[1],$a)',	# $b for next round
80295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
80395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&$_rol	($a,5)',
80495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,@T[0])',
	# no "next round" to prepare for on the final round ($j==79)
80595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
80695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
80795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&$_ror	($b,7)',	# $b>>>2
80895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
80995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	);
81095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
81195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# body_40_59: return the list of eleven Perl snippet strings that, when eval'ed
# in order by an X* sub, emit one scalar round using F = ((b^c)&(c^d))^c.
# The strings are single-quoted and resolved at eval time in the caller's
# scope, so the "$j" conditions are tested when each snippet is evaluated.
# The first snippet binds $a..$e to the current @V; the last one increments $j
# and rotates @V and @T. Unlike the other body_* subs this one only increments
# $rx and never forwards to another round function.
81295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub body_40_59 () {	# ((b^c)&(c^d))^c
81395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	# on entry @T[0]=(b^c), (c^=d)
81495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	$rx++;
81595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	(
81695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'($a,$b,$c,$d,$e)=@V;'.
81795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
81895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
81995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($c,$d)		if ($j>=40)',	# restore $c
82095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
82195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&$_ror	($b,7)',	# $b>>>2
82295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&mov	(@T[1],$a)',	# $b for next round
82395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	(@T[0],$c)',
82495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
82595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&$_rol	($a,5)',
82695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,@T[0])',
	# round 59 prepares @T[1] differently from the earlier rounds of this group
82795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	(@T[1],$c)	if ($j==59);'.
82895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
82995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
83095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
83195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
83295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	);
83395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
$code.=<<___;
.align	16
.Loop_ssse3:
___
	# Body of the SSSE3 main loop.  The sequence of body generators handed
	# to the Xupdate_* routines is the same one the AVX loop uses.
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				# Snapshot the round counter and register rotation:
				# the "done" path below regenerates the same
				# closing rounds from this point.
				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				# Rewind to the snapshot taken after Xuplast so the
				# closing rounds are generated a second time.
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
# The xmm6-xmm11 reloads are only generated for Win64 builds.
$code.=<<___ if ($win64);
	movaps	-40-6*16(%r14),%xmm6
	movaps	-40-5*16(%r14),%xmm7
	movaps	-40-4*16(%r14),%xmm8
	movaps	-40-3*16(%r14),%xmm9
	movaps	-40-2*16(%r14),%xmm10
	movaps	-40-1*16(%r14),%xmm11
___
$code.=<<___;
	lea	(%r14),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
92095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
92195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyif ($avx) {
# Rewind the generator state before the AVX code path is produced.
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

# Label the last-block branch in Xuplast_avx_80 jumps to.
my $done_avx_label=".Ldone_avx";

# Rotate helpers referenced by the body_* snippets as &$_rol / &$_ror.
# They forward to shld/shrd with the first operand passed twice, followed
# by the remaining arguments.
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
# The xmm6-xmm11 spills are only generated for Win64 builds.
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16(%rax)
	vmovaps	%xmm7,-40-5*16(%rax)
	vmovaps	%xmm8,-40-4*16(%rax)
	vmovaps	%xmm9,-40-3*16(%rax)
	vmovaps	%xmm10,-40-2*16(%rax)
	vmovaps	%xmm11,-40-1*16(%rax)
.Lprologue_avx:
___
$code.=<<___;
	mov	%rax,%r14	# original %rsp
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___
99695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xupdate_avx_16_31 -- one AVX message-schedule step interleaved with the
# scalar rounds supplied by a body generator.
#
# $_[0] is a code reference to a body_* generator.  It is called four
# times up front; the returned snippets are then eval()ed in small groups
# between the vector instructions below, and whatever is left is flushed
# at the end.  The vector code composes the new value in @X[0], stores
# @X[-1&7]+$Kx at 16*(($Xi-1)&3)(%rsp) for the scalar rounds, and reloads
# $Kx from the K_XX_XX table when $Xi is a multiple of 5.  Finally $Xi is
# incremented and @X is rotated by one.
#
# "use integer" makes the ($Xi)/5 table index a truncating division.
# ($a,$b,$c,$d,$e) are declared here because the eval()ed snippets assign
# and read them.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
107195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xupdate_avx_32_79 -- AVX message-schedule step used from the fifth
# Xupdate call onwards, interleaved with the scalar rounds supplied by a
# body generator.
#
# $_[0] is a code reference to a body_* generator.  It is called four
# times up front and the returned snippets are eval()ed between the
# vector instructions; leftovers are flushed at the end.  The vector code
# xors several earlier @X entries into @X[0] and rotates the result left
# by 2 (vpsrld 30 / vpslld 2 / vpor), stores @X[-1&7]+$Kx at
# 16*(($Xi-1)&3)(%rsp), and reloads $Kx from the K_XX_XX table when $Xi
# is a multiple of 5.  Finally $Xi is incremented and @X is rotated.
#
# The rol/ror/body_20_39 markers on the eval lines describe the schedule
# for an eight-snippet body such as body_20_39.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 # NOTE(review): body_20_39 and body_40_59 spell their rotates as
	 # '&$_rol' / '&$_ror', which /&ro[rl]/ cannot match, so with those
	 # bodies this guard is always true -- confirm that is intended.
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
113095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xuplast_avx_80 -- final AVX step of a block: transfer the last X[]+K
# vector, then either leave the loop or start loading the next block.
#
# $_[0] is a code reference to a body_* generator; it is called four times
# and all returned snippets are eval()ed here.  After them the generated
# code compares $inp with $num (the prologue sets $num to the end of the
# input) and jumps to $done_avx_label when they are equal.  Otherwise it
# reloads the byte-swap mask and K_00_19, loads the next 64 input bytes,
# byte-swaps the first vector and advances $inp by 64.  $Xi is reset to 0
# for the Xloop_avx calls that follow.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
116295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xloop_avx -- finish preparing one vector of the next input block while
# the closing scalar rounds of the current block run.
#
# $_[0] is a code reference to a body_* generator; it is called four times
# and its snippets are eval()ed around the three vector instructions:
# byte-swap @X[($Xi-3)&7], add $Kx to @X[($Xi-4)&7] into @X[$Xi&7], and
# store that sum at 16*$Xi(%rsp).  $Xi is incremented on exit.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}
118695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Xtail_avx -- scalar rounds only, no vector work.
#
# $_[0] is a code reference to a body_* generator.  It is called four
# times and every returned snippet is eval()ed in order.
# ($a,$b,$c,$d,$e) are declared here because the snippets assign and read
# them.
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
119595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
119695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
119795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	16
119895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Loop_avx:
119995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Emit the unrolled body that follows the .Loop_avx label.  Every helper
# below is handed a reference to the scalar round generator it should use;
# the helpers and the body_* generators are defined outside this section.
120095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_16_31(\&body_00_19);
120195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_16_31(\&body_00_19);
120295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_16_31(\&body_00_19);
120395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_16_31(\&body_00_19);
120495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_00_19);
120595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_20_39);
120695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_20_39);
120795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_20_39);
120895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_20_39);
120995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_20_39);
121095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_40_59);
121195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_40_59);
121295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_40_59);
121395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_40_59);
121495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_40_59);
121595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx_32_79(\&body_20_39);
121695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
121795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# Snapshot the generator state.  The Xloop_avx calls below advance $j
	# and @V; the same state is restored after $done_avx_label so that the
	# Xtail_avx code is generated from this exact point as well.
121895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley				$saved_j=$j; @saved_V=@V;
121995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
122095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx(\&body_20_39);
122195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx(\&body_20_39);
122295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx(\&body_20_39);
122395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
122495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
122595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	0($ctx),$A			# update context
122695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	4($ctx),@T[0]
122795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	8($ctx),$C
122895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	12($ctx),$D
122995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$A,0($ctx)
123095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	16($ctx),$E
123195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@T[0],4($ctx)
123295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@T[0],$B			# magic seed
123395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$C,8($ctx)
123495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$C,@T[1]
123595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$D,12($ctx)
123695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	xor	$D,@T[1]
123795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$E,16($ctx)
123895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	@T[1],@T[0]
123995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	jmp	.Loop_avx
124095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
124195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	16
124295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$done_avx_label:
124395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
				# Rewind to the state saved before the Xloop_avx calls: the
				# code after $done_avx_label is an alternative ending, so it
				# must be generated from the same $j and @V.
124495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley				$j=$saved_j; @V=@saved_V;
124595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
124695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xtail_avx(\&body_20_39);
124795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xtail_avx(\&body_20_39);
124895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xtail_avx(\&body_20_39);
124995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
125095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
125195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzeroupper
125295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
125395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	0($ctx),$A			# update context
125495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	4($ctx),@T[0]
125595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	8($ctx),$C
125695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$A,0($ctx)
125795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	12($ctx),$D
125895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@T[0],4($ctx)
125995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	16($ctx),$E
126095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$C,8($ctx)
126195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$D,12($ctx)
126295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	$E,16($ctx)
126395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
126495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___ if ($win64);
126595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-6*16(%r14),%xmm6
126695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-5*16(%r14),%xmm7
126795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-4*16(%r14),%xmm8
126895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-3*16(%r14),%xmm9
126995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-2*16(%r14),%xmm10
127095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-1*16(%r14),%xmm11
127195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
127295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
127395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	(%r14),%rsi
127495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-40(%rsi),%r14
127595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-32(%rsi),%r13
127695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-24(%rsi),%r12
127795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-16(%rsi),%rbp
127895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-8(%rsi),%rbx
127995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	(%rsi),%rsp
128095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Lepilogue_avx:
128195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ret
128295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
128395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
128495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# AVX2 code path: everything inside this block is generated only when $avx>1.
128595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyif ($avx>1) {
# Integer arithmetic is in force for the rest of this lexical block, so the
# divisions in the offset expressions below ($j/4, $Xi/5) truncate.
128695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyuse integer;
128795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Xi=4;					# reset variables
# Eight %ymm names for the message schedule.  The order makes
# @X[-4&7]..@X[-1&7] name %ymm0..%ymm3, the registers the prologue loads
# the input block into.
128895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley@X=map("%ymm$_",(4..7,0..3));
# Vector temporaries used by the schedule update sequences.
128995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley@Tx=map("%ymm$_",(8..10));
# Register holding the current round constant loaded from the K_XX_XX table.
129095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Kx="%ymm11";
# Round counter used by the bodyx_* generators to compute X[i]+K offsets.
129195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$j=0;
129295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Six 32-bit registers playing the roles (a,f,b,c,d,e); every generated round
# rotates this list by one with unshift(@ROTX,pop(@ROTX)).
129395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
# $a5 receives a rotated by 27 ("a<<<5"); $t0 is a scratch register.
129495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($a5,$t0)=("%r12d","%edi");
129595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Names for the initial (un-rotated) role assignment, used by the
# hand-written assembly templates.
129695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy ($A,$F,$B,$C,$D,$E)=@ROTX;
# Number of round bodies handed out so far; bodyx_00_19 and bodyx_20_39
# test it to decide when to delegate to the next generator.
129795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $rx=0;
# Pointer register: first the address of the next input block, later the
# base for reading X[]+K values from the stack.
129895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleymy $frame="%r13";
129995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
130095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
130195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.type	sha1_block_data_order_avx2,\@function,3
130295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	16
130395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysha1_block_data_order_avx2:
130495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley_avx2_shortcut:
130595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rsp,%rax
130695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%rbx
130795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%rbp
130895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%r12
130995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%r13
131095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push	%r14
131195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzeroupper
131295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
131395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___ if ($win64);
131495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	-6*16(%rsp),%rsp
131595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovaps	%xmm6,-40-6*16(%rax)
131695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovaps	%xmm7,-40-5*16(%rax)
131795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovaps	%xmm8,-40-4*16(%rax)
131895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovaps	%xmm9,-40-3*16(%rax)
131995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovaps	%xmm10,-40-2*16(%rax)
132095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovaps	%xmm11,-40-1*16(%rax)
132195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Lprologue_avx2:
132295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
132395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
132495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rax,%r14		# original %rsp
132595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rdi,$ctx		# reassigned argument
132695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rsi,$inp		# reassigned argument
132795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	%rdx,$num		# reassigned argument
132895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
132995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	-640(%rsp),%rsp
133095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	shl	\$6,$num
133195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 lea	64($inp),$frame
133295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	\$-128,%rsp
133395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	$inp,$num
133495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	K_XX_XX+64(%rip),$K_XX_XX
133595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
133695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	0($ctx),$A		# load context
133795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 cmp	$num,$frame
133895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 cmovae	$inp,$frame		# next or same block
133995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	4($ctx),$F
134095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	8($ctx),$C
134195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	12($ctx),$D
134295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	16($ctx),$E
134395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
134495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
134595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu		($inp),%xmm0
134695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu		16($inp),%xmm1
134795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu		32($inp),%xmm2
134895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu		48($inp),%xmm3
134995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea		64($inp),$inp
135095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
135195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
135295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vpshufb		@X[2],@X[-4&7],@X[-4&7]
135395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
135495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vpshufb		@X[2],@X[-3&7],@X[-3&7]
135595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
135695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vpshufb		@X[2],@X[-2&7],@X[-2&7]
135795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
135895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vpshufb		@X[2],@X[-1&7],@X[-1&7]
135995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
136095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
136195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vpaddd	$Kx,@X[-3&7],@X[1]
136295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
136395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vpaddd	$Kx,@X[-2&7],@X[2]
136495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu	@X[1],32(%rsp)
136595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vpaddd	$Kx,@X[-1&7],@X[3]
136695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu	@X[2],64(%rsp)
136795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu	@X[3],96(%rsp)
136895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Prologue message schedule: for $Xi = 4..7 emit the same vector sequence as
# Xupdate_avx2_16_31 below, but with no scalar round instructions interleaved.
# Each pass stores X[]+K at 32*$Xi(%rsp) and rotates @X by one.
136995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyfor (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
137095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley    use integer;
137195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
137295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
137395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
137495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
137595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
137695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
137795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpsrld	(@Tx[0],@X[0],31);
	# Reload the round constant only when $Xi is a multiple of 5; the table
	# offset advances by 32 bytes for every five values of $Xi.
137895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
137995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
138095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpaddd	(@X[0],@X[0],@X[0]);
138195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpsrld	(@Tx[1],@Tx[2],30);
138295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
138395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpslld	(@Tx[2],@Tx[2],2);
138495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@Tx[1]);
138595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
138695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpaddd	(@Tx[1],@X[0],$Kx);
	# The displacement is emitted as the literal text "32*N", i.e. the
	# multiplication is left to the assembler.
138795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
138895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
138995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push(@X,shift(@X));	# "rotate" X[]
139095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
139195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
139295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	128(%rsp),$frame
139395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	jmp	.Loop_avx2
139495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	32
139595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Loop_avx2:
139695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	rorx	\$2,$F,$B
139795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	andn	$D,$F,$t0
139895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	$C,$F
139995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	xor	$t0,$F
140095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
# Scalar round generator for the (b&c)^(~b&d) form of f.
#
# Returns a list of strings; the caller eval()s them one at a time to emit
# instructions, which lets it place vector instructions between them.  The
# emitted round adds X[i]+K, f and a<<<5 into e, and prepares f and the
# rotated b for the following round.
#
# Once $rx has reached 19 the call is forwarded to bodyx_20_39 instead, so
# that the round it emits prepares f in the b^c^d form.
140195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
140295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	# at start $f=(b&c)^(~b&d), $b>>>=2
140395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	return &bodyx_20_39() if ($rx==19); $rx++;
140495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	(
	# Bind the role names to the current register rotation.  This and the
	# next two statements form one string, so they are evaluated together.
140595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
140695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# X[i]+K for round $j sits 32*($j/4)+4*($j%4) bytes into the transfer
	# area, wrapped at 256 and biased by -128 relative to $frame; $frame is
	# advanced by 256 after every 32nd round.
140795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
140895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
140995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&andn	($t0,$a,$c)',			# ~b&d for next round
141095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
141195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
141295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&rorx	($a5,$a,27)',			# a<<<5
141395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&rorx	($f,$a,2)',			# b>>>2 for next round
141495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&and	($a,$b)',			# b&c for next round
141595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
141695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,$a5)',			# e+=a<<<5
141795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round
141895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# Rotate the register roles by one position and count the round.
141995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'unshift(@ROTX,pop(@ROTX)); $j++;'
142095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	)
142195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
142295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Scalar round generator for the b^c^d form of f.
#
# Returns a list of strings for the caller to eval() one at a time.  The
# emitted round adds X[i]+K, f and a<<<5 into e and, unless $j has already
# reached 79, prepares f=b^c^d and the rotated b for the following round.
#
# Once $rx has reached 39 the call is forwarded to bodyx_40_59 instead.
142395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
142495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	# on entry $f=b^c^d, $b>>>=2
142595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	return &bodyx_40_59() if ($rx==39); $rx++;
142695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	(
	# Bind the role names to the current register rotation.
142795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
142895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# Same X[i]+K addressing as in bodyx_00_19: offset wrapped at 256 and
	# biased by -128, with $frame advanced by 256 after every 32nd round.
142995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
143095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
143195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
143295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&lea	($e,"($e,$f)")',		# e+=b^c^d
143395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&rorx	($a5,$a,27)',			# a<<<5
	# The ($j<79) guards drop the next-round preparation from the final
	# round, where no following round will consume it.
143495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
143595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
143695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
143795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,$a5)',			# e+=a<<<5
143895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
143995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# Rotate the register roles by one position and count the round.
144095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'unshift(@ROTX,pop(@ROTX)); $j++;'
144195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	)
144295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
144395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Scalar round generator for the (b^c)&(c^d)^c form of f.
#
# Returns a list of strings for the caller to eval() one at a time.  Unlike
# the other two generators it never delegates; it only counts the round in
# $rx.  The $j-dependent guards handle both ends of its range: the first use
# (reached at $j==39 by delegation from bodyx_20_39) skips the f^=c fix-up,
# and the round with $j==59 prepares f in the b^c^d form for what follows.
144495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
144595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	# on entry $f=((b^c)&(c^d)), $b>>>=2
144695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	$rx++;
144795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	(
	# Bind the role names to the current register rotation.
144895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
144995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# Same X[i]+K addressing as in the other generators.
145095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
145195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
145295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
145395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
145495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
145595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
145695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
145795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&rorx	($a5,$a,27)',			# a<<<5
145895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&rorx	($f,$a,2)',			# b>>>2 in next round
145995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($a,$b)',			# b^c for next round
146095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
146195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&add	($e,$a5)',			# e+=a<<<5
	# Exactly one of the next two is emitted, depending on whether the
	# following round still uses this form of f.
146295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
146395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
146495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# Rotate the register roles by one position and count the round.
146595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	'unshift(@ROTX,pop(@ROTX)); $j++;'
146695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	)
146795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
146895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Emit one message-schedule update of the 16..31 kind for @X[0], interleaved
# with the scalar instructions of five rounds.
#
# Argument: a reference to a round generator (bodyx_*), called five times to
# collect the instruction strings.  Side effects: appends to the generated
# code, stores X[]+K at 32*$Xi(%rsp), increments $Xi and rotates @X; the
# evaluated round strings also advance $j and rotate @ROTX.
146995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
147095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ use integer;
147195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my $body = shift;
147295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  # These lexicals are what the eval'ed strings assign to and read from; they
  # must outlive a single eval because only the first string of each round
  # performs the ($a,$f,$b,$c,$d,$e)=@ROTX binding.
147395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my ($a,$b,$c,$d,$e);
147495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
147595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
147695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
147795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
147895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
147995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
148095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
148195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
148295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
148395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
148495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
148595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
148695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
148795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
148895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
148995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
149095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
149195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
149295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
149395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
149495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
149595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
149695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
149795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpsrld	(@Tx[0],@X[0],31);
	# Reload the round constant only when $Xi is a multiple of 5.
149895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
149995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
150095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
150195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
150295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
150395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
150495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpaddd	(@X[0],@X[0],@X[0]);
150595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
150695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
150795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
150895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpsrld	(@Tx[1],@Tx[2],30);
150995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
151095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
151195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
151295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
151395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpslld	(@Tx[2],@Tx[2],2);
151495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@Tx[1]);
151595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
151695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
151795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
151895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
151995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
152095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
152195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
152295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
152395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpaddd	(@Tx[1],@X[0],$Kx);
152495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
152595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
152695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
152795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
152895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
152995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 foreach (@insns) { eval; }	# remaining instructions [if any]
153095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
153195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	$Xi++;
153295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push(@X,shift(@X));	# "rotate" X[]
153395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
153495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Emit one message-schedule update of the 32..79 kind for @X[0], interleaved
# with the scalar instructions of five rounds.
#
# Argument: a reference to a round generator (bodyx_*), called five times to
# collect the instruction strings.  Side effects: appends to the generated
# code, stores X[]+K at 32*$Xi(%rsp), increments $Xi and rotates @X; the
# evaluated round strings also advance $j and rotate @ROTX.
153595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Xupdate_avx2_32_79()
153695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ use integer;
153795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my $body = shift;
153895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  # Lexicals shared by all eval'ed strings of this call; only the first
  # string of each round rebinds them from @ROTX.
153995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my ($a,$b,$c,$d,$e);
154095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
154195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
154295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
154395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
154495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
154595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
154695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	# Reload the round constant only when $Xi is a multiple of 5.
154795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
154895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
154995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
155095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
155195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
155295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
155395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
155495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
155595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
155695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# Rotate left by 2 as shift-right-30, shift-left-2, then OR below.
155795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpsrld	(@Tx[0],@X[0],30);
155895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpslld	(@X[0],@X[0],2);
155995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
156095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
156195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
156295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
156395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	#&vpslld	(@X[0],@X[0],2);
156495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
156595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
156695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
156795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
156895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
156995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
157095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
157195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
157295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
157395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
157495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vpaddd	(@Tx[1],@X[0],$Kx);
157595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
157695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
157795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
157895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 eval(shift(@insns));
157995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# The displacement is emitted as the literal text "32*N", i.e. the
	# multiplication is left to the assembler.
158095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
158195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
158295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 foreach (@insns) { eval; }	# remaining instructions
158395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
158495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	$Xi++;
158595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	push(@X,shift(@X));	# "rotate" X[]
158695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
158795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Emit five scalar rounds with no message-schedule work interleaved.
#
# Argument: a reference to a round generator (bodyx_*), called five times;
# every returned string is evaluated in order.  $Xi and @X are not touched.
158895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleysub Xloop_avx2()
158995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley{ use integer;
159095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my $body = shift;
159195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  # Lexicals the eval'ed strings bind the register roles to.
159295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley  my ($a,$b,$c,$d,$e);
159395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
159495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 foreach (@insns) { eval; }
159595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
159695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
	# Emit the rounds that follow the .Loop_avx2 label.  Each call below
	# generates five rounds: the first twelve calls also perform a 32..79
	# schedule update, the last four (Xloop_avx2) emit scalar rounds only.
	# align32 is defined outside this section.
159795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&align32();
159895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_00_19);
159995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_00_19);
160095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_00_19);
160195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_00_19);
160295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
160395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_20_39);
160495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_20_39);
160595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_20_39);
160695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_20_39);
160795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
160895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&align32();
160995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_40_59);
161095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_40_59);
161195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_40_59);
161295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_32_79(\&bodyx_40_59);
161395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
161495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2(\&bodyx_20_39);
161595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2(\&bodyx_20_39);
161695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2(\&bodyx_20_39);
161795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2(\&bodyx_20_39);
161895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
161995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
162095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	128($inp),$frame
162195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	128($inp),%rdi			# borrow $t0
162295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	cmp	$num,$frame
162395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	cmovae	$inp,$frame			# next or previous block
162495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
162595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
162695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	0($ctx),@ROTX[0]		# update context
162795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	4($ctx),@ROTX[1]
162895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	8($ctx),@ROTX[3]
162995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[0],0($ctx)
163095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	12($ctx),@ROTX[4]
163195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[1],4($ctx)
163295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[0],$A			# A=d
163395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	16($ctx),@ROTX[5]
163495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[3],$a5
163595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[3],8($ctx)
163695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[4],$D			# D=b
163795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 #xchg	@ROTX[5],$F			# F=c, C=f
163895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[4],12($ctx)
163995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[1],$F			# F=e
164095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[5],16($ctx)
164195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	#mov	$F,16($ctx)
164295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[5],$E			# E=c
164395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	$a5,$C				# C=f
164495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 #xchg	$F,$E				# E=c, F=e
164595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
164695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	cmp	$num,$inp
164795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	je	.Ldone_avx2
164895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
164995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
165095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$Xi=4;				# reset variables
165195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley@X=map("%ymm$_",(4..7,0..3));
165295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
165395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
165495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
165595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	cmp	$num,%rdi			# borrowed $t0
165695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ja	.Last_avx2
165795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
165895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
165995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu		-48(%rdi),%xmm1
166095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu		-32(%rdi),%xmm2
166195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vmovdqu		-16(%rdi),%xmm3
166295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
166395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
166495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
166595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
166695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	jmp	.Last_avx2
166795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
166895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.align	32
166995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Last_avx2:
167095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	128+16(%rsp),$frame
167195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	rorx	\$2,$F,$B
167295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	andn	$D,$F,$t0
167395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	and	$C,$F
167495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	xor	$t0,$F
167595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	sub	\$-128,$inp
167695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
167795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);
167895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
167995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_00_19);
168095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_00_19);
168195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_00_19);
168295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_00_19);
168395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
168495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_20_39);
168595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
168695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
168795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_20_39);
168895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
168995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
169095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_20_39);
169195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vmovdqu	("0(%rsp)",@Tx[0]);
169295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
169395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
169495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_20_39);
169595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vmovdqu	("32(%rsp)",@Tx[1]);
169695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
169795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vpaddd	(@X[2],@X[-2&7],$Kx);
169895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
169995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_40_59);
170095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&align32	();
170195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vmovdqu	("64(%rsp)",@X[2]);
170295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vpaddd	(@X[3],@X[-1&7],$Kx);
170395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_40_59);
170495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	  &vmovdqu	("96(%rsp)",@X[3]);
170595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_40_59);
170695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_16_31(\&bodyx_40_59);
170795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
170895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_16_31(\&bodyx_20_39);
170995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_16_31(\&bodyx_20_39);
171095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xupdate_avx2_16_31(\&bodyx_20_39);
171195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	&Xloop_avx2	(\&bodyx_20_39);
171295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
171395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
171495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	128(%rsp),$frame
171595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
171695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
171795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	0($ctx),@ROTX[0]		# update context
171895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	4($ctx),@ROTX[1]
171995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	8($ctx),@ROTX[3]
172095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[0],0($ctx)
172195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	12($ctx),@ROTX[4]
172295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[1],4($ctx)
172395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[0],$A			# A=d
172495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	add	16($ctx),@ROTX[5]
172595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[3],$a5
172695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[3],8($ctx)
172795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[4],$D			# D=b
172895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 #xchg	@ROTX[5],$F			# F=c, C=f
172995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[4],12($ctx)
173095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[1],$F			# F=e
173195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	@ROTX[5],16($ctx)
173295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	#mov	$F,16($ctx)
173395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	@ROTX[5],$E			# E=c
173495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 mov	$a5,$C				# C=f
173595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	 #xchg	$F,$E				# E=c, F=e
173695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
173795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	cmp	$num,$inp
173895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	jbe	.Loop_avx2
173995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
174095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Ldone_avx2:
174195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	vzeroupper
174295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
174395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___ if ($win64);
174495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-6*16(%r14),%xmm6
174595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-5*16(%r14),%xmm7
174695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-4*16(%r14),%xmm8
174795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-3*16(%r14),%xmm9
174895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-2*16(%r14),%xmm10
174995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	movaps	-40-1*16(%r14),%xmm11
175095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
175195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley$code.=<<___;
175295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	(%r14),%rsi
175395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-40(%rsi),%r14
175495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-32(%rsi),%r13
175595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-24(%rsi),%r12
175695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-16(%rsi),%rbp
175795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	mov	-8(%rsi),%rbx
175895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	lea	(%rsi),%rsp
175995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.Lepilogue_avx2:
176095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley	ret
176195c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
176295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley___
176395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
176495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
# Constant pool appended to the generated assembly, aligned to 64 bytes.
# Layout, as labelled by the comments in the payload itself:
#   - each of the four SHA-1 round constants (K_00_19 .. K_60_79) is stored
#     as two rows of four identical dwords, i.e. 32 bytes per constant;
#   - two rows of the "pbswap" byte-shuffle mask follow;
#   - one final row of 16 descending byte indices (0xf .. 0x0).
# The AVX2 code earlier in this file reads "-64($K_XX_XX)" as K_00_19 and
# "64($K_XX_XX)" as the pbswap mask, which is consistent with that register
# pointing 64 bytes past the K_XX_XX label.
# NOTE(review): the code that loads that pointer is not in this part of the
# file -- confirm the bias before changing the layout.
# NOTE(review): the last .byte row is presumably a full 16-byte reversal mask
# for the SHA-extension path; its user is not visible here -- confirm.
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
178095c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}}}
# Append a NUL-terminated identification string to the generated assembly,
# then realign to 64 bytes.  The heredoc interpolates, so the "@" in the
# e-mail address is written as "\@" to keep Perl from treating "@openssl"
# as an array.
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
178595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
178695c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
178795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
178895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langleyif ($win64) {
# Symbolic names for the four arguments of the Win64 exception handlers
# emitted below (rec, frame, context, disp), bound to the first four
# integer-argument registers of the Microsoft x64 calling convention.
# Note that $frame is reused here; earlier code paths in this file use the
# same variable for a different register.
($rec,$frame,$context,$disp)=map("%$_",qw(rcx rdx r8 r9));
179395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Emit se_handler, the Win64 language-specific exception handler that the
# .xdata section further down attaches to sha1_block_data_order.
#
# What the emitted code does, in order:
#   1. Saves rsi, rdi, rbx, rbp, r12-r15 and the flags, and reserves 64 bytes
#      of stack (used by the common tail for the RtlVirtualUnwind arguments).
#   2. Loads context->Rax and context->Rip.  If Rip is below .Lprologue it
#      goes straight to .Lcommon_seh_tail with %rax = context->Rax.
#   3. Otherwise loads context->Rsp into %rax; if Rip is at or past
#      .Lepilogue it goes to the tail with that value.
#   4. Otherwise (fault inside the body) it fetches the saved stack pointer
#      from 64 bytes above the current Rsp and copies the rbx, rbp, r12, r13
#      and r14 values stored just below it back into the CONTEXT record.
#   5. Jumps to .Lcommon_seh_tail, which is emitted together with
#      ssse3_handler later in this file.
#
# The `16*4` in backticks is left in the payload for a later pass to
# evaluate.  NOTE(review): that pass is not visible in this part of the
# file -- confirm it exists before changing the expression.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___
183995c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Emit shaext_handler, the Win64 exception handler attached (in .xdata
# below) to sha1_block_data_order_shaext.  It is only generated when
# $shaext is true.
#
# The emitted code saves the same registers as the other handlers, then:
#   - leaves through .Lcommon_seh_tail untouched when context->Rip is
#     outside [.Lprologue_shaext, .Lepilogue_shaext);
#   - otherwise copies 8 quadwords (64 bytes, i.e. four XMM registers)
#     from -8-4*16(%rax) into the CONTEXT record starting at the slot the
#     payload labels &context.Xmm6, and then joins the common tail.
#
# Unlike se_handler/ssse3_handler, %rax here is always context->Rax; the
# handler never reloads it from context->Rsp.
# NOTE(review): this presumably relies on the shaext routine keeping its
# entry stack pointer in %rax for its whole body -- that routine is not
# visible here; confirm.
$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___
1874cb5dd63e5e5eecaf8ce12be2d92cecfbff298b4bAdam Langley
# Emit ssse3_handler, the shared tail .Lcommon_seh_tail, and the start of
# the .pdata section.
#
# ssse3_handler is table-driven: the .xdata entries below reuse it for the
# SSSE3, AVX and AVX2 routines.  It reads two 32-bit values from
# disp->HandlerData (HandlerData[0] and HandlerData[1]), adds
# disp->ImageBase to each, and uses the results as the prologue and
# epilogue addresses of the faulting routine.
#   - Rip below the prologue label: go to the tail with %rax = context->Rax.
#   - Rip at or past the epilogue label: go to the tail with
#     %rax = context->Rsp.
#   - Otherwise: take the frame base from context->R14, copy 12 quadwords
#     (six XMM registers) from -40-6*16 below it into the CONTEXT record at
#     the slot labelled &context.Xmm6, and restore rbx, rbp, r12, r13, r14
#     from the five quadwords just below the frame base.
#
# .Lcommon_seh_tail is also the jump target of se_handler and
# shaext_handler.  It expects %rax to hold the stack pointer value to
# report, and it:
#   - reloads rdi and rsi from 8(%rax) and 16(%rax) into the CONTEXT record
#     (NOTE(review): presumably the slots where the routines park the
#     Win64 callee-saved rdi/rsi on entry -- confirm against the prologues);
#   - stores %rax as context->Rsp;
#   - copies 154 quadwords of CONTEXT to disp->ContextRecord;
#   - calls RtlVirtualUnwind with the eight arguments annotated inline;
#   - returns 1 (ExceptionContinueSearch) after undoing the handler's own
#     register saves.
#
# The payload ends by opening .pdata with the begin/end/info triple for
# sha1_block_data_order; further triples are appended by the statements
# that follow.
$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	232($context),%rax	# pull context->R14

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12
	mov	%r13,224($context)	# restore cotnext->R13
	mov	%r14,232($context)	# restore cotnext->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
# Remaining .pdata entries.  Each entry is a triple of image-relative
# addresses: start of the routine, end of the routine, and its unwind-info
# record in .xdata.  An entry is emitted only when the matching code path is
# generated, so every triple is gated by the same flag as its routine:
#   shaext -> $shaext, ssse3 -> always, avx -> $avx, avx2 -> $avx > 1.
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
# .xdata: one unwind-info record per routine, referenced from the .pdata
# triples.  Every record starts with the bytes 9,0,0,0 followed by the RVA
# of its handler.
# NOTE(review): 9 is presumably UNWIND_INFO version 1 with
# UNW_FLAG_EHANDLER set, and the zeros mean no unwind codes -- confirm
# against the Microsoft x64 exception-handling documentation.
#
# The generic and shaext routines get dedicated handlers (se_handler,
# shaext_handler).  The SSSE3, AVX and AVX2 routines all share
# ssse3_handler and therefore append two extra RVAs -- their own prologue
# and epilogue labels -- which the handler reads back as HandlerData[0] and
# HandlerData[1].  The records are gated by the same flags as the routines
# they describe.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
202295c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley}
202395c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
202495c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley####################################################################
202595c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# sha1rnds4 -- hand-encode the SHA1RNDS4 instruction so the generated file
# still assembles with toolchains that do not know the mnemonic.
#
# Argument: the operand text that followed the mnemonic, in AT&T order
#           ("\$imm,%xmmSRC,%xmmDST"); trailing text such as a comment is
#           tolerated.
# Returns:  a ".byte" directive holding 0x0f,0x3a,0xcc, the ModR/M byte and
#           the immediate when both registers are %xmm0-%xmm7; otherwise
#           the original mnemonic and operands, unchanged, for the
#           assembler to handle.
sub sha1rnds4 {
    my $operands = $_[0];

    # No REX prefix is produced here, so only %xmm0-%xmm7 can be encoded.
    # The (?!\d) guard stops "%xmm1" from matching the front of "%xmm12",
    # which would otherwise silently encode the wrong register.
    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])(?!\d)/) {
	my ($imm,$src,$dst) = ($1,$2,$3);
	my @opcode = (0x0f,0x3a,0xcc);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# oct() understands both "0x.." and leading-zero octal spellings.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$operands;
    }
}
203795c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Encode a two-register SHA-1 instruction from the 0F 38 opcode map
# (sha1nexte, sha1msg1, sha1msg2) as raw bytes.
#
# Arguments: the mnemonic, then the operand text "%xmmS,%xmmD".
# Returns:   a ".byte" directive (optional REX prefix, 0F 38, opcode byte,
#            ModR/M) when the mnemonic is known and both operands are
#            %xmm0-%xmm15; otherwise the instruction is returned as text,
#            "<mnemonic>\t<operands>".
sub sha1op38 {
    my $instr = shift;
    my $operands = $_[0];
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($src,$dst) = ($1,$2);

	# Register numbers above 15 cannot be expressed with a REX prefix;
	# masking them with &7 would silently encode a different register,
	# so leave such operands as text.
	if ($src<=15 && $dst<=15) {
	    my @opcode=(0x0f,0x38);
	    my $rex=0;
	    $rex|=0x04			if ($dst>=8);	# REX.R extends ModR/M.reg
	    $rex|=0x01			if ($src>=8);	# REX.B extends ModR/M.rm
	    unshift @opcode,0x40|$rex	if ($rex);
	    push @opcode,$opcodelet{$instr};
	    push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	    return ".byte\t".join(',',@opcode);
	}
    }

    return $instr."\t".$operands;
}
205895c29f3cd1f6c08c6c0927868683392eea727ccAdam Langley
# Post-process the assembly text accumulated in $code one line at a time and
# write the result to STDOUT:
#   1. every `...` span is evaluated as a Perl expression and replaced by
#      its value;
#   2. a sha1rnds4 instruction is rewritten by sha1rnds4(); any other
#      mnemonic starting with "sha1" is rewritten by sha1op38().
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/ge		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/ge;

	print $_,"\n";
}

# Buffered write errors only surface when the handle is closed; fail loudly
# rather than leave truncated output behind with a success exit status.
close STDOUT or die "error closing STDOUT: $!";
2068