#!/usr/bin/env perl

# ====================================================================
# [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# "[Re]written" was achieved in two major overhauls. In 2004 the BODY_*
# functions were re-implemented to address a P4 performance issue [see
# commentary below], and in 2006 the rest was rewritten in order to
# gain freedom to liberate licensing terms.

# January, September 2004.
#
# It was noted that the Intel IA-32 C compiler generates code which
# performs ~30% *faster* on P4 CPU than the original *hand-coded*
# SHA1 assembler implementation. To address this problem (and
# prove that humans are still better than machines:-), the
# original code was overhauled, which resulted in the following
# performance changes:
#
#		compared with original	compared with Intel cc
#		assembler impl.		generated code
# Pentium	-16%			+48%
# PIII/AMD	+8%			+16%
# P4		+85%(!)			+45%
#
# As you can see Pentium came out as the loser:-( Yet I reckoned that
# the improvement on P4 outweighs the loss and incorporated this
# re-tuned code into 0.9.7 and later.
# ----------------------------------------------------------------
#					<appro@fy.chalmers.se>

# August 2009.
#
# George Spelvin tipped me off that F_40_59(b,c,d) can be rewritten as
# '(c&d) + (b&(c^d))', which makes it possible to accumulate partial
# results and lighten "pressure" on scratch registers. This resulted in
# >12% performance improvement on contemporary AMD cores (with no
# degradation on other CPUs:-). Also, the code was revised to maximize
# "distance" between instructions producing input to the 'lea'
# instruction and the 'lea' instruction itself, which is essential for
# the Intel Atom core and resulted in ~15% improvement.
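#
# The identity holds because the two addends (c&d) and (b&(c^d)) can
# never both be set in the same bit position, so the addition never
# carries and per bit equals MAJ(b,c,d). A minimal sanity check, a
# hypothetical snippet that is not part of the generated code:
#
#	for my $n (0..7) {
#		my ($b,$c,$d) = map { ($n >> $_) & 1 } (2,1,0);
#		die "mismatch at $n"
#		    if ((($b&$c)|($c&$d)|($b&$d)) != ($c&$d) + ($b&($c^$d)));
#	}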

# October 2010.
#
# Add an SSSE3 (Supplemental[!] SSE3) implementation. The idea behind
# it is to offload the message schedule, denoted by Wt in the NIST
# specification and Xupdate in the OpenSSL source, to the SIMD unit.
# The idea is not novel, and in SSE2 context was first explored by
# Dean Gaudet in 2004, see http://arctic.org/~dean/crypto/sha1.html.
# Since then several things have changed that made it interesting
# again:
#
# a) XMM units became faster and wider;
# b) instruction set became more versatile;
# c) an important observation was made by Max Locktyukhin, which made
#    it possible to reduce the number of instructions required to
#    perform the operation in question, for further details see
#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
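#
# For reference, the schedule recurrence being offloaded is
#
#	W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1,   16<=t<=79,
#
# and the observation in question is that it can be unrolled into the
# equivalent
#
#	W[t] = (W[t-6] ^ W[t-16] ^ W[t-28] ^ W[t-32]) <<< 2,  32<=t<=79,
#
# which has no dependencies within a quadruple of consecutive W[t] and
# therefore maps naturally onto 4-lane SIMD [see Xupdate_ssse3_32_79
# below].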

# April 2011.
#
# Add AVX code path, probably the most controversial one... The thing
# is that switching to AVX alone improves performance by as little as
# 4% in comparison to the SSSE3 code path. But the result below doesn't
# look like a 4% improvement... Trouble is that Sandy Bridge decodes
# 'ro[rl]' as a pair of µ-ops, and it's the additional µ-ops, two per
# round, that make it run slower than Core2 and Westmere. But 'sh[rl]d'
# is decoded as a single µ-op by Sandy Bridge, and it's replacing
# 'ro[rl]' with the equivalent 'sh[rl]d' that is responsible for the
# impressive 5.1 cycles per processed byte. 'sh[rl]d', however, is not
# something that has historically been fast, nor does it appear to be
# fast on the upcoming Bulldozer [according to its optimization
# manual]. Which is why the AVX code path is guarded by *both* the AVX
# bit and the synthetic bit denoting Intel CPUs. One can argue that
# it's unfair to AMD, but without 'sh[rl]d' it makes no sense to keep
# the AVX code path. If somebody feels that strongly, it's probably
# more appropriate to discuss the possibility of using the XOP vector
# rotate on AMD...
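#
# The substitution relies on the identity that, with identical source
# and destination registers, a double shift is a rotate:
#
#	rol	eax,5	is equivalent to	shld	eax,eax,5
#	ror	ebx,7	is equivalent to	shrd	ebx,ebx,7
#
# which is why the $_rol/$_ror helpers in the AVX path below can swap
# one encoding for the other without touching the round logic.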

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers
# are CPU clock cycles spent to process a single byte (less is
# better).
#
#		x86		SSSE3		AVX
# Pentium	15.7		-
# PIII		11.5		-
# P4		10.6		-
# AMD K8	7.1		-
# Core2		7.3		6.0/+22%	-
# Atom		12.5		9.3(*)/+35%	-
# Westmere	7.3		5.5/+33%	-
# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+73%
# Ivy Bridge	7.2		4.8/+51%	4.7(**)/+53%
# Haswell	6.5		4.3/+51%	4.1(**)/+58%
# Bulldozer	11.6		6.0/+92%
# VIA Nano	10.6		7.5/+41%
#
# (*)	Loop is 1056 instructions long and expected result is ~8.25.
#	It remains a mystery [to me] why ILP is limited to 1.7.
#
# (**)	As per above comment, the result is for AVX *plus* sh[rl]d.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");

$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

$ymm=1 if ($xmm &&
		`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
			=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
		$1>=2.19);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
		$1>=2.03);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
		`ml 2>&1` =~ /Version ([0-9]+)\./ &&
		$1>=10);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ &&
		$2>=3.0);	# first version supporting AVX

$shaext=$xmm;	### set to zero if compiling for 1.0.1
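
# A typical invocation (a hypothetical example; the exact flavour and
# flags depend on the build system) looks like
#
#	perl sha1-586.pl elf -DOPENSSL_IA32_SSE2 > sha1-586.s
#
# where the first argument selects the output flavour consumed by
# &asm_init above and -DOPENSSL_IA32_SSE2 enables the SIMD code paths.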

&external_label("OPENSSL_ia32cap_P") if ($xmm);


$A="eax";
$B="ebx";
$C="ecx";
$D="edx";
$E="edi";
$T="esi";
$tmp1="ebp";

@V=($A,$B,$C,$D,$E,$T);

$alt=0;	# 1 denotes alternative IALU implementation, which performs
	# 8% *worse* on P4, same on Westmere and Atom, 2% better on
	# Sandy Bridge...
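
# For reference, every BODY_* sub below is a scheduling of the
# canonical SHA-1 step (K and F vary with the round number n):
#
#	T = ROTL(a,5) + F(b,c,d) + e + K + W[n]
#	e = d; d = c; c = ROTL(b,30); b = a; a = T
#
# with the role rotation expressed by shuffling @V between calls
# rather than by moving data between registers.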

sub BODY_00_15
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;

	&comment("00_15 $n");

	&mov($f,$c);			# f to hold F_00_19(b,c,d)
	 if ($n==0)  { &mov($tmp1,$a); }
	 else        { &mov($a,$tmp1); }
	&rotl($tmp1,5);			# tmp1=ROTATE(a,5)
	 &xor($f,$d);
	&add($tmp1,$e);			# tmp1+=e;
	 &mov($e,&swtmp($n%16));	# e becomes volatile and is loaded
	 				# with xi; also note that e becomes
					# f in the next round...
	&and($f,$b);
	&rotr($b,2);			# b=ROTATE(b,30)
	 &xor($f,$d);			# f holds F_00_19(b,c,d)
	&lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi

	if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
		      &add($f,$tmp1); }	# f+=tmp1
	else        { &add($tmp1,$f); }	# f becomes a in next round
	&mov($tmp1,$a)			if ($alt && $n==15);
	}
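
# The selection function above is computed via the NOT-free identity
#
#	F_00_19(b,c,d) = (b&c)|(~b&d) = ((c^d)&b)^d
#
# which needs only one temporary register; the same form is used by
# BODY_16_19 below.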

sub BODY_16_19
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;

	&comment("16_19 $n");

if ($alt) {
	&xor($c,$d);
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&and($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d), b&=c^d
	 &xor($f,&swtmp(($n+8)%16));
	&xor($tmp1,$d);			# tmp1=F_00_19(b,c,d)
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &add($e,$tmp1);		# e+=F_00_19(b,c,d)
	&xor($c,$d);			# restore $c
	 &mov($tmp1,$a);		# b in next round
	&rotr($b,$n==16?2:7);		# b=ROTATE(b,30)
	 &mov(&swtmp($n%16),$f);	# xi=f
	&rotl($a,5);			# ROTATE(a,5)
	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$a);			# f+=ROTATE(a,5)
} else {
	&mov($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d)
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);
	 &xor($f,&swtmp(($n+8)%16));
	&and($tmp1,$b);
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &xor($tmp1,$d);		# tmp1=F_00_19(b,c,d)
	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
	 &mov($tmp1,$a);
	&rotr($b,2);			# b=ROTATE(b,30)
	 &mov(&swtmp($n%16),$f);	# xi=f
	&rotl($tmp1,5);			# ROTATE(a,5)
	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$tmp1);		# f+=ROTATE(a,5)
}
	}

sub BODY_20_39
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;
	local $K=($n<40)?0x6ed9eba1:0xca62c1d6;

	&comment("20_39 $n");

if ($alt) {
	&xor($tmp1,$c);			# tmp1 to hold F_20_39(b,c,d), b^=c
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
	 &xor($f,&swtmp(($n+8)%16));
	&add($e,$tmp1);			# e+=F_20_39(b,c,d)
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &mov($tmp1,$a);		# b in next round
	&rotr($b,7);			# b=ROTATE(b,30)
	 &mov(&swtmp($n%16),$f)		if($n<77);# xi=f
	&rotl($a,5);			# ROTATE(a,5)
	 &xor($b,$c)			if($n==39);# warm up for BODY_40_59
	&and($tmp1,$b)			if($n==39);
	 &lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
	&mov($e,&swtmp(($n+1)%16))	if($n<79);# pre-fetch f for next round
	 &add($f,$a);			# f+=ROTATE(a,5)
	&rotr($a,5)			if ($n==79);
} else {
	&mov($tmp1,$b);			# tmp1 to hold F_20_39(b,c,d)
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$c);
	 &xor($f,&swtmp(($n+8)%16));
	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &add($e,$tmp1);		# e+=F_20_39(b,c,d)
	&rotr($b,2);			# b=ROTATE(b,30)
	 &mov($tmp1,$a);
	&rotl($tmp1,5);			# ROTATE(a,5)
	 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
	 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
	&add($f,$tmp1);			# f+=ROTATE(a,5)
}
	}

sub BODY_40_59
	{
	local($n,$a,$b,$c,$d,$e,$f)=@_;

	&comment("40_59 $n");

if ($alt) {
	&add($e,$tmp1);			# e+=b&(c^d)
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&mov($tmp1,$d);
	 &xor($f,&swtmp(($n+8)%16));
	&xor($c,$d);			# restore $c
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &and($tmp1,$c);
	&rotr($b,7);			# b=ROTATE(b,30)
	 &add($e,$tmp1);		# e+=c&d
	&mov($tmp1,$a);			# b in next round
	 &mov(&swtmp($n%16),$f);	# xi=f
	&rotl($a,5);			# ROTATE(a,5)
	 &xor($b,$c)			if ($n<59);
	&and($tmp1,$b)			if ($n<59);# tmp1 to hold F_40_59(b,c,d)
	 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$a);			# f+=ROTATE(a,5)
} else {
	&mov($tmp1,$c);			# tmp1 to hold F_40_59(b,c,d)
	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
	&xor($tmp1,$d);
	 &xor($f,&swtmp(($n+8)%16));
	&and($tmp1,$b);
	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
	&rotl($f,1);			# f=ROTATE(f,1)
	 &add($tmp1,$e);		# b&(c^d)+=e
	&rotr($b,2);			# b=ROTATE(b,30)
	 &mov($e,$a);			# e becomes volatile
	&rotl($e,5);			# ROTATE(a,5)
	 &mov(&swtmp($n%16),$f);	# xi=f
	&lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
	 &mov($tmp1,$c);
	&add($f,$e);			# f+=ROTATE(a,5)
	 &and($tmp1,$d);
	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
	 &add($f,$tmp1);		# f+=c&d
}
	}

&function_begin("sha1_block_data_order");
if ($xmm) {
  &static_label("shaext_shortcut")	if ($shaext);
  &static_label("ssse3_shortcut");
  &static_label("avx_shortcut")		if ($ymm);
  &static_label("K_XX_XX");

	&call	(&label("pic_point"));	# make it PIC!
  &set_label("pic_point");
	&blindpop($tmp1);
	&picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));

	&mov	($A,&DWP(0,$T));
	&mov	($D,&DWP(4,$T));
	&test	($D,1<<9);		# check SSSE3 bit
	&jz	(&label("x86"));
	&mov	($C,&DWP(8,$T));
	&test	($A,1<<24);		# check FXSR bit
	&jz	(&label("x86"));
	if ($shaext) {
		&test	($C,1<<29);		# check SHA bit
		&jnz	(&label("shaext_shortcut"));
	}
	if ($ymm) {
		&and	($D,1<<28);		# mask AVX bit
		&and	($A,1<<30);		# mask "Intel CPU" bit
		&or	($A,$D);
		&cmp	($A,1<<28|1<<30);
		&je	(&label("avx_shortcut"));
	}
	&jmp	(&label("ssse3_shortcut"));
  &set_label("x86",16);
}
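
# The capability words tested above come from OPENSSL_ia32cap_P:
# bit 24 of word 0 (EDX) is FXSR and bit 30 is OpenSSL's synthetic
# "Intel CPU" bit; bit 9 of word 1 (ECX) is SSSE3 and bit 28 is AVX;
# bit 29 of word 2 (extended-feature EBX) is the SHA extension bit.
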
	&mov($tmp1,&wparam(0));	# SHA_CTX *c
	&mov($T,&wparam(1));	# const void *input
	&mov($A,&wparam(2));	# size_t num
	&stack_push(16+3);	# allocate X[16]
	&shl($A,6);
	&add($A,$T);
	&mov(&wparam(2),$A);	# pointer beyond the end of input
	&mov($E,&DWP(16,$tmp1));# pre-load E
	&jmp(&label("loop"));

&set_label("loop",16);

	# copy input chunk to X, but reversing byte order!
	for ($i=0; $i<16; $i+=4)
		{
		&mov($A,&DWP(4*($i+0),$T));
		&mov($B,&DWP(4*($i+1),$T));
		&mov($C,&DWP(4*($i+2),$T));
		&mov($D,&DWP(4*($i+3),$T));
		&bswap($A);
		&bswap($B);
		&bswap($C);
		&bswap($D);
		&mov(&swtmp($i+0),$A);
		&mov(&swtmp($i+1),$B);
		&mov(&swtmp($i+2),$C);
		&mov(&swtmp($i+3),$D);
		}
	&mov(&wparam(1),$T);	# redundant in 1st spin

	&mov($A,&DWP(0,$tmp1));	# load SHA_CTX
	&mov($B,&DWP(4,$tmp1));
	&mov($C,&DWP(8,$tmp1));
	&mov($D,&DWP(12,$tmp1));
	# E is pre-loaded

	for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
	for(;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
	for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
	for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
	for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

	(($V[5] eq $D) and ($V[0] eq $E)) or die;	# double-check

	&mov($tmp1,&wparam(0));	# re-load SHA_CTX*
	&mov($D,&wparam(1));	# D is last "T" and is discarded

	&add($E,&DWP(0,$tmp1));	# E is last "A"...
	&add($T,&DWP(4,$tmp1));
	&add($A,&DWP(8,$tmp1));
	&add($B,&DWP(12,$tmp1));
	&add($C,&DWP(16,$tmp1));

	&mov(&DWP(0,$tmp1),$E);	# update SHA_CTX
	 &add($D,64);		# advance input pointer
	&mov(&DWP(4,$tmp1),$T);
	 &cmp($D,&wparam(2));	# have we reached the end yet?
	&mov(&DWP(8,$tmp1),$A);
	 &mov($E,$C);		# C is last "E" which needs to be "pre-loaded"
	&mov(&DWP(12,$tmp1),$B);
	 &mov($T,$D);		# input pointer
	&mov(&DWP(16,$tmp1),$C);
	&jb(&label("loop"));

	&stack_pop(16+3);
&function_end("sha1_block_data_order");

if ($xmm) {
if ($shaext) {
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("edi","esi","ecx");
my ($ABCD,$E,$E_,$BSWAP)=map("xmm$_",(0..3));
my @MSG=map("xmm$_",(4..7));

sub sha1rnds4 {
 my ($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x0f,0x3a,0xcc,0xc0|($1<<3)|$2,$imm);	}
}
sub sha1op38 {
 my ($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
}
sub sha1nexte	{ sha1op38(0xc8,@_); }
sub sha1msg1	{ sha1op38(0xc9,@_); }
sub sha1msg2	{ sha1op38(0xca,@_); }
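
# These helpers hand-assemble the SHA extension instructions for the
# benefit of assemblers that do not recognize the mnemonics. As an
# illustrative example (not emitted verbatim anywhere below),
#
#	sha1rnds4	xmm0,xmm1,3
#
# would be encoded as 0x0f,0x3a,0xcc,0xc1,0x03: the three opcode
# bytes, a register-direct ModR/M byte 0xc0|(dst<<3)|src, and the
# round-selector immediate.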

&function_begin("_sha1_block_data_order_shaext");
	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("shaext_shortcut");
	&mov	($ctx,&wparam(0));
	&mov	("ebx","esp");
	&mov	($inp,&wparam(1));
	&mov	($num,&wparam(2));
	&sub	("esp",32);

	&movdqu	($ABCD,&QWP(0,$ctx));
	&movd	($E,&DWP(16,$ctx));
	&and	("esp",-32);
	&movdqa	($BSWAP,&QWP(0x50,$tmp1));	# byte-n-word swap

	&movdqu	(@MSG[0],&QWP(0,$inp));
	&pshufd	($ABCD,$ABCD,0b00011011);	# flip word order
	&movdqu	(@MSG[1],&QWP(0x10,$inp));
	&pshufd	($E,$E,0b00011011);		# flip word order
	&movdqu	(@MSG[2],&QWP(0x20,$inp));
	&pshufb	(@MSG[0],$BSWAP);
	&movdqu	(@MSG[3],&QWP(0x30,$inp));
	&pshufb	(@MSG[1],$BSWAP);
	&pshufb	(@MSG[2],$BSWAP);
	&pshufb	(@MSG[3],$BSWAP);
	&jmp	(&label("loop_shaext"));

&set_label("loop_shaext",16);
	&dec		($num);
	&lea		("eax",&DWP(0x40,$inp));
	&movdqa		(&QWP(0,"esp"),$E);	# offload $E
	&paddd		($E,@MSG[0]);
	&cmovne		($inp,"eax");
	&movdqa		(&QWP(16,"esp"),$ABCD);	# offload $ABCD

for($i=0;$i<20-4;$i+=2) {
	&sha1msg1	(@MSG[0],@MSG[1]);
	&movdqa		($E_,$ABCD);
	&sha1rnds4	($ABCD,$E,int($i/5));	# 0-3...
	&sha1nexte	($E_,@MSG[1]);
	&pxor		(@MSG[0],@MSG[2]);
	&sha1msg1	(@MSG[1],@MSG[2]);
	&sha1msg2	(@MSG[0],@MSG[3]);

	&movdqa		($E,$ABCD);
	&sha1rnds4	($ABCD,$E_,int(($i+1)/5));
	&sha1nexte	($E,@MSG[2]);
	&pxor		(@MSG[1],@MSG[3]);
	&sha1msg2	(@MSG[1],@MSG[0]);

	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
	&movdqu		(@MSG[0],&QWP(0,$inp));
	&movdqa		($E_,$ABCD);
	&sha1rnds4	($ABCD,$E,3);		# 64-67
	&sha1nexte	($E_,@MSG[1]);
	&movdqu		(@MSG[1],&QWP(0x10,$inp));
	&pshufb		(@MSG[0],$BSWAP);

	&movdqa		($E,$ABCD);
	&sha1rnds4	($ABCD,$E_,3);		# 68-71
	&sha1nexte	($E,@MSG[2]);
	&movdqu		(@MSG[2],&QWP(0x20,$inp));
	&pshufb		(@MSG[1],$BSWAP);

	&movdqa		($E_,$ABCD);
	&sha1rnds4	($ABCD,$E,3);		# 72-75
	&sha1nexte	($E_,@MSG[3]);
	&movdqu		(@MSG[3],&QWP(0x30,$inp));
	&pshufb		(@MSG[2],$BSWAP);

	&movdqa		($E,$ABCD);
	&sha1rnds4	($ABCD,$E_,3);		# 76-79
	&movdqa		($E_,&QWP(0,"esp"));
	&pshufb		(@MSG[3],$BSWAP);
	&sha1nexte	($E,$E_);
	&paddd		($ABCD,&QWP(16,"esp"));

	&jnz		(&label("loop_shaext"));

	&pshufd	($ABCD,$ABCD,0b00011011);
	&pshufd	($E,$E,0b00011011);
	&movdqu	(&QWP(0,$ctx),$ABCD);
	&movd	(&DWP(16,$ctx),$E);
	&mov	("esp","ebx");
&function_end("_sha1_block_data_order_shaext");
}
######################################################################
# The SSSE3 implementation.
#
# %xmm[0-7] are used as a ring @X[] buffer containing quadruples of the
# last 32 elements of the message schedule or Xupdate outputs. The
# first 4 quadruples are simply byte-swapped input, the next 4 are
# calculated according to the method originally suggested by Dean
# Gaudet (modulo being implemented in SSSE3). Once 8 quadruples or 32
# elements are collected, it switches to the routine proposed by Max
# Locktyukhin.
#
# Calculations inevitably require temporary registers, and there are
# no %xmm registers left to spare. For this reason part of the ring
# buffer, X[2..4] to be specific, is offloaded to a 3-quadruple ring
# buffer on the stack. Keep in mind that X[2] is an alias for X[-6],
# X[3] for X[-5], and X[4] for X[-4]...
#
# Another notable optimization is aggressive stack frame compression
# aiming to minimize the number of 9-byte instructions...
#
# Yet another notable optimization is the "jumping" $B variable: no
# register is permanently allocated for the $B value. This made it
# possible to eliminate one instruction from body_20_39...
#
my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0;			# hash round
my $rx=0;
my @T=($T,$tmp1);
my $inp;

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

&function_begin("_sha1_block_data_order_ssse3");
	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("ssse3_shortcut");

	&movdqa	(@X[3],&QWP(0,$tmp1));		# K_00_19
	&movdqa	(@X[4],&QWP(16,$tmp1));		# K_20_39
	&movdqa	(@X[5],&QWP(32,$tmp1));		# K_40_59
	&movdqa	(@X[6],&QWP(48,$tmp1));		# K_60_79
	&movdqa	(@X[2],&QWP(64,$tmp1));		# pbswap mask

	&mov	($E,&wparam(0));		# load argument block
	&mov	($inp=@T[1],&wparam(1));
	&mov	($D,&wparam(2));
	&mov	(@T[0],"esp");

	# stack frame layout
	#
	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
	#
	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
	#	X[4]	X[5]	X[6]	X[7]
	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
	#
	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
	#	K_40_59	K_40_59	K_40_59	K_40_59
	#	K_60_79	K_60_79	K_60_79	K_60_79
	#	K_00_19	K_00_19	K_00_19	K_00_19
	#	pbswap mask
	#
	# +192	ctx				# argument block
	# +196	inp
	# +200	end
	# +204	esp
	&sub	("esp",208);
	&and	("esp",-64);

	&movdqa	(&QWP(112+0,"esp"),@X[4]);	# copy constants
	&movdqa	(&QWP(112+16,"esp"),@X[5]);
	&movdqa	(&QWP(112+32,"esp"),@X[6]);
	&shl	($D,6);				# len*64
	&movdqa	(&QWP(112+48,"esp"),@X[3]);
	&add	($D,$inp);			# end of input
	&movdqa	(&QWP(112+64,"esp"),@X[2]);
	&add	($inp,64);
	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
	&mov	(&DWP(192+4,"esp"),$inp);
	&mov	(&DWP(192+8,"esp"),$D);
	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp

	&mov	($A,&DWP(0,$E));		# load context
	&mov	($B,&DWP(4,$E));
	&mov	($C,&DWP(8,$E));
	&mov	($D,&DWP(12,$E));
	&mov	($E,&DWP(16,$E));
	&mov	(@T[0],$B);			# magic seed

	&movdqu	(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
	&movdqu	(@X[-3&7],&QWP(-48,$inp));
	&movdqu	(@X[-2&7],&QWP(-32,$inp));
	&movdqu	(@X[-1&7],&QWP(-16,$inp));
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&pshufb	(@X[-3&7],@X[2]);
	&pshufb	(@X[-2&7],@X[2]);
	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
	&pshufb	(@X[-1&7],@X[2]);
	&paddd	(@X[-4&7],@X[3]);		# add K_00_19
	&paddd	(@X[-3&7],@X[3]);
	&paddd	(@X[-2&7],@X[3]);
	&movdqa	(&QWP(0,"esp"),@X[-4&7]);	# X[]+K xfer to IALU
	&psubd	(@X[-4&7],@X[3]);		# restore X[]
	&movdqa	(&QWP(0+16,"esp"),@X[-3&7]);
	&psubd	(@X[-3&7],@X[3]);
	&movdqa	(&QWP(0+32,"esp"),@X[-2&7]);
	&mov	(@T[1],$C);
	&psubd	(@X[-2&7],@X[3]);
	&xor	(@T[1],$D);
	&pshufd	(@X[0],@X[-4&7],0xee);		# was &movdqa	(@X[0],@X[-3&7]);
	&and	(@T[0],@T[1]);
	&jmp	(&label("loop"));

######################################################################
# The SSE instruction sequence is first broken into groups of
# independent instructions, independent with respect to their inputs
# and the shifter (not all architectures have more than one). Then
# IALU instructions are "knitted in" between the SSE groups. Distance
# is maintained for an SSE latency of 2, in the hope that it suits the
# upcoming AMD Bulldozer [which allegedly also implements SSSE3]...
#
# Temporary register usage. X[2] is volatile at the entry and at the
# end is restored from the backtrace ring buffer. X[3] is expected to
# contain the current K_XX_XX constant and is used to calculate
# X[-1]+K from the previous round; it becomes volatile the moment the
# value is saved to the stack for transfer to the IALU. X[4] becomes
# volatile whenever X[-4] is accumulated and offloaded to the
# backtrace ring buffer; at the end it is loaded with the next K_XX_XX
# [which becomes X[3] in the next round]...
#
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));
	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	&movdqa	(@X[2],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@X[3],@X[-1&7]);
	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&pxor	(@X[2],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@X[4],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa (@X[2],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@X[2],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	&movdqa	(@X[3],@X[4]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@X[4],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@X[2]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@X[3],2);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	&pxor   (@X[0],@X[4]);
	  &movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
	  &pshufd	(@X[1],@X[-3&7],0xee)	if ($Xi<7);	# was &movdqa	(@X[1],@X[-2&7])
	  &pshufd	(@X[3],@X[-1&7],0xee)	if ($Xi==7);
	 eval(shift(@insns));
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&punpcklqdq(@X[2],@X[-1&7]);	# compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	 if ($Xi%5) {
	  &movdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
	 } else {			# ... or load next one
	  &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	 }
	 eval(shift(@insns));		# ror
	  &paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@X[2],@X[0]);
	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@X[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);

	&por	(@X[0],@X[2]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	  &pshufd	(@X[3],@X[-1],0xee)	if ($Xi<19);	# was &movdqa	(@X[3],@X[0])
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@X[3],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&mov	($inp=@T[1],&DWP(192+4,"esp"));
	&cmp	($inp,&DWP(192+8,"esp"));
	&je	(&label("done"));

	&movdqa	(@X[3],&QWP(112+48,"esp"));	# K_00_19
	&movdqa	(@X[2],&QWP(112+64,"esp"));	# pbswap mask
	&movdqu	(@X[-4&7],&QWP(0,$inp));	# load input
	&movdqu	(@X[-3&7],&QWP(16,$inp));
	&movdqu	(@X[-2&7],&QWP(32,$inp));
	&movdqu	(@X[-1&7],&QWP(48,$inp));
	&add	($inp,64);
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&mov	(&DWP(192+4,"esp"),$inp);
	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@X[3]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39()	if ($rx==19);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&xor	(@T[0],$d);',
	'&mov	(@T[1],$a);',	# $b in next round

	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
	'&xor	($b,$c);',	# $c^$d for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&and	(@T[1],$b);',	# ($b&($c^$d)) for next round

	'&xor	($b,$c);',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59()	if ($rx==39);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19);',	# ($b^$d^$c)
	'&mov	(@T[1],$a);',	# $b in next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j< 79);',	# $b^$d for next round

	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40);',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40);',	# restore $c

	'&$_ror	($b,7);',	# $b>>>2
	'&mov	(@T[1],$a);',	# $b for next round
	'&xor	(@T[0],$c);',

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59);',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59);',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
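
# Note on the mechanism: each body_* sub above returns a list of Perl
# snippet strings; the Xupdate_* subs collect four rounds' worth into
# @insns and eval() them one at a time between the SIMD instructions.
# This is how the IALU rounds get "knitted in" between the SSE/AVX
# groups.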
######
sub bodyx_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(b&c)^(~b&d), $e+=X[]+K
	return &bodyx_20_39()	if ($rx==19);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&rorx	($b,$b,2)			if ($j==0);'.	# $b>>>2
	'&rorx	($b,@T[1],7)			if ($j!=0);',	# $b>>>2
	'&lea	($e,&DWP(0,$e,@T[0]));',
	'&rorx	(@T[0],$a,5);',

	'&andn	(@T[1],$a,$c);',
	'&and	($a,$b)',
	'&add	($d,&DWP(4*(($j+1)&15),"esp"));',	# X[]+K xfer

	'&xor	(@T[1],$a)',
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub bodyx_20_39 () {	# b^d^c
	# on start $b=b^c^d
	return &bodyx_40_59()	if ($rx==39);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&add	($e,($j==19?@T[0]:$b))',
	'&rorx	($b,@T[1],7);',	# $b>>>2
	'&rorx	(@T[0],$a,5);',

	'&xor	($a,$b)				if ($j<79);',
	'&add	($d,&DWP(4*(($j+1)&15),"esp"))	if ($j<79);',	# X[]+K xfer
	'&xor	($a,$c)				if ($j<79);',
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub bodyx_40_59 () {	# ((b^c)&(c^d))^c
	# on start $b=((b^c)&(c^d))^c
	return &bodyx_20_39()	if ($rx==59);	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.

	'&rorx	(@T[0],$a,5)',
	'&lea	($e,&DWP(0,$e,$b))',
	'&rorx	($b,@T[1],7)',	# $b>>>2
	'&add	($d,&DWP(4*(($j+1)&15),"esp"))',	# X[]+K xfer

	'&mov	(@T[1],$c)',
	'&xor	($a,$b)',	# b^c for next round
	'&xor	(@T[1],$b)',	# c^d for next round

	'&and	($a,@T[1])',
	'&add	($e,@T[0])',
	'&xor	($a,$b)'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
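
# Note: the bodyx_* variants above schedule the rounds around the
# BMI-style rorx/andn instructions; they do not appear to be referenced
# by the update loops below, which use body_* throughout.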

&set_label("loop",16);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	($B,$C);
	&mov	(&DWP(12,@T[1]),$D);
	&xor	($B,$D);
	&mov	(&DWP(16,@T[1]),$E);
	&mov	(@T[1],@T[0]);
	&pshufd	(@X[0],@X[-4&7],0xee);		# was &movdqa	(@X[0],@X[-3&7]);
	&and	(@T[0],$B);
	&mov	($B,@T[1]);

	&jmp	(&label("loop"));

&set_label("done",16);		$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);

&function_end("_sha1_block_data_order_ssse3");

$rx=0;	# reset

if ($ymm) {
my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
my @V=($A,$B,$C,$D,$E);
my $j=0;			# hash round
my @T=($T,$tmp1);
my $inp;

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

&function_begin("_sha1_block_data_order_avx");
	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("avx_shortcut");
	&vzeroall();

	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
	&vmovdqa(@X[4],&QWP(16,$tmp1));		# K_20_39
	&vmovdqa(@X[5],&QWP(32,$tmp1));		# K_40_59
	&vmovdqa(@X[6],&QWP(48,$tmp1));		# K_60_79
	&vmovdqa(@X[2],&QWP(64,$tmp1));		# pbswap mask

	&mov	($E,&wparam(0));		# load argument block
	&mov	($inp=@T[1],&wparam(1));
	&mov	($D,&wparam(2));
	&mov	(@T[0],"esp");

	# stack frame layout
	#
	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
	#
	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
	#	X[4]	X[5]	X[6]	X[7]
	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
	#
	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
	#	K_40_59	K_40_59	K_40_59	K_40_59
	#	K_60_79	K_60_79	K_60_79	K_60_79
	#	K_00_19	K_00_19	K_00_19	K_00_19
	#	pbswap mask
	#
	# +192	ctx				# argument block
	# +196	inp
	# +200	end
	# +204	esp
	&sub	("esp",208);
	&and	("esp",-64);

	&vmovdqa(&QWP(112+0,"esp"),@X[4]);	# copy constants
	&vmovdqa(&QWP(112+16,"esp"),@X[5]);
	&vmovdqa(&QWP(112+32,"esp"),@X[6]);
	&shl	($D,6);				# len*64
	&vmovdqa(&QWP(112+48,"esp"),@X[3]);
	&add	($D,$inp);			# end of input
	&vmovdqa(&QWP(112+64,"esp"),@X[2]);
	&add	($inp,64);
	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
	&mov	(&DWP(192+4,"esp"),$inp);
	&mov	(&DWP(192+8,"esp"),$D);
	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp

	&mov	($A,&DWP(0,$E));		# load context
	&mov	($B,&DWP(4,$E));
	&mov	($C,&DWP(8,$E));
	&mov	($D,&DWP(12,$E));
	&mov	($E,&DWP(16,$E));
	&mov	(@T[0],$B);			# magic seed

	&vmovdqu(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
	&vmovdqu(@X[-3&7],&QWP(-48,$inp));
	&vmovdqu(@X[-2&7],&QWP(-32,$inp));
	&vmovdqu(@X[-1&7],&QWP(-16,$inp));
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&vpshufb(@X[-3&7],@X[-3&7],@X[2]);
	&vpshufb(@X[-2&7],@X[-2&7],@X[2]);
	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
	&vpshufb(@X[-1&7],@X[-1&7],@X[2]);
	&vpaddd	(@X[0],@X[-4&7],@X[3]);		# add K_00_19
	&vpaddd	(@X[1],@X[-3&7],@X[3]);
	&vpaddd	(@X[2],@X[-2&7],@X[3]);
	&vmovdqa(&QWP(0,"esp"),@X[0]);		# X[]+K xfer to IALU
	&mov	(@T[1],$C);
	&vmovdqa(&QWP(0+16,"esp"),@X[1]);
	&xor	(@T[1],$D);
	&vmovdqa(&QWP(0+32,"esp"),@X[2]);
	&and	(@T[0],@T[1]);
	&jmp	(&label("loop"));

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@X[2],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[2],@X[2],@X[-2&7]);		# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@X[2],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@X[4],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@X[3],@X[4],30);
	&vpor	(@X[0],@X[0],@X[2]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@X[4],@X[4],2);
	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[4]);		# "X[0]"^=("X[0]"<<96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[2],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));
	 if ($Xi%5) {
	  &vmovdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
	 } else {			# ... or load next one
	  &vmovdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
	 }
	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@X[2],@X[0],30);
	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@X[2]);	# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&mov	($inp=@T[1],&DWP(192+4,"esp"));
	&cmp	($inp,&DWP(192+8,"esp"));
	&je	(&label("done"));

	&vmovdqa(@X[3],&QWP(112+48,"esp"));	# K_00_19
	&vmovdqa(@X[2],&QWP(112+64,"esp"));	# pbswap mask
	&vmovdqu(@X[-4&7],&QWP(0,$inp));	# load input
	&vmovdqu(@X[-3&7],&QWP(16,$inp));
	&vmovdqu(@X[-2&7],&QWP(32,$inp));
	&vmovdqu(@X[-1&7],&QWP(48,$inp));
	&add	($inp,64);
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);		# byte swap
	&mov	(&DWP(192+4,"esp"),$inp);
	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb	(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa	(&QWP(0+16*$Xi,"esp"),@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

&set_label("loop",16);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	($B,$C);
	&mov	(&DWP(8,@T[1]),$C);
	&xor	($B,$D);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);
	&mov	(@T[1],@T[0]);
	&and	(@T[0],$B);
	&mov	($B,@T[1]);

	&jmp	(&label("loop"));

&set_label("done",16);		$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

	&vzeroall();

	&mov	(@T[1],&DWP(192,"esp"));	# update context
	&add	($A,&DWP(0,@T[1]));
	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
	&add	(@T[0],&DWP(4,@T[1]));		# $b
	&add	($C,&DWP(8,@T[1]));
	&mov	(&DWP(0,@T[1]),$A);
	&add	($D,&DWP(12,@T[1]));
	&mov	(&DWP(4,@T[1]),@T[0]);
	&add	($E,&DWP(16,@T[1]));
	&mov	(&DWP(8,@T[1]),$C);
	&mov	(&DWP(12,@T[1]),$D);
	&mov	(&DWP(16,@T[1]),$E);
&function_end("_sha1_block_data_order_avx");
}
&set_label("K_XX_XX",64);
&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999);	# K_00_19
&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1);	# K_20_39
&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc);	# K_40_59
&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6);	# K_60_79
&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
&data_byte(0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0);
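# For the record, the four round constants above are floor(2^30*sqrt(x))
# for x = 2, 3, 5 and 10 respectively, the values specified for SHA-1.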
}
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();
