#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
9d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# June 2011
#
# This is an RC4+MD5 "stitch" implementation. The idea, as spelled out
# in http://download.intel.com/design/intarch/papers/323686.pdf, is that
# since both algorithms exhibit instruction-level parallelism, ILP,
# below the theoretical maximum, interleaving them makes better use of
# processor resources and so achieves better performance. The RC4
# instruction sequence is virtually identical to rc4-x86_64.pl, which
# is heavily based on a submission by Maxim Perminov, Maxim Locktyukhin
# and Jim Guilford of Intel. MD5 is a fresh implementation aiming to
# minimize register usage, which was used as the "main thread" with RC4
# woven into it, one RC4 round per one MD5 round. In addition to the
# stitched subroutine the script can generate standalone replacements
# for md5_block_asm_data_order and RC4. Below are performance numbers
# in cycles per processed byte, less is better, for the standalone
# subroutines, the sum of them, and the stitched one:
26d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#
#		RC4	MD5	RC4+MD5	stitch	gain
# Opteron	6.5(*)	5.4	11.9	7.0	+70%(*)
# Core2		6.5	5.8	12.3	7.7	+60%
# Westmere	4.3	5.2	9.5	7.0	+36%
# Sandy Bridge	4.2	5.5	9.7	6.8	+43%
# Atom		9.3	6.5	15.8	11.1	+42%
#
# (*)	rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
#	is +53%...
36d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Generator switches: which of the two primitives the emitted
# subroutine contains.
my ($rc4,$md5)=(1,1);	# what to generate?

# $D is interpolated in front of the "$D#md5#" template lines and is
# tested again when the templates are assembled. It is declared
# unconditionally because "my $x = ... if COND" has undefined
# behaviour in Perl (perlsyn, "Statement Modifiers").
my $D = $md5 ? "" : "#";	# if set to "#", MD5 is stitched into RC4(),
			# but its result is discarded. Idea here is
			# to be able to use 'openssl speed rc4' for
			# benchmarking the stitched subroutine...

# Command line: [flavour] [output]. A first argument containing a dot
# is taken to be the output file name rather than a flavour.
my $flavour = shift;
my $output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator in this script's own directory first, then
# in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on STDOUT is a pipe into the translator. Fail loudly if the
# pipe cannot be started instead of silently producing no output; the
# translator path is quoted so that directories with spaces work.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Names of the registers in which the generated function receives its
# arguments, plus the exported symbol name and argument count. Which
# set is used depends on the $rc4/$md5 generator switches.
my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);

if ($rc4 && !$md5) {
  # Standalone RC4: four arguments.
  ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
  $func="RC4";				$nargs=4;
} elsif ($md5 && !$rc4) {
  # Standalone MD5 block function: three arguments.
  ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
  $func="md5_block_asm_data_order";	$nargs=3;
} else {
  # Stitched RC4+MD5: six arguments.
  ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
  $func="rc4_md5_enc";			$nargs=6;
  # void rc4_md5_enc(
  #		RC4_KEY *key,		#
  #		const void *in0,	# RC4 input
  #		void *out,		# RC4 output
  #		MD5_CTX *ctx,		#
  #		const void *inp,	# MD5 input
  #		size_t len);		# number of 64-byte blocks
}
76d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# The 64 additive MD5 round constants, indexed by round number (one row
# group of 16 per MD5 round function R0..R3).
my @K=(	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
	0x6b901122,0xfd987193,0xa679438e,0x49b40821,

	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,

	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,

	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391	);

# The four 32-bit MD5 working variables; the list is rotated after each
# generated round so that the round subs always see (a,b,c,d).
my @V=("%r8d","%r9d","%r10d","%r11d");	# MD5 registers
# Scratch register in which the MD5 round function value is built.
my $tmp="%r12d";

# RC4 state: $XX[0] is the x index and $XX[1] a pointer derived from it,
# @TX are two alternating temporaries for table entries fetched by x,
# $YY is the y index and $TY the temporary for the entry fetched by y.
my @XX=("%rbp","%rsi");			# RC4 registers
my @TX=("%rax","%rbx");
my $YY="%rcx";
my $TY="%rdx";

# Number of RC4 rounds generated against one value of $XX[1] before the
# x index is advanced and the pointer recomputed.
my $MOD=32;				# 16, 32 or 64
106d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Function prologue: return early on a zero length, save the
# callee-saved registers used below and reserve 40 bytes of stack.
# The templates below use that area as follows:
#    0..15(%rsp)  hash value put aside at the top of each iteration
#   16(%rsp)      pointer to the end of input
#   24(%rsp)      pointer to MD5_CTX
#   32(%rsp)      original length (RC4-only build)
$code.=<<___;
.text
.align 16

.globl	$func
.type	$func,\@function,$nargs
$func:
	cmp	\$0,$len
	je	.Labort
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$40,%rsp
.Lbody:
___
if ($rc4) {
# Move the incoming arguments out of the registers that the RC4 code
# uses for its own state.
$code.=<<___;
$D#md5#	mov	$ctx,%r11		# reassign arguments
	mov	$len,%r12
	mov	$in0,%r13
	mov	$out,%r14
$D#md5#	mov	$inp,%r15
___
    # Keep the Perl-side register names in step with the moves above.
    $ctx="%r11"	if ($md5);		# reassign arguments
    $len="%r12";
    $in0="%r13";
    $out="%r14";
    $inp="%r15"	if ($md5);
    $inp=$in0	if (!$md5);
# Load the x and y indices from the key structure, advance $dat past
# them, turn $out into an offset relative to $in0 and prefetch the
# table entry for x+1.
$code.=<<___;
	xor	$XX[0],$XX[0]
	xor	$YY,$YY

	lea	8($dat),$dat
	mov	-8($dat),$XX[0]#b
	mov	-4($dat),$YY#b

	inc	$XX[0]#b
	sub	$in0,$out
	movl	($dat,$XX[0],4),$TX[0]#d
___
# RC4-only build: inputs shorter than 128 bytes go straight to the
# byte-at-a-time loop; otherwise single bytes are processed until the
# x index is a multiple of $MOD, and the remaining length is saved
# before being converted to a count of 64-byte blocks.
$code.=<<___ if (!$md5);
	xor	$TX[1],$TX[1]
	test	\$-128,$len
	jz	.Loop1
	sub	$XX[0],$TX[1]
	and	\$`$MOD-1`,$TX[1]
	jz	.Loop${MOD}_is_hot
	sub	$TX[1],$len
.Loop${MOD}_warmup:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($in0),$TY#b
	movb	$TY#b,($out,$in0)
	lea	1($in0),$in0
	dec	$TX[1]
	jnz	.Loop${MOD}_warmup

	mov	$YY,$TX[1]
	xor	$YY,$YY
	mov	$TX[1]#b,$YY#b

.Loop${MOD}_is_hot:
	mov	$len,32(%rsp)		# save original $len
	shr	\$6,$len		# number of 64-byte blocks
___
  # Benchmark build ($D set, MD5 not requested): enable the MD5
  # templates anyway, with the stack area standing in for the MD5
  # context and the RC4 input standing in for the MD5 input.
  if ($D && !$md5) {			# stitch in dummy MD5
    $md5=1;
    $ctx="%r11";
    $inp="%r15";
    $code.=<<___;
	mov	%rsp,$ctx
	mov	$in0,$inp
___
  }
}
# Common loop set-up: convert the block count into an end-of-input
# pointer, remember the context pointer, load the current hash value
# and, at the top of every iteration, put that value aside on the stack.
$code.=<<___;
#rc4#	add	$TX[0]#b,$YY#b
#rc4#	lea	($dat,$XX[0],4),$XX[1]
	shl	\$6,$len
	add	$inp,$len		# pointer to the end of input
	mov	$len,16(%rsp)

#md5#	mov	$ctx,24(%rsp)		# save pointer to MD5_CTX
#md5#	mov	0*4($ctx),$V[0]		# load current hash value from MD5_CTX
#md5#	mov	1*4($ctx),$V[1]
#md5#	mov	2*4($ctx),$V[2]
#md5#	mov	3*4($ctx),$V[3]
	jmp	.Loop

.align	16
.Loop:
#md5#	mov	$V[0],0*4(%rsp)		# put aside current hash value
#md5#	mov	$V[1],1*4(%rsp)
#md5#	mov	$V[2],2*4(%rsp)
#md5#	mov	$V[3],$tmp		# forward reference
#md5#	mov	$V[3],3*4(%rsp)
___
214d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# R0($i,$a,$b,$c,$d) - append to $code one step of the first MD5 round
# (steps 0..15) interleaved with one RC4 step.
#
#   $i           step number; selects $K[$i], the rotate amount and the
#                message word 4*($i%16)($inp)
#   $a,$b,$c,$d  names of the registers holding the MD5 variables in
#                their role for this step
#
# MD5 side: $tmp must already hold $d on entry (the previous step's
# "forward reference" move); the step computes (($tmp^$c)&$b)^$d, adds
# the message word and constant to $a, rotates and adds $b, then loads
# $tmp for the following step ($b after the last step of the round,
# $c otherwise).
# RC4 side: swaps the table entries at x and y and inserts the table
# entry selected by their sum into %xmm0 (even steps) or %xmm1 (odd
# steps), which are cleared on steps 0 and 1. On step 15 the first 16
# input bytes are loaded into %xmm2 and combined with both
# accumulators, %xmm1 being shifted left by 8 bits first.
# When the step is also the last of a $MOD group, the x index is
# advanced by $MOD, y is reloaded from its own low byte and $XX[1] is
# recomputed.
sub R0 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot0=(7,12,17,22);
  my $j=$i%16;
  my $k=$i%$MOD;
  my $xmm="%xmm".($j&1);
    $code.="	movdqu	($in0),%xmm2\n"		if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	and	$b,$tmp
#md5#	add	4*`$j`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#md5#	xor	$d,$tmp
#rc4#	movz	$TX[0]#b,$TX[0]#d
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot0[$j%4],$a
#md5#	mov	`$j==15?"$b":"$c"`,$tmp		# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm2
	pxor	%xmm1,%xmm2
___
}
# R1($i,$a,$b,$c,$d) - append to $code one step of the second MD5 round
# (steps 16..31) interleaved with one RC4 step.
#
#   $i           step number; selects $K[$i], the rotate amount and the
#                message word 4*((1+5*($i%16))%16)($inp)
#   $a,$b,$c,$d  names of the registers holding the MD5 variables in
#                their role for this step
#
# MD5 side: $tmp must already hold $c on entry; the step computes
# (($tmp^$b)&$d)^$c, adds the message word and constant to $a, rotates
# and adds $b, then loads $tmp for the following step ($c after the
# last step of the round, $b otherwise).
# RC4 side: same sequence as in the first round, except that on step
# 15 of the round the second 16 input bytes (offset 16) are loaded into
# %xmm3 and combined with the two accumulators.
sub R1 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot1=(5,9,14,20);
  my $j=$i%16;
  my $k=$i%$MOD;
  my $xmm="%xmm".($j&1);
    $code.="	movdqu	16($in0),%xmm3\n"	if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$b,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	and	$d,$tmp
#md5#	add	4*`((1+5*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#md5#	xor	$c,$tmp
#rc4#	movz	$TX[0]#b,$TX[0]#d
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot1[$j%4],$a
#md5#	mov	`$j==15?"$c":"$b"`,$tmp		# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3
___
}
# R2($i,$a,$b,$c,$d) - append to $code one step of the third MD5 round
# (steps 32..47) interleaved with one RC4 step.
#
#   $i           step number; selects $K[$i], the rotate amount and the
#                message word 4*((5+3*($i%16))%16)($inp)
#   $a,$b,$c,$d  names of the registers holding the MD5 variables in
#                their role for this step
#
# MD5 side: $tmp must already hold $d on entry; the step computes
# $tmp^$c^$b, adds the message word and constant to $a, rotates and
# adds $b, then loads $tmp for the following step (the constant -1
# after the last step of the round, $c otherwise).
# RC4 side: same sequence as in the first round, except that on step
# 15 of the round the third 16 input bytes (offset 32) are loaded into
# %xmm4 and combined with the two accumulators.
sub R2 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot2=(4,11,16,23);
  my $j=$i%16;
  my $k=$i%$MOD;
  my $xmm="%xmm".($j&1);
    $code.="	movdqu	32($in0),%xmm4\n"	if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	xor	$b,$tmp
#md5#	add	4*`((5+3*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#rc4#	movz	$TX[0]#b,$TX[0]#d
#md5#	add	$tmp,$a
#rc4#	movl	$TY#d,4*$k($XX[1])
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot2[$j%4],$a
#md5#	mov	`$j==15?"\\\$-1":"$c"`,$tmp	# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm4
___
}
# R3($i,$a,$b,$c,$d) - append to $code one step of the fourth MD5 round
# (steps 48..63) interleaved with one RC4 step.
#
#   $i           step number; selects $K[$i], the rotate amount and the
#                message word 4*((7*($i%16))%16)($inp)
#   $a,$b,$c,$d  names of the registers holding the MD5 variables in
#                their role for this step
#
# MD5 side: $tmp must already hold -1 on entry; the step computes
# (($tmp^$d)|$b)^$c, adds the message word and constant to $a, rotates
# and adds $b, then reloads $tmp with -1.
# RC4 side: same sequence as in the first round, except that on step
# 15 of the round the fourth 16 input bytes (offset 48) are loaded into
# %xmm5 and combined with the two accumulators. On that step both the
# x and y indices are also reloaded from their own low bytes and
# $XX[1] is recomputed, whether or not the step ends a $MOD group.
sub R3 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot3=(6,10,15,21);
  my $j=$i%16;
  my $k=$i%$MOD;
  my $xmm="%xmm".($j&1);
    $code.="	movdqu	48($in0),%xmm5\n"	if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$d,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	or	$b,$tmp
#md5#	add	4*`((7*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#rc4#	movz	$TX[0]#b,$TX[0]#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot3[$j%4],$a
#md5#	mov	\$-1,$tmp			# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    $code.=<<___ if ($rc4 && $j==15);
	mov	$XX[0],$XX[1]
	xor	$XX[0],$XX[0]			# keyword to partial register
	mov	$XX[1]#b,$XX[0]#b
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm5
	pxor	%xmm1,%xmm5
___
}
375d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Generate the 64 interleaved steps of one loop iteration. After every
# step the MD5 register list is rotated right by one, so the register
# just updated becomes "b" for the next step, and the two RC4
# temporaries swap roles. Both lists are back in their initial order
# once all 64 steps have been generated.
my $i=0;
for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }

# Loop tail: add the hash value put aside at the top of the iteration,
# store the four 16-byte RC4 output blocks, advance the pointers and
# loop until the end-of-input pointer saved at 16(%rsp) is reached.
# After the loop the register named by $len is reused to hold the
# MD5_CTX pointer reloaded from 24(%rsp).
$code.=<<___;
#md5#	add	0*4(%rsp),$V[0]		# accumulate hash value
#md5#	add	1*4(%rsp),$V[1]
#md5#	add	2*4(%rsp),$V[2]
#md5#	add	3*4(%rsp),$V[3]

#rc4#	movdqu	%xmm2,($out,$in0)	# write RC4 output
#rc4#	movdqu	%xmm3,16($out,$in0)
#rc4#	movdqu	%xmm4,32($out,$in0)
#rc4#	movdqu	%xmm5,48($out,$in0)
#md5#	lea	64($inp),$inp
#rc4#	lea	64($in0),$in0
	cmp	16(%rsp),$inp		# are we done?
	jb	.Loop

#md5#	mov	24(%rsp),$len		# restore pointer to MD5_CTX
#rc4#	sub	$TX[0]#b,$YY#b		# correct $YY
#md5#	mov	$V[0],0*4($len)		# write MD5_CTX
#md5#	mov	$V[1],1*4($len)
#md5#	mov	$V[2],2*4($len)
#md5#	mov	$V[3],3*4($len)
___
404d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___ if ($rc4 && (!$md5 || $D));
405d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	32(%rsp),$len		# restore original $len
406d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	and	\$63,$len		# remaining bytes
407d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Loop1
408d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Ldone
409d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
410d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
411d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Loop1:
412d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	$TX[0]#b,$YY#b
413d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movl	($dat,$YY,4),$TY#d
414d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movl	$TX[0]#d,($dat,$YY,4)
415d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movl	$TY#d,($dat,$XX[0],4)
416d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	$TY#b,$TX[0]#b
417d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	inc	$XX[0]#b
418d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movl	($dat,$TX[0],4),$TY#d
419d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movl	($dat,$XX[0],4),$TX[0]#d
420d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xorb	($in0),$TY#b
421d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	movb	$TY#b,($out,$in0)
422d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	1($in0),$in0
423d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	dec	$len
424d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnz	.Loop1
425d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
426d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Ldone:
427d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
428d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
429d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#rc4#	sub	\$1,$XX[0]#b
430d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#rc4#	movl	$XX[0]#d,-8($dat)
431d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#rc4#	movl	$YY#d,-4($dat)
432d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
433d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	40(%rsp),%r15
434d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	48(%rsp),%r14
435d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	56(%rsp),%r13
436d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	64(%rsp),%r12
437d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	72(%rsp),%rbp
438d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	80(%rsp),%rbx
439d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	88(%rsp),%rsp
440d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lepilogue:
441d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Labort:
442d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
443d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size $func,.-$func
444d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
445d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
446d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyif ($rc4 && $D) {	# sole purpose of this section is to provide
447d9e397b599b13d642138480a28c14db7a136bf0Adam Langley			# option to use the generated module as drop-in
448d9e397b599b13d642138480a28c14db7a136bf0Adam Langley			# replacement for rc4-x86_64.pl for debugging
449d9e397b599b13d642138480a28c14db7a136bf0Adam Langley			# and testing purposes...
450d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($idx,$ido)=("%r8","%r9");
451d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
452d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# RC4_set_key, with $dat=%rdi, $len=%rsi, $inp=%rdx:
#   * $dat is advanced by 8, so the two 32-bit words cleared at the end sit
#     at -8($dat)/-4($dat) and the table of 32-bit words starts at 0($dat).
#   * $inp is moved to the end of the key and $len is negated, so
#     ($inp,$len,1) walks the key forwards while $len counts up to zero;
#     the negated length is also kept in %rcx.
#   * .Lw1stloop stores i at table[i]; %al wraps after 255 and the carry
#     from "add 1,%al" ends the loop.
#   * .Lw2ndloop adds the current key byte and table[$ido] to $idx (8-bit),
#     then swaps table[$ido] and table[$idx].  When "add 1,$len" reaches
#     zero, cmovz reloads the negated length from %rcx so the key repeats.
#     The loop ends on the carry out of the 8-bit $ido increment.
# RC4_options returns the address of the string "rc4(64x,int)".
453d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
454d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	RC4_set_key
455d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	RC4_set_key,\@function,3
456d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
457d9e397b599b13d642138480a28c14db7a136bf0Adam LangleyRC4_set_key:
458d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	8($dat),$dat
459d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	($inp,$len),$inp
460d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	neg	$len
461d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$len,%rcx
462d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	%eax,%eax
463d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	$ido,$ido
464d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	%r10,%r10
465d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	%r11,%r11
466d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jmp	.Lw1stloop
467d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
468d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
469d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lw1stloop:
470d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%eax,($dat,%rax,4)
471d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$1,%al
472d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnc	.Lw1stloop
473d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
474d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	$ido,$ido
475d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	$idx,$idx
476d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
477d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lw2ndloop:
478d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	($dat,$ido,4),%r10d
479d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	($inp,$len,1),$idx#b
480d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	%r10b,$idx#b
481d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$1,$len
482d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	($dat,$idx,4),%r11d
483d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmovz	%rcx,$len
484d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r10d,($dat,$idx,4)
485d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r11d,($dat,$ido,4)
486d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$1,$ido#b
487d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jnc	.Lw2ndloop
488d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
489d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	%eax,%eax
490d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%eax,-8($dat)
491d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%eax,-4($dat)
492d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
493d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	RC4_set_key,.-RC4_set_key
494d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
495d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.globl	RC4_options
496d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	RC4_options,\@abi-omnipotent
497d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
498d9e397b599b13d642138480a28c14db7a136bf0Adam LangleyRC4_options:
499d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	.Lopts(%rip),%rax
500d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
501d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	64
502d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lopts:
503d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.asciz	"rc4(64x,int)"
504d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	64
505d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	RC4_options,.-RC4_options
506d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
507d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
508d9e397b599b13d642138480a28c14db7a136bf0Adam Langley# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
509d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
510d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyif ($win64) {
511d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $rec="%rcx";
512d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $frame="%rdx";
513d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $context="%r8";
514d9e397b599b13d642138480a28c14db7a136bf0Adam Langleymy $disp="%r9";
515d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Win64 exception handler for $func (emitted only when $win64 is set).
# It patches *context according to where context->Rip lies:
#   Rip <  .Lbody      frame base = context->Rax, no registers reloaded
#   Rip >= .Lepilogue  frame base = context->Rsp, no registers reloaded
#   otherwise          %r15,%r14,%r13,%r12,%rbp,%rbx are read from
#                      40..80(context->Rsp) and written into the context;
#                      frame base = context->Rsp+88
# In every case Rdi/Rsi are then read from 8/16(frame base) and stored
# together with the frame base as context->Rdi/Rsi/Rsp.  The context is
# copied to disp->ContextRecord, RtlVirtualUnwind is called with the fields
# of *disp, and the handler returns 1 (ExceptionContinueSearch).
# The .pdata/.xdata records attach se_handler to the range
# .LSEH_begin_$func .. .LSEH_end_$func.
# NOTE(review): the asm comment on the %r13 store reads "R12"; offset 224
# lies between R12 (216) and R14 (232), i.e. it is context->R13.
# NOTE(review): 154 is the quadword count for "rep movsq", not a byte size.
516d9e397b599b13d642138480a28c14db7a136bf0Adam Langley$code.=<<___;
517d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.extern	__imp_RtlVirtualUnwind
518d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.type	se_handler,\@abi-omnipotent
519d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	16
520d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyse_handler:
521d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rsi
522d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rdi
523d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbx
524d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%rbp
525d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r12
526d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r13
527d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r14
528d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	push	%r15
529d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pushfq
530d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	sub	\$64,%rsp
531d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
532d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	120($context),%rax	# pull context->Rax
533d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	248($context),%rbx	# pull context->Rip
534d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
535d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	.Lbody(%rip),%r10
536d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	%r10,%rbx		# context->Rip<.Lbody
537d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jb	.Lin_prologue
538d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
539d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	152($context),%rax	# pull context->Rsp
540d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
541d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	.Lepilogue(%rip),%r10
542d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
543d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	jae	.Lin_prologue
544d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
545d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	40(%rax),%r15
546d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	48(%rax),%r14
547d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	56(%rax),%r13
548d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	64(%rax),%r12
549d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	72(%rax),%rbp
550d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	80(%rax),%rbx
551d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	88(%rax),%rax
552d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
553d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rbx,144($context)	# restore context->Rbx
554d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rbp,160($context)	# restore context->Rbp
555d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r12,216($context)	# restore context->R12
556d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r13,224($context)	# restore context->R12
557d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r14,232($context)	# restore context->R14
558d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r15,240($context)	# restore context->R15
559d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
560d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.Lin_prologue:
561d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	8(%rax),%rdi
562d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	16(%rax),%rsi
563d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rax,152($context)	# restore context->Rsp
564d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rsi,168($context)	# restore context->Rsi
565d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rdi,176($context)	# restore context->Rdi
566d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
567d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	40($disp),%rdi		# disp->ContextRecord
568d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$context,%rsi		# context
569d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	\$154,%ecx		# sizeof(CONTEXT)
570d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.long	0xa548f3fc		# cld; rep movsq
571d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
572d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	$disp,%rsi
573d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
574d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
575d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	0(%rsi),%r8		# arg3, disp->ControlPc
576d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
577d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	40(%rsi),%r10		# disp->ContextRecord
578d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	56(%rsi),%r11		# &disp->HandlerData
579d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	lea	24(%rsi),%r12		# &disp->EstablisherFrame
580d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r10,32(%rsp)		# arg5
581d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r11,40(%rsp)		# arg6
582d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%r12,48(%rsp)		# arg7
583d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	%rcx,56(%rsp)		# arg8, (NULL)
584d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	call	*__imp_RtlVirtualUnwind(%rip)
585d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
586d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	mov	\$1,%eax		# ExceptionContinueSearch
587d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	add	\$64,%rsp
588d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	popfq
589d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%r15
590d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%r14
591d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%r13
592d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%r12
593d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%rbp
594d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%rbx
595d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%rdi
596d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	pop	%rsi
597d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	ret
598d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.size	se_handler,.-se_handler
599d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
600d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.section	.pdata
601d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	4
602d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.LSEH_begin_$func
603d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.LSEH_end_$func
604d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	.LSEH_info_$func
605d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
606d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.section	.xdata
607d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.align	8
608d9e397b599b13d642138480a28c14db7a136bf0Adam Langley.LSEH_info_$func:
609d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.byte	9,0,0,0
610d9e397b599b13d642138480a28c14db7a136bf0Adam Langley	.rva	se_handler
611d9e397b599b13d642138480a28c14db7a136bf0Adam Langley___
612d9e397b599b13d642138480a28c14db7a136bf0Adam Langley}
613d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Translate a register name into the sub-register selected by a "#b", "#w"
# or "#d" suffix in the assembly template.
#   $_[0]  register name as written in the template, e.g. "%rax" or "%r10"
#   $_[1]  width selector: "b" (byte), "w" (word) or "d" (dword)
# Returns the rewritten name, e.g. %rax,b -> %al; %rsi,b -> %sil;
# %rcx,d -> %ecx; %r10,b -> %r10b.  An unrecognised selector leaves a
# lettered register unchanged.
sub reg_part {
    my ($name,$width)=@_;

    # Numbered registers (%r8 and up) just take the selector as a suffix.
    return $name.$width if ($name =~ /%r[0-9]+/);

    if ($width eq "b") {
        # Drop the e/r prefix and a trailing "x", then append "l".
        $name =~ s/%[er]([^x]+)x?/%$1l/;
    }
    elsif ($width eq "w") {
        # Drop the e/r prefix.
        $name =~ s/%[er](.+)/%$1/;
    }
    elsif ($width eq "d") {
        # Force the "e" prefix.
        $name =~ s/%[er](.+)/%e$1/;
    }

    return $name;
}
622d9e397b599b13d642138480a28c14db7a136bf0Adam Langley
# Post-process the accumulated template in $code and write it to STDOUT.
#   1. "%reg#b", "%reg#w", "%reg#d" are replaced with the matching
#      sub-register name via reg_part().
#   2. Text between backticks is evaluated as Perl and replaced by the result.
#   3. "pinsrw $0," is rewritten to "movd".
#   4. The #md5# / #rc4# tags are stripped when the respective algorithm is
#      enabled, which activates the lines that carry them.
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/pinsrw\s+\$0,/movd	/gm;

$code =~ s/#md5#//gm	if ($md5);
$code =~ s/#rc4#//gm	if ($rc4);

print $code;

# Buffered output is flushed at close, so a write failure (full disk, broken
# pipe) only shows up here.  Fail loudly instead of exiting 0 with a
# truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";
633