1// Original source:
2//	http://www.zorinaq.com/papers/rc4-amd64.html
3//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
4
5#include "textflag.h"
6
7// Local modifications:
8//
9// Transliterated from GNU to 6a assembly syntax by the Go authors.
10// The comments and spacing are from the original.
11//
12// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
13//
14// The original code accumulated 64 bits of key stream in an integer
15// register and then XOR'ed the key stream into the data 8 bytes at a time.
16// Modified to accumulate 128 bits of key stream into an XMM register
17// and then XOR the key stream into the data 16 bytes at a time.
18// Approximately doubles throughput.
19
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
//
// EXTEND(r) rewrites r with the zero-extension of its own low byte.
// The code below applies it after 8-bit arithmetic (INCB/ADDB) on a register
// that is subsequently used as an index into the key state array.
#define EXTEND(r) MOVBLZX r, r
23
24/*
25** RC4 implementation optimized for AMD64.
26**
27** Author: Marc Bevand <bevand_m (at) epita.fr>
28** Licence: I hereby disclaim the copyright on this code and place it
29** in the public domain.
30**
31** The code has been designed to be easily integrated into openssl:
32** the exported RC4() function can replace the actual implementations
33** openssl already contains. Please note that when linking with openssl,
34** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
35** with -DRC4_INT='unsigned long'.
36**
37** The throughput achieved by this code is about 320 MBytes/sec, on
38** a 1.8 GHz AMD Opteron (rev C0) processor.
39*/
40
// xorKeyStream generates n bytes of RC4 key stream and XORs them with the
// bytes at src, storing the result at dst. The key state and the two RC4
// indices are updated in place.
//
// Stack arguments, as read by the code below:
//	dst+0(FP)	output pointer
//	src+8(FP)	input pointer
//	n+16(FP)	number of bytes to process
//	state+24(FP)	pointer to the key state: 32-bit words, indexed by a
//			byte value, of which only the low byte is used
//	i+32(FP)	pointer to the byte holding index x (read on entry,
//			written back on exit)
//	j+40(FP)	pointer to the byte holding index y (read on entry,
//			written back on exit)
//
// Register roles:
//	SI  = in (current input pointer)
//	DI  = out (current output pointer)
//	R9  = limit (in+len; temporarily in+len-16 inside the 16-byte loop)
//	R10 = d (base of the key state)
//	CX  = x, DX = y (kept zero-extended so they can be used as indices)
//	AX, BX, R8 = scratch (tx, ty, val)
//	R12 = &d[x] at the top of each 16-byte iteration
//	X0, X1 = key stream accumulators, X2 = data being XOR'ed
//
// The state base is kept in R10 rather than BP. On amd64 the Go toolchain
// treats BP as the callee-saved frame pointer, and this function declares a
// zero-size frame, so the assembler inserts no BP save/restore on its behalf.
//
// Structure:
//	l1       - one byte at a time until the incremented x is a multiple of 16
//	wordloop - 16 bytes at a time while at least 16 bytes remain
//	l2       - one byte at a time for whatever is left
TEXT ·xorKeyStream(SB),NOSPLIT,$0
	MOVQ	n+16(FP),	BX		// rbx = ARG(len)
	MOVQ	src+8(FP),	SI		// in = ARG(in)
	MOVQ	dst+0(FP),	DI		// out = ARG(out)
	MOVQ	state+24(FP),	R10		// d = ARG(data)
	MOVQ	i+32(FP),	AX
	MOVBQZX	0(AX),		CX		// x = *xp
	MOVQ	j+40(FP),	AX
	MOVBQZX	0(AX),		DX		// y = *yp

	LEAQ	(SI)(BX*1),	R9		// limit = in+len

	// Byte-at-a-time loop, run until x+1 reaches a multiple of 16
	// (or the input is exhausted).
l1:	CMPQ	SI,		R9		// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX
	EXTEND(CX)
	TESTL	$15,		CX
	JZ	wordloop			// x%16 == 0: switch to the 16-byte loop (x already incremented)

	MOVBLZX	(R10)(CX*4),	AX		// tx = d[x]

	ADDB	AX,		DX		// y += tx
	EXTEND(DX)
	MOVBLZX	(R10)(DX*4),	BX		// ty = d[y]
	MOVB	BX,		(R10)(CX*4)	// d[x] = ty
	ADDB	AX,		BX		// val = ty+tx
	EXTEND(BX)
	MOVB	AX,		(R10)(DX*4)	// d[y] = tx
	MOVBLZX	(R10)(BX*4),	R8		// val = d[val]
	XORB	(SI),		R8		// xor 1 byte
	MOVB	R8,		(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP l1

wordloop:
	SUBQ	$16,		R9		// limit = in+len-16
	CMPQ	SI,		R9
	JGT	end				// fewer than 16 bytes left: go to the tail loop

start:
	ADDQ	$16,		SI		// increment in
	ADDQ	$16,		DI		// increment out

	// Each KEYROUND generates one byte of key and
	// inserts it into an XMM register at the given 16-bit index.
	// The key state array is uint32 words only using the bottom
	// byte of each word, so the 16-bit OR only copies 8 useful bits.
	// We accumulate alternating bytes into X0 and X1, and then at
	// the end we combine X1<<8 into X0 to produce the actual key.
	// (The combine is a PXOR; the bytes from X0 and X1<<8 land in
	// different byte positions, given that the upper byte of each
	// inserted 16-bit word is zero as described above.)
	//
	// At the beginning of the loop, CX%16 == 0, so the 16 loads
	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
	// without fear of the byte computation CX+15 wrapping around.
	//
	// The first round needs R12[0], the second needs R12[1], and so on.
	// We can avoid memory stalls by starting the load for round n+1
	// before the end of round n, using the LOAD macro.
	LEAQ	(R10)(CX*4),	R12

// KEYROUND(xmm, load, off, r1, r2, index):
//	r1 holds tx = d[x+off] and DX already includes it (set up by the
//	previous LOAD). Swaps d[x+off] and d[y], starts the next round via
//	load((off+1), r2), and inserts d[tx+ty] as 16-bit word `index` of xmm.
#define KEYROUND(xmm, load, off, r1, r2, index) \
	MOVBLZX	(R10)(DX*4),	R8; \
	MOVB	r1,		(R10)(DX*4); \
	load((off+1), r2); \
	MOVB	R8,		(off*4)(R12); \
	ADDB	r1,		R8; \
	EXTEND(R8); \
	PINSRW	$index, (R10)(R8*4), xmm

// LOAD(off, reg): reg = d[x+off]; y += reg (kept zero-extended).
#define LOAD(off, reg) \
	MOVBLZX	(off*4)(R12),	reg; \
	ADDB	reg,		DX; \
	EXTEND(DX)

// SKIP(off, reg): expands to nothing; used as the load argument of the
// last round, which has no following round to prefetch for.
#define SKIP(off, reg)

	LOAD(0, AX)
	KEYROUND(X0, LOAD, 0, AX, BX, 0)
	KEYROUND(X1, LOAD, 1, BX, AX, 0)
	KEYROUND(X0, LOAD, 2, AX, BX, 1)
	KEYROUND(X1, LOAD, 3, BX, AX, 1)
	KEYROUND(X0, LOAD, 4, AX, BX, 2)
	KEYROUND(X1, LOAD, 5, BX, AX, 2)
	KEYROUND(X0, LOAD, 6, AX, BX, 3)
	KEYROUND(X1, LOAD, 7, BX, AX, 3)
	KEYROUND(X0, LOAD, 8, AX, BX, 4)
	KEYROUND(X1, LOAD, 9, BX, AX, 4)
	KEYROUND(X0, LOAD, 10, AX, BX, 5)
	KEYROUND(X1, LOAD, 11, BX, AX, 5)
	KEYROUND(X0, LOAD, 12, AX, BX, 6)
	KEYROUND(X1, LOAD, 13, BX, AX, 6)
	KEYROUND(X0, LOAD, 14, AX, BX, 7)
	KEYROUND(X1, SKIP, 15, BX, AX, 7)

	ADDB	$16,		CX		// x += 16 (byte add; wraps mod 256)

	PSLLQ	$8,		X1		// move the odd-position key bytes up one byte
	PXOR	X1,		X0		// X0 = 16 bytes of key stream
	MOVOU	-16(SI),	X2		// in was already advanced, so the data is at in-16
	PXOR	X0,		X2
	MOVOU	X2,		-16(DI)

	CMPQ	SI,		R9		// cmp in with in+len-16
	JLE	start				// jump if (in <= in+len-16)

end:
	DECB	CX				// undo the pre-increment of x; l2 increments it again
	ADDQ	$16,		R9		// tmp = in+len

	// handle the last bytes, one by one
l2:	CMPQ	SI,		R9		// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX
	EXTEND(CX)
	MOVBLZX	(R10)(CX*4),	AX		// tx = d[x]

	ADDB	AX,		DX		// y += tx
	EXTEND(DX)
	MOVBLZX	(R10)(DX*4),	BX		// ty = d[y]
	MOVB	BX,		(R10)(CX*4)	// d[x] = ty
	ADDB	AX,		BX		// val = ty+tx
	EXTEND(BX)
	MOVB	AX,		(R10)(DX*4)	// d[y] = tx
	MOVBLZX	(R10)(BX*4),	R8		// val = d[val]
	XORB	(SI),		R8		// xor 1 byte
	MOVB	R8,		(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP l2

finished:
	MOVQ	j+40(FP),	BX
	MOVB	DX, 0(BX)			// *yp = y
	MOVQ	i+32(FP),	AX
	MOVB	CX, 0(AX)			// *xp = x
	RET
180