// Original source:
//	http://www.zorinaq.com/papers/rc4-amd64.html
//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2

#include "textflag.h"

// Local modifications:
//
// Transliterated from GNU to 6a assembly syntax by the Go authors.
// The comments and spacing are from the original.
//
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
//
// The original code accumulated 64 bits of key stream in an integer
// register and then XOR'ed the key stream into the data 8 bytes at a time.
// Modified to accumulate 128 bits of key stream into an XMM register
// and then XOR the key stream into the data 16 bytes at a time.
// Approximately doubles throughput; see the sketch below.
//
// Converted to amd64p32.
//
// To make safe for Native Client, avoid use of BP, R15,
// and two-register addressing modes.
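//
// A rough Go-style sketch of one 16-byte block as processed by the word
// loop below (illustrative only: nextKeyByte stands in for the standard
// RC4 state update performed by the KEYROUND macro, and dst/src denote
// the current 16-byte window of the output and input):
//
//	var key [16]byte
//	for i := range key {
//		key[i] = nextKeyByte() // one byte of key stream per KEYROUND
//	}
//	for i := 0; i < 16; i++ {
//		dst[i] = src[i] ^ key[i] // done with a single 16-byte XMM XOR
//	}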

// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
//
// EXTEND(r) zero-extends the low byte over the full register after 8-bit
// math, so the address arithmetic that follows does not wait on the
// partial-register write.
#define EXTEND(r) MOVBLZX r, r

/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand <bevand_m (at) epita.fr>
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The code has been designed to be easily integrated into openssl:
** the exported RC4() function can replace the actual implementations
** openssl already contains. Please note that when linking with openssl,
** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
** with -DRC4_INT='unsigned long'.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
*/

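// The frame offsets used below correspond to a Go declaration along these
// lines (an assumption reconstructed from the argument names and offsets,
// given 4-byte pointers and int on amd64p32; not taken from the Go file
// that declares the function):
//
//	func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)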
TEXT ·xorKeyStream(SB),NOSPLIT,$0
	MOVL	n+8(FP),	BX		// rbx = ARG(len)
	MOVL	src+4(FP),	SI		// in = ARG(in)
	MOVL	dst+0(FP),	DI		// out = ARG(out)
	MOVL	state+12(FP),	R10		// d = ARG(data)
	MOVL	i+16(FP),	AX
	MOVBQZX	0(AX),		CX		// x = *xp
	MOVL	j+20(FP),	AX
	MOVBQZX	0(AX),		DX		// y = *yp

	LEAQ	(SI)(BX*1),	R9		// limit = in+len

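	// Loop l1 below consumes input a byte at a time until x is a multiple
	// of 16, so the word loop can work on an aligned 16-entry window of
	// the state. One iteration corresponds roughly to this Go-style sketch
	// (d is the 256-word state, x and y are the indices, dst and src are
	// viewed as byte slices with k the current offset; all index
	// arithmetic wraps at 256):
	//
	//	x = (x + 1) % 256
	//	tx := d[x]
	//	y = (y + tx) % 256
	//	ty := d[y]
	//	d[x], d[y] = ty, tx
	//	dst[k] = src[k] ^ byte(d[(tx+ty)%256])
	//	k++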
l1:	CMPQ	SI,		R9		// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX				// x++
	EXTEND(CX)
	TESTL	$15,		CX
	JZ	wordloop			// switch to the 16-byte loop once x%16 == 0
	LEAL	(R10)(CX*4), R12

	MOVBLZX	(R12),	AX			// tx = d[x]

	ADDB	AX,		DX		// y += tx
	EXTEND(DX)
	LEAL (R10)(DX*4), R11
	MOVBLZX	(R11),	BX		// ty = d[y]
	MOVB	BX,		(R12)	// d[x] = ty
	ADDB	AX,		BX		// val = ty+tx
	EXTEND(BX)
	LEAL (R10)(BX*4), R13
	MOVB	AX,		(R11)	// d[y] = tx
	MOVBLZX	(R13),	R8		// val = d[val]
	XORB	(SI),		R8		// xor 1 byte
	MOVB	R8,		(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP l1

wordloop:
	SUBQ	$16,		R9		// limit = in+len-16
	CMPQ	SI,		R9		// cmp in with in+len-16
	JGT	end				// jump if fewer than 16 bytes remain

start:
	ADDQ	$16,		SI		// increment in
	ADDQ	$16,		DI		// increment out

	// Each KEYROUND generates one byte of key and
	// inserts it into an XMM register as the 16-bit word at the given index.
	// The key state array holds uint32 words that use only the bottom
	// byte of each word, so the 16-bit insert copies just 8 useful bits.
	// We accumulate alternating bytes into X0 and X1, and then at
	// the end we OR X1<<8 into X0 to produce the actual key.
	//
	// At the beginning of the loop, CX%16 == 0, so the 16 loads
	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
	// without fear of the byte computation CX+15 wrapping around.
	//
	// The first round needs R12[0], the second needs R12[1], and so on.
	// We can avoid memory stalls by starting the load for round n+1
	// before the end of round n, using the LOAD macro.
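	//
	// In Go terms, round off of a block does roughly the following (a
	// sketch: tx was fetched and y advanced by the previous round's
	// embedded LOAD, and d, x, y are as in the byte loop above):
	//
	//	ty := d[y]
	//	d[y] = tx
	//	d[x+off] = ty
	//	k := byte(d[(tx+ty)%256]) // one byte of key stream
	//
	// k is inserted as a 16-bit lane (high byte zero) of X0 for even
	// offsets and of X1 for odd offsets; after 16 rounds X1 is shifted
	// left 8 bits and merged into X0, yielding 16 key bytes in order.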
	LEAQ	(R10)(CX*4),	R12		// R12 = &d[x]

#define KEYROUND(xmm, load, off, r1, r2, index) \
	LEAL (R10)(DX*4), R11; \
	MOVBLZX	(R11),	R8; \
	MOVB	r1,		(R11); \
	load((off+1), r2); \
	MOVB	R8,		(off*4)(R12); \
	ADDB	r1,		R8; \
	EXTEND(R8); \
	LEAL (R10)(R8*4), R14; \
	PINSRW	$index, (R14), xmm

#define LOAD(off, reg) \
	MOVBLZX	(off*4)(R12),	reg; \
	ADDB	reg,		DX; \
	EXTEND(DX)

// SKIP takes the place of LOAD in the final round, which has no
// following round to prepare for.
#define SKIP(off, reg)

	LOAD(0, AX)
	KEYROUND(X0, LOAD, 0, AX, BX, 0)
	KEYROUND(X1, LOAD, 1, BX, AX, 0)
	KEYROUND(X0, LOAD, 2, AX, BX, 1)
	KEYROUND(X1, LOAD, 3, BX, AX, 1)
	KEYROUND(X0, LOAD, 4, AX, BX, 2)
	KEYROUND(X1, LOAD, 5, BX, AX, 2)
	KEYROUND(X0, LOAD, 6, AX, BX, 3)
	KEYROUND(X1, LOAD, 7, BX, AX, 3)
	KEYROUND(X0, LOAD, 8, AX, BX, 4)
	KEYROUND(X1, LOAD, 9, BX, AX, 4)
	KEYROUND(X0, LOAD, 10, AX, BX, 5)
	KEYROUND(X1, LOAD, 11, BX, AX, 5)
	KEYROUND(X0, LOAD, 12, AX, BX, 6)
	KEYROUND(X1, LOAD, 13, BX, AX, 6)
	KEYROUND(X0, LOAD, 14, AX, BX, 7)
	KEYROUND(X1, SKIP, 15, BX, AX, 7)

	ADDB	$16,		CX		// x += 16

	PSLLQ	$8,		X1		// shift X1's key bytes to the odd positions
	PXOR	X1,		X0		// X0 = 16 bytes of key stream
	MOVOU	-16(SI),	X2		// load 16 bytes of input
	PXOR	X0,		X2		// xor with the key stream
	MOVOU	X2,		-16(DI)		// store 16 bytes of output

	CMPQ	SI,		R9		// cmp in with in+len-16
	JLE	start				// jump if (in <= in+len-16)

end:
	DECB	CX				// x--, to compensate for the INCB at the top of l2
	ADDQ	$16,		R9		// tmp = in+len

	// handle the last bytes, one by one
l2:	CMPQ	SI,		R9		// cmp in with in+len
	JGE	finished			// jump if (in >= in+len)

	INCB	CX				// x++
	EXTEND(CX)
	LEAL (R10)(CX*4), R12
	MOVBLZX	(R12),	AX			// tx = d[x]

	ADDB	AX,		DX		// y += tx
	EXTEND(DX)
	LEAL (R10)(DX*4), R11
	MOVBLZX	(R11),	BX		// ty = d[y]
	MOVB	BX,		(R12)	// d[x] = ty
	ADDB	AX,		BX		// val = ty+tx
	EXTEND(BX)
	LEAL (R10)(BX*4), R13
	MOVB	AX,		(R11)	// d[y] = tx
	MOVBLZX	(R13),	R8		// val = d[val]
	XORB	(SI),		R8		// xor 1 byte
	MOVB	R8,		(DI)
	INCQ	SI				// in++
	INCQ	DI				// out++
	JMP l2

finished:
	MOVL	j+20(FP),	BX
	MOVB	DX, 0(BX)			// *j = y
	MOVL	i+16(FP),	AX
	MOVB	CX, 0(AX)			// *i = x
	RET