/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* ====================================================================
 *
 * When updating this file, also update chacha_vec_arm.S
 *
 * ==================================================================== */

/* This implementation is by Ted Krovetz and was submitted to SUPERCOP and
 * marked as public domain. It has been altered to allow for non-aligned inputs
 * and to allow the block counter to be passed in specifically. */

#include <openssl/chacha.h>

#if !defined(OPENSSL_WINDOWS) && (defined(OPENSSL_X86_64) || defined(OPENSSL_X86)) && defined(__SSE2__)

#define CHACHA_RNDS 20 /* 8 (high speed), 20 (conservative), 12 (middle) */

/* Architecture-neutral way to specify 16-byte vector of ints */
typedef unsigned vec __attribute__((vector_size(16)));
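/* "vec" uses the GCC/Clang vector extension: the usual operators act
 * lane-wise on four 32-bit words, so with vec a = {1, 2, 3, 4}, both a + a
 * and a << 1 evaluate to {2, 4, 6, 8}. */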

/* This implementation is designed for Neon, SSE and AltiVec machines. The
 * following macros specify how to perform certain vector operations
 * efficiently on each architecture, using intrinsics. This implementation
 * supports parallel processing of multiple blocks, including potentially
 * using general-purpose registers. */
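/* In each case LOAD and STORE move 16 bytes, ONE is the vector {1, 0, 0, 0}
 * used to advance the block counter, ROTV1/2/3 rotate the four 32-bit lanes
 * within a vector, and ROTW7/8/12/16 rotate each 32-bit word left by that
 * many bits. GPR_TOO says whether an extra block is computed in
 * general-purpose registers, and VBPI is the number of blocks kept in vector
 * registers per loop iteration. */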
#if __ARM_NEON__
#include <arm_neon.h>
#define GPR_TOO 1
#define VBPI 2
#define ONE (vec) vsetq_lane_u32(1, vdupq_n_u32(0), 0)
#define LOAD(m) (vec)(*((vec *)(m)))
#define STORE(m, r) (*((vec *)(m))) = (r)
#define ROTV1(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 1)
#define ROTV2(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 2)
#define ROTV3(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 3)
#define ROTW16(x) (vec) vrev32q_u16((uint16x8_t)x)
#if __clang__
#define ROTW7(x) (x << ((vec) {7, 7, 7, 7})) ^ (x >> ((vec) {25, 25, 25, 25}))
#define ROTW8(x) (x << ((vec) {8, 8, 8, 8})) ^ (x >> ((vec) {24, 24, 24, 24}))
#define ROTW12(x) \
  (x << ((vec) {12, 12, 12, 12})) ^ (x >> ((vec) {20, 20, 20, 20}))
#else
#define ROTW7(x) \
  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 7), (uint32x4_t)x, 25)
#define ROTW8(x) \
  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 8), (uint32x4_t)x, 24)
#define ROTW12(x) \
  (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 12), (uint32x4_t)x, 20)
#endif
#elif __SSE2__
#include <emmintrin.h>
#define GPR_TOO 0
#if __clang__
#define VBPI 4
#else
#define VBPI 3
#endif
#define ONE (vec) _mm_set_epi32(0, 0, 0, 1)
#define LOAD(m) (vec) _mm_loadu_si128((__m128i *)(m))
#define STORE(m, r) _mm_storeu_si128((__m128i *)(m), (__m128i)(r))
#define ROTV1(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(0, 3, 2, 1))
#define ROTV2(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(1, 0, 3, 2))
#define ROTV3(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(2, 1, 0, 3))
#define ROTW7(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i)x, 25))
#define ROTW12(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 12) ^ _mm_srli_epi32((__m128i)x, 20))
#if __SSSE3__
#include <tmmintrin.h>
#define ROTW8(x)                                                            \
  (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, \
                                                  11, 6, 5, 4, 7, 2, 1, 0, 3))
#define ROTW16(x)                                                           \
  (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, \
                                                  10, 5, 4, 7, 6, 1, 0, 3, 2))
#else
#define ROTW8(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i)x, 24))
#define ROTW16(x) \
  (vec)(_mm_slli_epi32((__m128i)x, 16) ^ _mm_srli_epi32((__m128i)x, 16))
#endif
#else
#error-- Implementation supports only machines with neon or SSE2
#endif

#ifndef REVV_BE
#define REVV_BE(x)  (x)
#endif

#ifndef REVW_BE
#define REVW_BE(x)  (x)
#endif

#define BPI      (VBPI + GPR_TOO)  /* Blocks computed per loop iteration   */
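/* For example, with clang on SSE2 targets VBPI is 4 and GPR_TOO is 0, so BPI
 * is 4 and each main-loop iteration handles 4 * 64 = 256 bytes; on NEON,
 * VBPI is 2 and GPR_TOO is 1, giving 3 blocks (192 bytes) per iteration. */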

#define DQROUND_VECTORS(a,b,c,d)                \
    a += b; d ^= a; d = ROTW16(d);              \
    c += d; b ^= c; b = ROTW12(b);              \
    a += b; d ^= a; d = ROTW8(d);               \
    c += d; b ^= c; b = ROTW7(b);               \
    b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);  \
    a += b; d ^= a; d = ROTW16(d);              \
    c += d; b ^= c; b = ROTW12(b);              \
    a += b; d ^= a; d = ROTW8(d);               \
    c += d; b ^= c; b = ROTW7(b);               \
    b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
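/* DQROUND_VECTORS is one ChaCha "double round" on a block held as four row
 * vectors: a column round, a lane rotation of b, c and d that lines the
 * diagonals up as columns, a second round, and the inverse lane rotation to
 * restore the original layout. */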

#define QROUND_WORDS(a,b,c,d) \
  a = a+b; d ^= a; d = d<<16 | d>>16; \
  c = c+d; b ^= c; b = b<<12 | b>>20; \
  a = a+b; d ^= a; d = d<< 8 | d>>24; \
  c = c+d; b ^= c; b = b<< 7 | b>>25;
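/* QROUND_WORDS is the scalar ChaCha quarter round on four 32-bit words, with
 * the left-rotations by 16, 12, 8 and 7 written out as shift pairs; it is
 * used for the extra block kept in general-purpose registers. */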

#define WRITE_XOR(in, op, d, v0, v1, v2, v3)                   \
	STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0));      \
	STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1));      \
	STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2));      \
	STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3));
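/* WRITE_XOR consumes one 64-byte block: it loads 64 bytes of input starting
 * at word offset d, XORs them with the four keystream vectors v0..v3, and
 * stores the result to the output at the same offset. */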

#if __ARM_NEON__
/* For ARM, we can't depend on NEON support, so this function is compiled with
 * a different name, along with the generic code, and can be enabled at
 * run-time. */
void CRYPTO_chacha_20_neon(
#else
void CRYPTO_chacha_20(
#endif
	uint8_t *out,
	const uint8_t *in,
	size_t inlen,
	const uint8_t key[32],
	const uint8_t nonce[8],
	size_t counter)
	{
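	/* The input is consumed in three stages: a main loop producing BPI
	 * 64-byte blocks per iteration, a loop for the remaining whole blocks,
	 * and a final, possibly partial, block. */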
	unsigned iters, i, *op=(unsigned *)out, *ip=(unsigned *)in, *kp;
#if defined(__ARM_NEON__)
	unsigned *np;
#endif
	vec s0, s1, s2, s3;
#if !defined(__ARM_NEON__) && !defined(__SSE2__)
	__attribute__ ((aligned (16))) unsigned key[8], nonce[4];
#endif
	__attribute__ ((aligned (16))) unsigned chacha_const[] =
		{0x61707865,0x3320646E,0x79622D32,0x6B206574};
#if defined(__ARM_NEON__) || defined(__SSE2__)
	kp = (unsigned *)key;
#else
	((vec *)key)[0] = REVV_BE(((vec *)key)[0]);
	((vec *)key)[1] = REVV_BE(((vec *)key)[1]);
	nonce[0] = REVW_BE(((unsigned *)nonce)[0]);
	nonce[1] = REVW_BE(((unsigned *)nonce)[1]);
	nonce[2] = REVW_BE(((unsigned *)nonce)[2]);
	nonce[3] = REVW_BE(((unsigned *)nonce)[3]);
	kp = (unsigned *)key;
	np = (unsigned *)nonce;
#endif
#if defined(__ARM_NEON__)
	np = (unsigned*) nonce;
#endif
	s0 = LOAD(chacha_const);
	s1 = LOAD(&((vec*)kp)[0]);
	s2 = LOAD(&((vec*)kp)[1]);
	s3 = (vec){
		counter & 0xffffffff,
#if __ARM_NEON__ || defined(OPENSSL_X86)
		0,  /* can't right-shift 32 bits on a 32-bit system. */
#else
		counter >> 32,
#endif
		((uint32_t*)nonce)[0],
		((uint32_t*)nonce)[1]
	};
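	/* ChaCha state layout: s0 holds the four "expand 32-byte k" constants,
	 * s1 and s2 hold the 256-bit key, and s3 holds the 64-bit block counter
	 * followed by the 64-bit nonce. */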

	for (iters = 0; iters < inlen/(BPI*64); iters++)
		{
#if GPR_TOO
		register unsigned x0, x1, x2, x3, x4, x5, x6, x7, x8,
				  x9, x10, x11, x12, x13, x14, x15;
#endif
#if VBPI > 2
		vec v8,v9,v10,v11;
#endif
#if VBPI > 3
		vec v12,v13,v14,v15;
#endif

		vec v0,v1,v2,v3,v4,v5,v6,v7;
		v4 = v0 = s0; v5 = v1 = s1; v6 = v2 = s2; v3 = s3;
		v7 = v3 + ONE;
#if VBPI > 2
		v8 = v4; v9 = v5; v10 = v6;
		v11 =  v7 + ONE;
#endif
#if VBPI > 3
		v12 = v8; v13 = v9; v14 = v10;
		v15 = v11 + ONE;
#endif
#if GPR_TOO
		x0 = chacha_const[0]; x1 = chacha_const[1];
		x2 = chacha_const[2]; x3 = chacha_const[3];
		x4 = kp[0]; x5 = kp[1]; x6  = kp[2]; x7  = kp[3];
		x8 = kp[4]; x9 = kp[5]; x10 = kp[6]; x11 = kp[7];
		x12 = counter+BPI*iters+(BPI-1); x13 = 0;
		x14 = np[0]; x15 = np[1];
#endif
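		/* v0..v3, v4..v7 (and v8..v11, v12..v15 when VBPI allows) each
		 * hold an independent copy of the state; the copies differ only
		 * in the counter word, so successive copies of s3 are bumped by
		 * ONE. With GPR_TOO, one extra block, using the highest counter
		 * value of this iteration, is carried in the scalar registers
		 * x0..x15. */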
		for (i = CHACHA_RNDS/2; i; i--)
			{
			DQROUND_VECTORS(v0,v1,v2,v3)
			DQROUND_VECTORS(v4,v5,v6,v7)
#if VBPI > 2
			DQROUND_VECTORS(v8,v9,v10,v11)
#endif
#if VBPI > 3
			DQROUND_VECTORS(v12,v13,v14,v15)
#endif
#if GPR_TOO
			QROUND_WORDS( x0, x4, x8,x12)
			QROUND_WORDS( x1, x5, x9,x13)
			QROUND_WORDS( x2, x6,x10,x14)
			QROUND_WORDS( x3, x7,x11,x15)
			QROUND_WORDS( x0, x5,x10,x15)
			QROUND_WORDS( x1, x6,x11,x12)
			QROUND_WORDS( x2, x7, x8,x13)
			QROUND_WORDS( x3, x4, x9,x14)
#endif
			}

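		/* Finish each vector block by adding the initial state back in
		 * (the ChaCha feed-forward) and XORing the result with the input.
		 * s3 is advanced by ONE after every block so the counter word
		 * stays in step with the data already emitted. */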
		WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)
		s3 += ONE;
		WRITE_XOR(ip, op, 16, v4+s0, v5+s1, v6+s2, v7+s3)
		s3 += ONE;
#if VBPI > 2
		WRITE_XOR(ip, op, 32, v8+s0, v9+s1, v10+s2, v11+s3)
		s3 += ONE;
#endif
#if VBPI > 3
		WRITE_XOR(ip, op, 48, v12+s0, v13+s1, v14+s2, v15+s3)
		s3 += ONE;
#endif
		ip += VBPI*16;
		op += VBPI*16;
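		/* When GPR_TOO is set, the block computed in general-purpose
		 * registers is finished here: add the initial state words back in
		 * (the feed-forward) and XOR the result into the next 64 bytes of
		 * input, one 32-bit word at a time. REVW_BE is the identity as
		 * defined in this file. */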
#if GPR_TOO
		op[0]  = REVW_BE(REVW_BE(ip[0])  ^ (x0  + chacha_const[0]));
		op[1]  = REVW_BE(REVW_BE(ip[1])  ^ (x1  + chacha_const[1]));
		op[2]  = REVW_BE(REVW_BE(ip[2])  ^ (x2  + chacha_const[2]));
		op[3]  = REVW_BE(REVW_BE(ip[3])  ^ (x3  + chacha_const[3]));
		op[4]  = REVW_BE(REVW_BE(ip[4])  ^ (x4  + kp[0]));
		op[5]  = REVW_BE(REVW_BE(ip[5])  ^ (x5  + kp[1]));
		op[6]  = REVW_BE(REVW_BE(ip[6])  ^ (x6  + kp[2]));
		op[7]  = REVW_BE(REVW_BE(ip[7])  ^ (x7  + kp[3]));
		op[8]  = REVW_BE(REVW_BE(ip[8])  ^ (x8  + kp[4]));
		op[9]  = REVW_BE(REVW_BE(ip[9])  ^ (x9  + kp[5]));
		op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6]));
		op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7]));
		op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter+BPI*iters+(BPI-1)));
		op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13));
		op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[0]));
		op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[1]));
		s3 += ONE;
		ip += 16;
		op += 16;
#endif
		}

	/* Process any remaining whole 64-byte blocks one at a time. */
	for (iters = inlen%(BPI*64)/64; iters != 0; iters--)
		{
		vec v0 = s0, v1 = s1, v2 = s2, v3 = s3;
		for (i = CHACHA_RNDS/2; i; i--)
			{
			DQROUND_VECTORS(v0,v1,v2,v3);
			}
		WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)
		s3 += ONE;
		ip += 16;
		op += 16;
		}

	/* Fewer than 64 bytes remain: generate one final keystream block. */
	inlen = inlen % 64;
	if (inlen)
		{
		__attribute__ ((aligned (16))) vec buf[4];
		vec v0,v1,v2,v3;
		v0 = s0; v1 = s1; v2 = s2; v3 = s3;
		for (i = CHACHA_RNDS/2; i; i--)
			{
			DQROUND_VECTORS(v0,v1,v2,v3);
			}

		/* XOR any whole 16-byte pieces of the final block directly into
		 * the output; the final, possibly partial, piece is staged in buf
		 * instead. */
		if (inlen >= 16)
			{
			STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0));
			if (inlen >= 32)
				{
				STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1));
				if (inlen >= 48)
					{
					STORE(op + 8, LOAD(ip +  8) ^
						      REVV_BE(v2 + s2));
					buf[3] = REVV_BE(v3 + s3);
					}
				else
					buf[2] = REVV_BE(v2 + s2);
				}
			else
				buf[1] = REVV_BE(v1 + s1);
			}
		else
			buf[0] = REVV_BE(v0 + s0);

		/* Copy the remaining tail byte by byte, XORing against the staged
		 * keystream in buf. */
		for (i=inlen & ~15; i<inlen; i++)
			((char *)op)[i] = ((char *)ip)[i] ^ ((char *)buf)[i];
		}
	}

#endif /* !OPENSSL_WINDOWS && (OPENSSL_X86_64 || OPENSSL_X86) && SSE2 */