/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* ====================================================================
 *
 * When updating this file, also update chacha_vec_arm.S
 *
 * ==================================================================== */


/* This implementation is by Ted Krovetz and was submitted to SUPERCOP and
 * marked as public domain. It has been altered to allow for non-aligned
 * inputs and to allow the block counter to be passed in specifically. */

#include <openssl/chacha.h>

#if !defined(OPENSSL_WINDOWS) && (defined(OPENSSL_X86_64) || defined(OPENSSL_X86)) && defined(__SSE2__)

#define CHACHA_RNDS 20 /* 8 (high speed), 20 (conservative), 12 (middle) */

/* Architecture-neutral way to specify 16-byte vector of ints */
typedef unsigned vec __attribute__((vector_size(16)));

/* This implementation is designed for Neon, SSE and AltiVec machines. The
 * following macros specify how to perform certain vector operations
 * efficiently on each architecture, using intrinsics. The implementation
 * supports parallel processing of multiple blocks, potentially using
 * general-purpose registers as well. */
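/* Reference sketch (helper name is ours, for illustration only): the ROTW*
 * macros defined below all compute a lane-wise rotate of each 32-bit word.
 * Using the same GCC/Clang vector extension as the `vec` typedef, a
 * rotate-left by 7 can be written generically as follows; each architecture
 * below substitutes a cheaper instruction sequence where one exists (e.g.
 * vshl/vsri on NEON, byte shuffles on SSSE3). */
static inline vec rotw7_ref(vec x)
    {
    /* Same operation as the ROTW7 macros: shift left 7, shift right 25, XOR. */
    return (x << ((vec){7, 7, 7, 7})) ^ (x >> ((vec){25, 25, 25, 25}));
    }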
#if __ARM_NEON__
#include <arm_neon.h>
#define GPR_TOO 1
#define VBPI 2
#define ONE (vec) vsetq_lane_u32(1, vdupq_n_u32(0), 0)
#define LOAD(m) (vec)(*((vec *)(m)))
#define STORE(m, r) (*((vec *)(m))) = (r)
#define ROTV1(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 1)
#define ROTV2(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 2)
#define ROTV3(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 3)
#define ROTW16(x) (vec) vrev32q_u16((uint16x8_t)x)
#if __clang__
#define ROTW7(x) (x << ((vec) {7, 7, 7, 7})) ^ (x >> ((vec) {25, 25, 25, 25}))
#define ROTW8(x) (x << ((vec) {8, 8, 8, 8})) ^ (x >> ((vec) {24, 24, 24, 24}))
#define ROTW12(x) \
    (x << ((vec) {12, 12, 12, 12})) ^ (x >> ((vec) {20, 20, 20, 20}))
#else
#define ROTW7(x) \
    (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 7), (uint32x4_t)x, 25)
#define ROTW8(x) \
    (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 8), (uint32x4_t)x, 24)
#define ROTW12(x) \
    (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 12), (uint32x4_t)x, 20)
#endif
#elif __SSE2__
#include <emmintrin.h>
#define GPR_TOO 0
#if __clang__
#define VBPI 4
#else
#define VBPI 3
#endif
#define ONE (vec) _mm_set_epi32(0, 0, 0, 1)
#define LOAD(m) (vec) _mm_loadu_si128((__m128i *)(m))
#define STORE(m, r) _mm_storeu_si128((__m128i *)(m), (__m128i)(r))
#define ROTV1(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(0, 3, 2, 1))
#define ROTV2(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(1, 0, 3, 2))
#define ROTV3(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(2, 1, 0, 3))
#define ROTW7(x) \
    (vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i)x, 25))
#define ROTW12(x) \
    (vec)(_mm_slli_epi32((__m128i)x, 12) ^ _mm_srli_epi32((__m128i)x, 20))
#if __SSSE3__
#include <tmmintrin.h>
#define ROTW8(x) \
    (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, \
                                                    11, 6, 5, 4, 7, 2, 1, 0, 3))
#define ROTW16(x) \
    (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, \
                                                    10, 5, 4, 7, 6, 1, 0, 3, 2))
#else
#define ROTW8(x) \
    (vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i)x, 24))
#define ROTW16(x) \
    (vec)(_mm_slli_epi32((__m128i)x, 16) ^ _mm_srli_epi32((__m128i)x, 16))
#endif
#else
#error "This implementation only supports machines with NEON or SSE2"
#endif

#ifndef REVV_BE
#define REVV_BE(x) (x)
#endif

#ifndef REVW_BE
#define REVW_BE(x) (x)
#endif

#define BPI (VBPI + GPR_TOO) /* Blocks computed per loop iteration */

#define DQROUND_VECTORS(a,b,c,d) \
    a += b; d ^= a; d = ROTW16(d); \
    c += d; b ^= c; b = ROTW12(b); \
    a += b; d ^= a; d = ROTW8(d);  \
    c += d; b ^= c; b = ROTW7(b);  \
    b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); \
    a += b; d ^= a; d = ROTW16(d); \
    c += d; b ^= c; b = ROTW12(b); \
    a += b; d ^= a; d = ROTW8(d);  \
    c += d; b ^= c; b = ROTW7(b);  \
    b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);

#define QROUND_WORDS(a,b,c,d) \
    a = a+b; d ^= a; d = d<<16 | d>>16; \
    c = c+d; b ^= c; b = b<<12 | b>>20; \
    a = a+b; d ^= a; d = d<< 8 | d>>24; \
    c = c+d; b ^= c; b = b<< 7 | b>>25;

#define WRITE_XOR(in, op, d, v0, v1, v2, v3) \
    STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0)); \
    STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1)); \
    STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \
    STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3));
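/* For reference (the function name is ours, for illustration only): the
 * ChaCha quarter round that QROUND_WORDS expands to, written as a plain
 * scalar function. ChaCha applies this round first to the columns and then
 * to the diagonals of a 4x4 matrix of 32-bit words; DQROUND_VECTORS above
 * performs the same column/diagonal pair on whole rows at once, using
 * ROTV1..ROTV3 to realign the diagonals into columns and back. */
static inline void chacha_qround_ref(unsigned *a, unsigned *b, unsigned *c,
                                     unsigned *d)
    {
    *a += *b; *d ^= *a; *d = *d << 16 | *d >> 16;
    *c += *d; *b ^= *c; *b = *b << 12 | *b >> 20;
    *a += *b; *d ^= *a; *d = *d <<  8 | *d >> 24;
    *c += *d; *b ^= *c; *b = *b <<  7 | *b >> 25;
    }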
#if __ARM_NEON__
/* For ARM, we can't depend on NEON support, so this function is compiled
 * with a different name, along with the generic code, and can be enabled at
 * run-time. */
void CRYPTO_chacha_20_neon(
#else
void CRYPTO_chacha_20(
#endif
    uint8_t *out,
    const uint8_t *in,
    size_t inlen,
    const uint8_t key[32],
    const uint8_t nonce[8],
    size_t counter)
    {
    unsigned iters, i, *op=(unsigned *)out, *ip=(unsigned *)in, *kp;
#if defined(__ARM_NEON__)
    unsigned *np;
#endif
    vec s0, s1, s2, s3;
#if !defined(__ARM_NEON__) && !defined(__SSE2__)
    __attribute__ ((aligned (16))) unsigned key[8], nonce[4];
#endif
    __attribute__ ((aligned (16))) unsigned chacha_const[] =
        {0x61707865,0x3320646E,0x79622D32,0x6B206574};
#if defined(__ARM_NEON__) || defined(__SSE2__)
    kp = (unsigned *)key;
#else
    ((vec *)key)[0] = REVV_BE(((vec *)key)[0]);
    ((vec *)key)[1] = REVV_BE(((vec *)key)[1]);
    nonce[0] = REVW_BE(((unsigned *)nonce)[0]);
    nonce[1] = REVW_BE(((unsigned *)nonce)[1]);
    nonce[2] = REVW_BE(((unsigned *)nonce)[2]);
    nonce[3] = REVW_BE(((unsigned *)nonce)[3]);
    kp = (unsigned *)key;
    np = (unsigned *)nonce;
#endif
#if defined(__ARM_NEON__)
    np = (unsigned*) nonce;
#endif
    s0 = LOAD(chacha_const);
    s1 = LOAD(&((vec*)kp)[0]);
    s2 = LOAD(&((vec*)kp)[1]);
    s3 = (vec){
        counter & 0xffffffff,
#if __ARM_NEON__ || defined(OPENSSL_X86)
        0,  /* can't right-shift 32 bits on a 32-bit system. */
#else
        counter >> 32,
#endif
        ((uint32_t*)nonce)[0],
        ((uint32_t*)nonce)[1]
    };

    for (iters = 0; iters < inlen/(BPI*64); iters++)
        {
#if GPR_TOO
        register unsigned x0, x1, x2, x3, x4, x5, x6, x7, x8,
                          x9, x10, x11, x12, x13, x14, x15;
#endif
#if VBPI > 2
        vec v8,v9,v10,v11;
#endif
#if VBPI > 3
        vec v12,v13,v14,v15;
#endif

        vec v0,v1,v2,v3,v4,v5,v6,v7;
        v4 = v0 = s0; v5 = v1 = s1; v6 = v2 = s2; v3 = s3;
        v7 = v3 + ONE;
#if VBPI > 2
        v8 = v4; v9 = v5; v10 = v6;
        v11 = v7 + ONE;
#endif
#if VBPI > 3
        v12 = v8; v13 = v9; v14 = v10;
        v15 = v11 + ONE;
#endif
#if GPR_TOO
        x0 = chacha_const[0]; x1 = chacha_const[1];
        x2 = chacha_const[2]; x3 = chacha_const[3];
        x4 = kp[0]; x5 = kp[1]; x6 = kp[2]; x7 = kp[3];
        x8 = kp[4]; x9 = kp[5]; x10 = kp[6]; x11 = kp[7];
        x12 = counter+BPI*iters+(BPI-1); x13 = 0;
        x14 = np[0]; x15 = np[1];
#endif
        for (i = CHACHA_RNDS/2; i; i--)
            {
            DQROUND_VECTORS(v0,v1,v2,v3)
            DQROUND_VECTORS(v4,v5,v6,v7)
#if VBPI > 2
            DQROUND_VECTORS(v8,v9,v10,v11)
#endif
#if VBPI > 3
            DQROUND_VECTORS(v12,v13,v14,v15)
#endif
#if GPR_TOO
            QROUND_WORDS( x0, x4, x8,x12)
            QROUND_WORDS( x1, x5, x9,x13)
            QROUND_WORDS( x2, x6,x10,x14)
            QROUND_WORDS( x3, x7,x11,x15)
            QROUND_WORDS( x0, x5,x10,x15)
            QROUND_WORDS( x1, x6,x11,x12)
            QROUND_WORDS( x2, x7, x8,x13)
            QROUND_WORDS( x3, x4, x9,x14)
#endif
            }

        WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)
        s3 += ONE;
        WRITE_XOR(ip, op, 16, v4+s0, v5+s1, v6+s2, v7+s3)
        s3 += ONE;
#if VBPI > 2
        WRITE_XOR(ip, op, 32, v8+s0, v9+s1, v10+s2, v11+s3)
        s3 += ONE;
#endif
#if VBPI > 3
        WRITE_XOR(ip, op, 48, v12+s0, v13+s1, v14+s2, v15+s3)
        s3 += ONE;
#endif
        ip += VBPI*16;
        op += VBPI*16;
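        /* When GPR_TOO is set (the NEON build), one extra block per
         * iteration was computed in general-purpose registers alongside the
         * vector blocks. As in WRITE_XOR, each keystream word is added to
         * the matching word of the initial state before being XORed into
         * the input; REVW_BE is a no-op on these little-endian targets. */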
#if GPR_TOO
        op[0] = REVW_BE(REVW_BE(ip[0]) ^ (x0 + chacha_const[0]));
        op[1] = REVW_BE(REVW_BE(ip[1]) ^ (x1 + chacha_const[1]));
        op[2] = REVW_BE(REVW_BE(ip[2]) ^ (x2 + chacha_const[2]));
        op[3] = REVW_BE(REVW_BE(ip[3]) ^ (x3 + chacha_const[3]));
        op[4] = REVW_BE(REVW_BE(ip[4]) ^ (x4 + kp[0]));
        op[5] = REVW_BE(REVW_BE(ip[5]) ^ (x5 + kp[1]));
        op[6] = REVW_BE(REVW_BE(ip[6]) ^ (x6 + kp[2]));
        op[7] = REVW_BE(REVW_BE(ip[7]) ^ (x7 + kp[3]));
        op[8] = REVW_BE(REVW_BE(ip[8]) ^ (x8 + kp[4]));
        op[9] = REVW_BE(REVW_BE(ip[9]) ^ (x9 + kp[5]));
        op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6]));
        op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7]));
        op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter+BPI*iters+(BPI-1)));
        op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13));
        op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[0]));
        op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[1]));
        s3 += ONE;
        ip += 16;
        op += 16;
#endif
        }

    for (iters = inlen%(BPI*64)/64; iters != 0; iters--)
        {
        vec v0 = s0, v1 = s1, v2 = s2, v3 = s3;
        for (i = CHACHA_RNDS/2; i; i--)
            {
            DQROUND_VECTORS(v0,v1,v2,v3);
            }
        WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3)
        s3 += ONE;
        ip += 16;
        op += 16;
        }

    inlen = inlen % 64;
    if (inlen)
        {
        __attribute__ ((aligned (16))) vec buf[4];
        vec v0,v1,v2,v3;
        v0 = s0; v1 = s1; v2 = s2; v3 = s3;
        for (i = CHACHA_RNDS/2; i; i--)
            {
            DQROUND_VECTORS(v0,v1,v2,v3);
            }

        if (inlen >= 16)
            {
            STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0));
            if (inlen >= 32)
                {
                STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1));
                if (inlen >= 48)
                    {
                    STORE(op + 8, LOAD(ip + 8) ^
                                  REVV_BE(v2 + s2));
                    buf[3] = REVV_BE(v3 + s3);
                    }
                else
                    buf[2] = REVV_BE(v2 + s2);
                }
            else
                buf[1] = REVV_BE(v1 + s1);
            }
        else
            buf[0] = REVV_BE(v0 + s0);

        for (i=inlen & ~15; i<inlen; i++)
            ((char *)op)[i] = ((char *)ip)[i] ^ ((char *)buf)[i];
        }
    }

#endif /* !OPENSSL_WINDOWS && (OPENSSL_X86_64 || OPENSSL_X86) && SSE2 */
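/* Usage sketch (illustrative only; guarded out so it is never compiled).
 * ChaCha20 is a stream cipher, so encryption and decryption are the same
 * XOR with the keystream: applying CRYPTO_chacha_20 twice with the same
 * key, nonce and initial block counter recovers the plaintext. */
#if 0
static void chacha_roundtrip_example(void)
    {
    static const uint8_t key[32] = {0};   /* example key (all zeros) */
    static const uint8_t nonce[8] = {0};  /* example nonce */
    uint8_t plaintext[64] = "attack at dawn";
    uint8_t ciphertext[64];
    uint8_t recovered[64];

    CRYPTO_chacha_20(ciphertext, plaintext, 64, key, nonce, 0);
    CRYPTO_chacha_20(recovered, ciphertext, 64, key, nonce, 0);
    /* recovered now holds the original plaintext. */
    }
#endif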