/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* This implementation was taken from the public domain, neon2 version in
 * SUPERCOP by D. J. Bernstein and Peter Schwabe. */

#include <openssl/poly1305.h>

#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM)

#include <string.h>


typedef struct {
  uint32_t v[12]; /* for alignment; only using 10 */
} fe1305x2;

#define addmulmod openssl_poly1305_neon2_addmulmod
#define blocks openssl_poly1305_neon2_blocks

extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y,
                      const fe1305x2 *c);

extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in,
                  unsigned int inlen);

static void freeze(fe1305x2 *r) {
  int i;

  uint32_t x0 = r->v[0];
  uint32_t x1 = r->v[2];
  uint32_t x2 = r->v[4];
  uint32_t x3 = r->v[6];
  uint32_t x4 = r->v[8];
  uint32_t y0;
  uint32_t y1;
  uint32_t y2;
  uint32_t y3;
  uint32_t y4;
  uint32_t swap;

  for (i = 0; i < 3; ++i) {
    x1 += x0 >> 26;
    x0 &= 0x3ffffff;
    x2 += x1 >> 26;
    x1 &= 0x3ffffff;
    x3 += x2 >> 26;
    x2 &= 0x3ffffff;
    x4 += x3 >> 26;
    x3 &= 0x3ffffff;
    x0 += 5 * (x4 >> 26);
    x4 &= 0x3ffffff;
  }

  y0 = x0 + 5;
  y1 = x1 + (y0 >> 26);
  y0 &= 0x3ffffff;
  y2 = x2 + (y1 >> 26);
  y1 &= 0x3ffffff;
  y3 = x3 + (y2 >> 26);
  y2 &= 0x3ffffff;
  y4 = x4 + (y3 >> 26);
  y3 &= 0x3ffffff;
  swap = -(y4 >> 26);
  y4 &= 0x3ffffff;

  y0 ^= x0;
  y1 ^= x1;
  y2 ^= x2;
  y3 ^= x3;
  y4 ^= x4;

  y0 &= swap;
  y1 &= swap;
  y2 &= swap;
  y3 &= swap;
  y4 &= swap;

  y0 ^= x0;
  y1 ^= x1;
  y2 ^= x2;
  y3 ^= x3;
  y4 ^= x4;

  r->v[0] = y0;
  r->v[2] = y1;
  r->v[4] = y2;
  r->v[6] = y3;
  r->v[8] = y4;
}

static void fe1305x2_tobytearray(uint8_t *r, fe1305x2 *x) {
  uint32_t x0 = x->v[0];
  uint32_t x1 = x->v[2];
  uint32_t x2 = x->v[4];
  uint32_t x3 = x->v[6];
  uint32_t x4 = x->v[8];

  x1 += x0 >> 26;
  x0 &= 0x3ffffff;
  x2 += x1 >> 26;
  x1 &= 0x3ffffff;
  x3 += x2 >> 26;
  x2 &= 0x3ffffff;
  x4 += x3 >> 26;
  x3 &= 0x3ffffff;

  *(uint32_t *)r = x0 + (x1 << 26);
  *(uint32_t *)(r + 4) = (x1 >> 6) + (x2 << 20);
  *(uint32_t *)(r + 8) = (x2 >> 12) + (x3 << 14);
  *(uint32_t *)(r + 12) = (x3 >> 18) + (x4 << 8);
}

/* load32 exists to avoid breaking strict aliasing rules in
 * fe1305x2_frombytearray. */
static uint32_t load32(uint8_t *t) {
  uint32_t tmp;
  memcpy(&tmp, t, sizeof(tmp));
  return tmp;
}

static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x,
                                   unsigned long long xlen) {
  int i;
  uint8_t t[17];

  for (i = 0; (i < 16) && (i < xlen); i++) {
    t[i] = x[i];
  }
  xlen -= i;
  x += i;
  t[i++] = 1;
  for (; i < 17; i++) {
    t[i] = 0;
  }

  r->v[0] = 0x3ffffff & load32(t);
  r->v[2] = 0x3ffffff & (load32(t + 3) >> 2);
  r->v[4] = 0x3ffffff & (load32(t + 6) >> 4);
  r->v[6] = 0x3ffffff & (load32(t + 9) >> 6);
  r->v[8] = load32(t + 13);

  if (xlen) {
    for (i = 0; (i < 16) && (i < xlen); i++) {
      t[i] = x[i];
    }
    t[i++] = 1;
    for (; i < 17; i++) {
      t[i] = 0;
    }

    r->v[1] = 0x3ffffff & load32(t);
    r->v[3] = 0x3ffffff & (load32(t + 3) >> 2);
    r->v[5] = 0x3ffffff & (load32(t + 6) >> 4);
    r->v[7] = 0x3ffffff & (load32(t + 9) >> 6);
    r->v[9] = load32(t + 13);
  } else {
    r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0;
  }
}

static const fe1305x2 zero __attribute__((aligned(16)));

struct poly1305_state_st {
  uint8_t data[sizeof(fe1305x2[5]) + 128];
  uint8_t buf[32];
  unsigned int buf_used;
  uint8_t key[16];
};

void CRYPTO_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) {
  struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
  fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
  fe1305x2 *const h = r + 1;
  fe1305x2 *const c = h + 1;
  fe1305x2 *const precomp = c + 1;
  unsigned int j;

  r->v[1] = r->v[0] = 0x3ffffff & *(uint32_t *)key;
  r->v[3] = r->v[2] = 0x3ffff03 & ((*(uint32_t *)(key + 3)) >> 2);
  r->v[5] = r->v[4] = 0x3ffc0ff & ((*(uint32_t *)(key + 6)) >> 4);
  r->v[7] = r->v[6] = 0x3f03fff & ((*(uint32_t *)(key + 9)) >> 6);
  r->v[9] = r->v[8] = 0x00fffff & ((*(uint32_t *)(key + 12)) >> 8);

  for (j = 0; j < 10; j++) {
    h->v[j] = 0; /* XXX: should fast-forward a bit */
  }

  addmulmod(precomp, r, r, &zero);                 /* precompute r^2 */
  addmulmod(precomp + 1, precomp, precomp, &zero); /* precompute r^4 */

  memcpy(st->key, key + 16, 16);
  st->buf_used = 0;
}

void CRYPTO_poly1305_update_neon(poly1305_state *state, const uint8_t *in,
                                 size_t in_len) {
  struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
  fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
  fe1305x2 *const h = r + 1;
  fe1305x2 *const c = h + 1;
  fe1305x2 *const precomp = c + 1;
  unsigned int i;

  if (st->buf_used) {
    unsigned int todo = 32 - st->buf_used;
    if (todo > in_len) {
      todo = in_len;
    }
    for (i = 0; i < todo; i++) {
      st->buf[st->buf_used + i] = in[i];
    }
    st->buf_used += todo;
    in_len -= todo;
    in += todo;

    if (st->buf_used == sizeof(st->buf) && in_len) {
      addmulmod(h, h, precomp, &zero);
      fe1305x2_frombytearray(c, st->buf, sizeof(st->buf));
      for (i = 0; i < 10; i++) {
        h->v[i] += c->v[i];
      }
      st->buf_used = 0;
    }
  }

  while (in_len > 32) {
    unsigned int tlen = 1048576;
    if (in_len < tlen) {
      tlen = in_len;
    }
    tlen -= blocks(h, precomp, in, tlen);
    in_len -= tlen;
    in += tlen;
  }

  if (in_len) {
    for (i = 0; i < in_len; i++) {
      st->buf[i] = in[i];
    }
    st->buf_used = in_len;
  }
}

void CRYPTO_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]) {
  struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
  fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
  fe1305x2 *const h = r + 1;
  fe1305x2 *const c = h + 1;
  fe1305x2 *const precomp = c + 1;

  addmulmod(h, h, precomp, &zero);

  if (st->buf_used > 16) {
    fe1305x2_frombytearray(c, st->buf, st->buf_used);
    precomp->v[1] = r->v[1];
    precomp->v[3] = r->v[3];
    precomp->v[5] = r->v[5];
    precomp->v[7] = r->v[7];
    precomp->v[9] = r->v[9];
    addmulmod(h, h, precomp, c);
  } else if (st->buf_used > 0) {
    fe1305x2_frombytearray(c, st->buf, st->buf_used);
    r->v[1] = 1;
    r->v[3] = 0;
    r->v[5] = 0;
    r->v[7] = 0;
    r->v[9] = 0;
    addmulmod(h, h, r, c);
  }

  h->v[0] += h->v[1];
  h->v[2] += h->v[3];
  h->v[4] += h->v[5];
  h->v[6] += h->v[7];
  h->v[8] += h->v[9];
  freeze(h);

  fe1305x2_frombytearray(c, st->key, 16);
  c->v[8] ^= (1 << 24);

  h->v[0] += c->v[0];
  h->v[2] += c->v[2];
  h->v[4] += c->v[4];
  h->v[6] += c->v[6];
  h->v[8] += c->v[8];
  fe1305x2_tobytearray(mac, h);
}

#endif /* OPENSSL_ARM && !OPENSSL_NO_ASM */
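/* Illustrative usage sketch (not part of the original file): it shows how the
 * NEON entry points above would compute a one-shot Poly1305 tag from a 32-byte
 * key (r || s) and a message. Callers would normally reach this code through
 * the generic CRYPTO_poly1305_init/update/finish interface rather than these
 * NEON-specific functions; the guard macro POLY1305_NEON_EXAMPLE and the
 * function name below are hypothetical, used only for demonstration. */
#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) && \
    defined(POLY1305_NEON_EXAMPLE)
void poly1305_neon_example(uint8_t mac[16], const uint8_t *in, size_t in_len,
                           const uint8_t key[32]) {
  poly1305_state state;

  CRYPTO_poly1305_init_neon(&state, key);   /* key = r (16 bytes) || s (16 bytes) */
  CRYPTO_poly1305_update_neon(&state, in, in_len);
  CRYPTO_poly1305_finish_neon(&state, mac); /* writes the 16-byte tag */
}
#endif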