/* Copyright (c) 2014, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

/* This implementation of poly1305 is by Andrew Moon
 * (https://github.com/floodyberry/poly1305-donna) and released as public
 * domain. It implements SIMD vectorization based on the algorithm described
 * in http://cr.yp.to/papers.html#neoncrypto. The loop is unrolled to use two
 * powers of r (r^2 and r^4), giving a 64-byte block size. */

#include <openssl/poly1305.h>


#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64)

#include <emmintrin.h>

#define ALIGN(x) __attribute__((aligned(x)))
/* inline is not a keyword in C89. */
#define INLINE
#define U8TO64_LE(m) (*(uint64_t *)(m))
#define U8TO32_LE(m) (*(uint32_t *)(m))
#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v

typedef __m128i xmmi;
typedef unsigned __int128 uint128_t;
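/* uint128_t relies on the unsigned __int128 extension provided by GCC and
 * Clang on 64-bit targets (MSVC has no __int128, hence the OPENSSL_WINDOWS
 * exclusion above). */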

static const uint32_t ALIGN(16) poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
static const uint32_t ALIGN(16) poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
static const uint32_t ALIGN(16) poly1305_x64_sse2_1shl128[4] = {(1 << 24), 0,
                                                                (1 << 24), 0};
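
/* Each 128-bit message block is split into five 26-bit limbs and two blocks
 * are processed at a time, one per 64-bit lane of an xmm register. The message
 * mask keeps the low 26 bits of each lane, FIVE is used to fold the
 * 2^130 = 5 (mod p) wraparound into carries, and 1shl128 is the mandatory
 * 2^128 padding bit expressed in the top limb: 128 - 4*26 = 24, hence
 * (1 << 24). */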

static uint128_t INLINE add128(uint128_t a, uint128_t b) { return a + b; }

static uint128_t INLINE add128_64(uint128_t a, uint64_t b) { return a + b; }

static uint128_t INLINE mul64x64_128(uint64_t a, uint64_t b) {
  return (uint128_t)a * b;
}

static uint64_t INLINE lo128(uint128_t a) { return (uint64_t)a; }

static uint64_t INLINE shr128(uint128_t v, const int shift) {
  return (uint64_t)(v >> shift);
}

static uint64_t INLINE shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
}

typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;

typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5]; /*  80 bytes  */
    uint64_t HH[10];
  };
  /* uint64_t r0,r1,r2;       [24 bytes] */
  /* uint64_t pad0,pad1;      [16 bytes] */
  uint64_t started;        /*   8 bytes  */
  uint64_t leftover;       /*   8 bytes  */
  uint8_t buffer[64];      /*  64 bytes  */
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */

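/* poly1305_state is an opaque, caller-provided byte buffer; round its address
 * up to the next 64-byte boundary so the xmm members above are aligned and the
 * working state starts on a cache-line boundary. */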
static poly1305_state_internal INLINE *poly1305_aligned_state(
    poly1305_state *state) {
  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}

/* copy 0-63 bytes */
static void INLINE
poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
  size_t offset = src - dst;
  if (bytes & 32) {
    _mm_storeu_si128((xmmi *)(dst + 0),
                     _mm_loadu_si128((xmmi *)(dst + offset + 0)));
    _mm_storeu_si128((xmmi *)(dst + 16),
                     _mm_loadu_si128((xmmi *)(dst + offset + 16)));
    dst += 32;
  }
  if (bytes & 16) {
    _mm_storeu_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset)));
    dst += 16;
  }
  if (bytes & 8) {
    *(uint64_t *)dst = *(uint64_t *)(dst + offset);
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = *(uint32_t *)(dst + offset);
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = *(uint16_t *)(dst + offset);
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = *(uint8_t *)(dst + offset);
  }
}

/* zero 0-15 bytes */
static void INLINE poly1305_block_zero(uint8_t *dst, size_t bytes) {
  if (bytes & 8) {
    *(uint64_t *)dst = 0;
    dst += 8;
  }
  if (bytes & 4) {
    *(uint32_t *)dst = 0;
    dst += 4;
  }
  if (bytes & 2) {
    *(uint16_t *)dst = 0;
    dst += 2;
  }
  if (bytes & 1) {
    *(uint8_t *)dst = 0;
  }
}

static size_t INLINE poly1305_min(size_t a, size_t b) {
  return (a < b) ? a : b;
}

void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  poly1305_power *p;
  uint64_t r0, r1, r2;
  uint64_t t0, t1;

  /* clamp key */
  t0 = U8TO64_LE(key + 0);
  t1 = U8TO64_LE(key + 8);
  r0 = t0 & 0xffc0fffffff;
  t0 >>= 44;
  t0 |= t1 << 20;
  r1 = t0 & 0xfffffc0ffff;
  t1 >>= 24;
  r2 = t1 & 0x00ffffffc0f;

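  /* r is carried in three limbs of 44, 44 and 42 bits (r0, r1, r2); the masks
   * above clamp r as required by Poly1305 while splitting it into that
   * radix-2^44 form. */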
  /* store r in unused space of st->P[1] */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);

  /* store pad */
  p->R23.d[1] = U8TO32_LE(key + 16);
  p->R23.d[3] = U8TO32_LE(key + 20);
  p->R24.d[1] = U8TO32_LE(key + 24);
  p->R24.d[3] = U8TO32_LE(key + 28);

  /* H = 0 */
  st->H[0] = _mm_setzero_si128();
  st->H[1] = _mm_setzero_si128();
  st->H[2] = _mm_setzero_si128();
  st->H[3] = _mm_setzero_si128();
  st->H[4] = _mm_setzero_si128();

  st->started = 0;
  st->leftover = 0;
}

static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  /* pull out stored info */
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  /* compute powers r^2,r^4 */
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }
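
  /* The loop ran twice: the first pass squared r into st->P[1] (r^2), the
   * second squared again into st->P[0] (r^4), each stored as five 26-bit
   * limbs replicated across both SSE lanes together with the precomputed
   * 5*r "S" values. Those stores clobbered the radix-2^44 r and the pad, so
   * restore them into the unused dwords of P[1]. */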

  /* put saved info back */
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  /* H = [Mx,My] */
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                          _mm_loadl_epi64((xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                          _mm_loadl_epi64((xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}

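/* Process complete 64-byte chunks. The two SSE lanes hold two interleaved
 * accumulators, so each iteration folds in four 16-byte blocks by computing
 * H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']). */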
static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    /* H *= [r^4,r^4] */
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My]*[r^2,r^2] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                            _mm_loadl_epi64((xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                            _mm_loadl_epi64((xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 32)),
                            _mm_loadl_epi64((xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 40)),
                            _mm_loadl_epi64((xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

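    /* Each product of two 26-bit limbs is at most about 2^52 and only a
     * handful are summed per limb, so the totals above stay well below 2^64;
     * a single carry pass here is enough (lazy reduction). */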
    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}

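/* Absorb up to 32 buffered bytes with [r^2,r^2], then collapse the two SSE
 * lanes into a single scalar accumulator: multiply the lanes by r^2 and r
 * respectively so their powers line up, add the lanes, and store the result
 * in st->HH as three 44/44/42-bit limbs for the scalar finishing code.
 * Returns how many of the buffered bytes were consumed here (0 or 32). */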
static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK = _mm_load_si128((xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  /* p = [r^2,r^2] */
  p = &st->P[1];

  if (bytes >= 32) {
    /* H *= [r^2,r^2] */
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    /* H += [Mx,My] */
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 0)),
                            _mm_loadl_epi64((xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((xmmi *)(m + 8)),
                            _mm_loadl_epi64((xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    /* reduce */
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    /* H = (H*[r^2,r^2] + [Mx,My]) */
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

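  /* Collapse the two lanes: r (still held in radix 2^44 in the unused dwords
   * of P[1]) is re-split into 26-bit limbs and placed in the upper lane
   * (d[2]) next to the r^2 limbs, so one more vector multiply applies r^2 to
   * lane 0 and r to lane 1 before the lanes are summed below. */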
  /* finalize, H *= [r^2,r] */
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  /* H *= [r^2,r] */
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  /* H = H[0]+H[1] */
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

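  /* Fully carry the five 26-bit limbs of the summed accumulator, then repack
   * them into the three 44/44/42-bit limbs used by the scalar finishing
   * code. */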
  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  st->HH[0] = ((t0) | (t1 << 26)) & 0xfffffffffffull;
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & 0xfffffffffffull;
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & 0x3ffffffffffull;

  return consumed;
}

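/* Buffer and dispatch input. The vectorized path needs an initial 32 bytes to
 * seed the two interleaved accumulators (poly1305_first_block); after that,
 * full 64-byte chunks go through poly1305_blocks and anything shorter is held
 * in st->buffer for the next call or for CRYPTO_poly1305_finish. */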
void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
                            size_t bytes) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t want;

  /* need at least 32 initial bytes to start the accelerated branch */
  if (!st->started) {
    if ((st->leftover == 0) && (bytes > 32)) {
      poly1305_first_block(st, m);
      m += 32;
      bytes -= 32;
    } else {
      want = poly1305_min(32 - st->leftover, bytes);
      poly1305_block_copy(st->buffer + st->leftover, m, want);
      bytes -= want;
      m += want;
      st->leftover += want;
      if ((st->leftover < 32) || (bytes == 0)) {
        return;
      }
      poly1305_first_block(st, st->buffer);
      st->leftover = 0;
    }
    st->started = 1;
  }

  /* handle leftover */
  if (st->leftover) {
    want = poly1305_min(64 - st->leftover, bytes);
    poly1305_block_copy(st->buffer + st->leftover, m, want);
    bytes -= want;
    m += want;
    st->leftover += want;
    if (st->leftover < 64) {
      return;
    }
    poly1305_blocks(st, st->buffer, 64);
    st->leftover = 0;
  }

  /* process 64 byte blocks */
  if (bytes >= 64) {
    want = (bytes & ~63);
    poly1305_blocks(st, m, want);
    m += want;
    bytes -= want;
  }

  if (bytes) {
    poly1305_block_copy(st->buffer + st->leftover, m, bytes);
    st->leftover += bytes;
  }
}

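/* Finish: if the SIMD accumulator was used, fold it into st->HH (base 2^44);
 * otherwise HH is still zero. Any remaining buffered bytes are absorbed by
 * the scalar donna loop below, the result is reduced modulo 2^130 - 5, and
 * the pad (key[16..31]) is added to form the tag. */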
void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
  poly1305_state_internal *st = poly1305_aligned_state(state);
  size_t leftover = st->leftover;
  uint8_t *m = st->buffer;
  uint128_t d[3];
  uint64_t h0, h1, h2;
  uint64_t t0, t1;
  uint64_t g0, g1, g2, c, nc;
  uint64_t r0, r1, r2, s1, s2;
  poly1305_power *p;

  if (st->started) {
    size_t consumed = poly1305_combine(st, m, leftover);
    leftover -= consumed;
    m += consumed;
  }

  /* st->HH will either be 0 or have the combined result */
  h0 = st->HH[0];
  h1 = st->HH[1];
  h2 = st->HH[2];

  p = &st->P[1];
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  s1 = r1 * (5 << 2);
  s2 = r2 * (5 << 2);
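
  /* s1/s2 fold the 2^130 = 5 (mod p) wraparound into the multiply: with
   * 44/44/42-bit limbs, a term that overflows 2^130 carries a factor of
   * 2^132 = 4 * 2^130, hence the multiplier 5 << 2 = 20. */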

  if (leftover < 16) {
    goto poly1305_donna_atmost15bytes;
  }

poly1305_donna_atleast16bytes:
  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24) | ((uint64_t)1 << 40);

poly1305_donna_mul:
  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
                mul64x64_128(h2, s1));
  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
                mul64x64_128(h2, s2));
  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
                mul64x64_128(h2, r0));
  h0 = lo128(d[0]) & 0xfffffffffff;
  c = shr128(d[0], 44);
  d[1] = add128_64(d[1], c);
  h1 = lo128(d[1]) & 0xfffffffffff;
  c = shr128(d[1], 44);
  d[2] = add128_64(d[2], c);
  h2 = lo128(d[2]) & 0x3ffffffffff;
  c = shr128(d[2], 42);
  h0 += c * 5;

  m += 16;
  leftover -= 16;
  if (leftover >= 16) {
    goto poly1305_donna_atleast16bytes;
  }

/* final bytes */
poly1305_donna_atmost15bytes:
  if (!leftover) {
    goto poly1305_donna_finish;
  }

  m[leftover++] = 1;
  poly1305_block_zero(m + leftover, 16 - leftover);
  leftover = 16;

  t0 = U8TO64_LE(m + 0);
  t1 = U8TO64_LE(m + 8);
  h0 += t0 & 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += t0 & 0xfffffffffff;
  h2 += (t1 >> 24);

  goto poly1305_donna_mul;

poly1305_donna_finish:
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  h1 += c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  h2 += c;
  c = (h2 >> 42);
  h2 &= 0x3ffffffffff;
  h0 += c * 5;

  g0 = h0 + 5;
  c = (g0 >> 44);
  g0 &= 0xfffffffffff;
  g1 = h1 + c;
  c = (g1 >> 44);
  g1 &= 0xfffffffffff;
  g2 = h2 + c - ((uint64_t)1 << 42);

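  /* Constant-time select of h mod (2^130 - 5): g = h + 5 - 2^130. If the
   * subtraction borrowed (top bit of g2 set), h was already fully reduced and
   * we keep h; otherwise we take g. */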
  c = (g2 >> 63) - 1;
  nc = ~c;
  h0 = (h0 & nc) | (g0 & c);
  h1 = (h1 & nc) | (g1 & c);
  h2 = (h2 & nc) | (g2 & c);

  /* pad */
  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
  h0 += (t0 & 0xfffffffffff);
  c = (h0 >> 44);
  h0 &= 0xfffffffffff;
  t0 = shr128_pair(t1, t0, 44);
  h1 += (t0 & 0xfffffffffff) + c;
  c = (h1 >> 44);
  h1 &= 0xfffffffffff;
  t1 = (t1 >> 24);
  h2 += (t1) + c;

  U64TO8_LE(mac + 0, ((h0) | (h1 << 44)));
  U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}

#endif  /* !OPENSSL_WINDOWS && OPENSSL_X86_64 */