1d9e397b599b13d642138480a28c14db7a136bf0Adam Langley/* Copyright (c) 2014, Google Inc. 2d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * 3d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * Permission to use, copy, modify, and/or distribute this software for any 4d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * purpose with or without fee is hereby granted, provided that the above 5d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * copyright notice and this permission notice appear in all copies. 6d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * 7d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 10d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 12d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 13d9e397b599b13d642138480a28c14db7a136bf0Adam Langley * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 14d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 158f860b133896bf655e4342ecefe692d52df81d48Robert Sloan// This implementation of poly1305 is by Andrew Moon 168f860b133896bf655e4342ecefe692d52df81d48Robert Sloan// (https://github.com/floodyberry/poly1305-donna) and released as public 178f860b133896bf655e4342ecefe692d52df81d48Robert Sloan// domain. It implements SIMD vectorization based on the algorithm described in 188f860b133896bf655e4342ecefe692d52df81d48Robert Sloan// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte 198f860b133896bf655e4342ecefe692d52df81d48Robert Sloan// block size 20d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 21d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#include <openssl/poly1305.h> 22d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 234969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin#include "../internal.h" 244969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin 25d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 26d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#if !defined(OPENSSL_WINDOWS) && defined(OPENSSL_X86_64) 27d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 28d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#include <emmintrin.h> 29d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 304969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin#define U8TO64_LE(m) (*(const uint64_t *)(m)) 314969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin#define U8TO32_LE(m) (*(const uint32_t *)(m)) 32d9e397b599b13d642138480a28c14db7a136bf0Adam Langley#define U64TO8_LE(m, v) (*(uint64_t *)(m)) = v 33d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 34d9e397b599b13d642138480a28c14db7a136bf0Adam Langleytypedef __m128i xmmi; 35d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 364969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = { 37d9e397b599b13d642138480a28c14db7a136bf0Adam Langley (1 << 26) - 1, 0, (1 << 26) - 1, 0}; 384969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0}; 394969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = { 404969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin (1 << 24), 0, (1 << 24), 0}; 41d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 424969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; } 43d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 444969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; } 45d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 464969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic inline uint128_t mul64x64_128(uint64_t a, uint64_t b) { 47d9e397b599b13d642138480a28c14db7a136bf0Adam Langley return (uint128_t)a * b; 48d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 49d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 504969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic inline uint64_t lo128(uint128_t a) { return (uint64_t)a; } 51d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 524969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic inline uint64_t shr128(uint128_t v, const int shift) { 53d9e397b599b13d642138480a28c14db7a136bf0Adam Langley return (uint64_t)(v >> shift); 54d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 55d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 564969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) { 57d9e397b599b13d642138480a28c14db7a136bf0Adam Langley return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift); 58d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 59d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 60d9e397b599b13d642138480a28c14db7a136bf0Adam Langleytypedef struct poly1305_power_t { 61d9e397b599b13d642138480a28c14db7a136bf0Adam Langley union { 62d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi v; 63d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t u[2]; 64d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint32_t d[4]; 65d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } R20, R21, R22, R23, R24, S21, S22, S23, S24; 66d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} poly1305_power; 67d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 68d9e397b599b13d642138480a28c14db7a136bf0Adam Langleytypedef struct poly1305_state_internal_t { 69d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 70d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bytes of free storage */ 71d9e397b599b13d642138480a28c14db7a136bf0Adam Langley union { 728f860b133896bf655e4342ecefe692d52df81d48Robert Sloan xmmi H[5]; // 80 bytes 73d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t HH[10]; 74d9e397b599b13d642138480a28c14db7a136bf0Adam Langley }; 758f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // uint64_t r0,r1,r2; [24 bytes] 768f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // uint64_t pad0,pad1; [16 bytes] 778f860b133896bf655e4342ecefe692d52df81d48Robert Sloan uint64_t started; // 8 bytes 788f860b133896bf655e4342ecefe692d52df81d48Robert Sloan uint64_t leftover; // 8 bytes 798f860b133896bf655e4342ecefe692d52df81d48Robert Sloan uint8_t buffer[64]; // 64 bytes 80d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} poly1305_state_internal; /* 448 bytes total + 63 bytes for 81d9e397b599b13d642138480a28c14db7a136bf0Adam Langley alignment = 511 bytes raw */ 82d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 834969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic inline poly1305_state_internal *poly1305_aligned_state( 84d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_state *state) { 85d9e397b599b13d642138480a28c14db7a136bf0Adam Langley return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63); 86d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 87d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 884969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjaminstatic inline size_t poly1305_min(size_t a, size_t b) { 89d9e397b599b13d642138480a28c14db7a136bf0Adam Langley return (a < b) ? a : b; 90d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 91d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 92d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyvoid CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) { 93d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_state_internal *st = poly1305_aligned_state(state); 94d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_power *p; 95d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t r0, r1, r2; 96d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t t0, t1; 97d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 988f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // clamp key 99d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 = U8TO64_LE(key + 0); 100d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t1 = U8TO64_LE(key + 8); 101d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r0 = t0 & 0xffc0fffffff; 102d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 >>= 44; 103d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 |= t1 << 20; 104d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r1 = t0 & 0xfffffc0ffff; 105d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t1 >>= 24; 106d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r2 = t1 & 0x00ffffffc0f; 107d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 1088f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // store r in un-used space of st->P[1] 109d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p = &st->P[1]; 110d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R20.d[1] = (uint32_t)(r0); 111d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R20.d[3] = (uint32_t)(r0 >> 32); 112d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R21.d[1] = (uint32_t)(r1); 113d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R21.d[3] = (uint32_t)(r1 >> 32); 114d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R22.d[1] = (uint32_t)(r2); 115d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R22.d[3] = (uint32_t)(r2 >> 32); 116d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 1178f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // store pad 118d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R23.d[1] = U8TO32_LE(key + 16); 119d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R23.d[3] = U8TO32_LE(key + 20); 120d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R24.d[1] = U8TO32_LE(key + 24); 121d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R24.d[3] = U8TO32_LE(key + 28); 122d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 1238f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H = 0 124d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[0] = _mm_setzero_si128(); 125d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[1] = _mm_setzero_si128(); 126d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[2] = _mm_setzero_si128(); 127d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[3] = _mm_setzero_si128(); 128d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[4] = _mm_setzero_si128(); 129d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 130d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->started = 0; 131d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->leftover = 0; 132d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 133d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 134d9e397b599b13d642138480a28c14db7a136bf0Adam Langleystatic void poly1305_first_block(poly1305_state_internal *st, 135d9e397b599b13d642138480a28c14db7a136bf0Adam Langley const uint8_t *m) { 1364969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); 1374969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); 1384969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); 139d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi T5, T6; 140d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_power *p; 141d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint128_t d[3]; 142d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t r0, r1, r2; 143d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t r20, r21, r22, s22; 144d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t pad0, pad1; 145d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t c; 146d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t i; 147d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 1488f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // pull out stored info 149d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p = &st->P[1]; 150d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 151d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; 152d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; 153d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; 154d9e397b599b13d642138480a28c14db7a136bf0Adam Langley pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; 155d9e397b599b13d642138480a28c14db7a136bf0Adam Langley pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; 156d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 1578f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // compute powers r^2,r^4 158d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r20 = r0; 159d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r21 = r1; 160d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r22 = r2; 161d9e397b599b13d642138480a28c14db7a136bf0Adam Langley for (i = 0; i < 2; i++) { 162d9e397b599b13d642138480a28c14db7a136bf0Adam Langley s22 = r22 * (5 << 2); 163d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 164d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22)); 165d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21)); 166d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20)); 167d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 168d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r20 = lo128(d[0]) & 0xfffffffffff; 169d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = shr128(d[0], 44); 170d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[1] = add128_64(d[1], c); 171d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r21 = lo128(d[1]) & 0xfffffffffff; 172d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = shr128(d[1], 44); 173d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[2] = add128_64(d[2], c); 174d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r22 = lo128(d[2]) & 0x3ffffffffff; 175d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = shr128(d[2], 42); 176d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r20 += c * 5; 177d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (r20 >> 44); 178d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r20 = r20 & 0xfffffffffff; 179d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r21 += c; 180d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 181d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff), 182d9e397b599b13d642138480a28c14db7a136bf0Adam Langley _MM_SHUFFLE(1, 0, 1, 0)); 183d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R21.v = _mm_shuffle_epi32( 184d9e397b599b13d642138480a28c14db7a136bf0Adam Langley _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff), 185d9e397b599b13d642138480a28c14db7a136bf0Adam Langley _MM_SHUFFLE(1, 0, 1, 0)); 186d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R22.v = 187d9e397b599b13d642138480a28c14db7a136bf0Adam Langley _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff), 188d9e397b599b13d642138480a28c14db7a136bf0Adam Langley _MM_SHUFFLE(1, 0, 1, 0)); 189d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R23.v = _mm_shuffle_epi32( 190d9e397b599b13d642138480a28c14db7a136bf0Adam Langley _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff), 191d9e397b599b13d642138480a28c14db7a136bf0Adam Langley _MM_SHUFFLE(1, 0, 1, 0)); 192d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))), 193d9e397b599b13d642138480a28c14db7a136bf0Adam Langley _MM_SHUFFLE(1, 0, 1, 0)); 194d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->S21.v = _mm_mul_epu32(p->R21.v, FIVE); 195d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->S22.v = _mm_mul_epu32(p->R22.v, FIVE); 196d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->S23.v = _mm_mul_epu32(p->R23.v, FIVE); 197d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->S24.v = _mm_mul_epu32(p->R24.v, FIVE); 198d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p--; 199d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } 200d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 2018f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // put saved info back 202d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p = &st->P[1]; 203d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R20.d[1] = (uint32_t)(r0); 204d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R20.d[3] = (uint32_t)(r0 >> 32); 205d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R21.d[1] = (uint32_t)(r1); 206d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R21.d[3] = (uint32_t)(r1 >> 32); 207d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R22.d[1] = (uint32_t)(r2); 208d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R22.d[3] = (uint32_t)(r2 >> 32); 209d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R23.d[1] = (uint32_t)(pad0); 210d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R23.d[3] = (uint32_t)(pad0 >> 32); 211d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R24.d[1] = (uint32_t)(pad1); 212d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R24.d[3] = (uint32_t)(pad1 >> 32); 213d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 2148f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H = [Mx,My] 2154969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), 2164969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin _mm_loadl_epi64((const xmmi *)(m + 16))); 2174969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), 2184969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin _mm_loadl_epi64((const xmmi *)(m + 24))); 219d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[0] = _mm_and_si128(MMASK, T5); 220d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); 221d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); 222d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[2] = _mm_and_si128(MMASK, T5); 223d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); 224d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); 225d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 226d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 227d9e397b599b13d642138480a28c14db7a136bf0Adam Langleystatic void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, 228d9e397b599b13d642138480a28c14db7a136bf0Adam Langley size_t bytes) { 2294969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); 2304969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); 2314969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); 232d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 233d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_power *p; 234d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi H0, H1, H2, H3, H4; 235d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi T0, T1, T2, T3, T4, T5, T6; 236d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi M0, M1, M2, M3, M4; 237d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi C1, C2; 238d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 239d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H0 = st->H[0]; 240d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H1 = st->H[1]; 241d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H2 = st->H[2]; 242d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H3 = st->H[3]; 243d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H4 = st->H[4]; 244d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 245d9e397b599b13d642138480a28c14db7a136bf0Adam Langley while (bytes >= 64) { 2468f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H *= [r^4,r^4] 247d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p = &st->P[0]; 248d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_mul_epu32(H0, p->R20.v); 249d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_mul_epu32(H0, p->R21.v); 250d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_mul_epu32(H0, p->R22.v); 251d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_mul_epu32(H0, p->R23.v); 252d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_mul_epu32(H0, p->R24.v); 253d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H1, p->S24.v); 254d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H1, p->R20.v); 255d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 256d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 257d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H2, p->S23.v); 258d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H2, p->S24.v); 259d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 260d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 261d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H3, p->S22.v); 262d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H3, p->S23.v); 263d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 264d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 265d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H4, p->S21.v); 266d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H4, p->S22.v); 267d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 268d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 269d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H1, p->R21.v); 270d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H1, p->R22.v); 271d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 272d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 273d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H2, p->R20.v); 274d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H2, p->R21.v); 275d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 276d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 277d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H3, p->S24.v); 278d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H3, p->R20.v); 279d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 280d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 281d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H4, p->S23.v); 282d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H4, p->S24.v); 283d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 284d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 285d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H1, p->R23.v); 286d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 287d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H2, p->R22.v); 288d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 289d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H3, p->R21.v); 290d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 291d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H4, p->R20.v); 292d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 293d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 2948f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H += [Mx,My]*[r^2,r^2] 2954969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), 2964969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin _mm_loadl_epi64((const xmmi *)(m + 16))); 2974969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), 2984969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin _mm_loadl_epi64((const xmmi *)(m + 24))); 299d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M0 = _mm_and_si128(MMASK, T5); 300d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); 301d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); 302d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M2 = _mm_and_si128(MMASK, T5); 303d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); 304d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); 305d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 306d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p = &st->P[1]; 307d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M0, p->R20.v); 308d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M0, p->R21.v); 309d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 310d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 311d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M1, p->S24.v); 312d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M1, p->R20.v); 313d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 314d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 315d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M2, p->S23.v); 316d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M2, p->S24.v); 317d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 318d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 319d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M3, p->S22.v); 320d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M3, p->S23.v); 321d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 322d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 323d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M4, p->S21.v); 324d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M4, p->S22.v); 325d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 326d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 327d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M0, p->R22.v); 328d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M0, p->R23.v); 329d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 330d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 331d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M1, p->R21.v); 332d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M1, p->R22.v); 333d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 334d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 335d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M2, p->R20.v); 336d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M2, p->R21.v); 337d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 338d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 339d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M3, p->S24.v); 340d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M3, p->R20.v); 341d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 342d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 343d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M4, p->S23.v); 344d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(M4, p->S24.v); 345d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 346d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 347d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M0, p->R24.v); 348d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 349d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M1, p->R23.v); 350d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 351d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M2, p->R22.v); 352d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 353d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M3, p->R21.v); 354d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 355d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(M4, p->R20.v); 356d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 357d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 3588f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H += [Mx,My] 3594969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)), 3604969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin _mm_loadl_epi64((const xmmi *)(m + 48))); 3614969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)), 3624969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin _mm_loadl_epi64((const xmmi *)(m + 56))); 363d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M0 = _mm_and_si128(MMASK, T5); 364d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); 365d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); 366d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M2 = _mm_and_si128(MMASK, T5); 367d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); 368d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); 369d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 370d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, M0); 371d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, M1); 372d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, M2); 373d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, M3); 374d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, M4); 375d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 3768f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // reduce 377d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T0, 26); 378d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C2 = _mm_srli_epi64(T3, 26); 379d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_and_si128(T0, MMASK); 380d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_and_si128(T3, MMASK); 381d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, C1); 382d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, C2); 383d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T1, 26); 384d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C2 = _mm_srli_epi64(T4, 26); 385d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_and_si128(T1, MMASK); 386d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_and_si128(T4, MMASK); 387d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, C1); 388d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); 389d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T2, 26); 390d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C2 = _mm_srli_epi64(T0, 26); 391d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_and_si128(T2, MMASK); 392d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_and_si128(T0, MMASK); 393d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, C1); 394d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, C2); 395d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T3, 26); 396d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_and_si128(T3, MMASK); 397d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, C1); 398d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 3998f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) 400d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H0 = T0; 401d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H1 = T1; 402d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H2 = T2; 403d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H3 = T3; 404d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H4 = T4; 405d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 406d9e397b599b13d642138480a28c14db7a136bf0Adam Langley m += 64; 407d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bytes -= 64; 408d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } 409d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 410d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[0] = H0; 411d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[1] = H1; 412d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[2] = H2; 413d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[3] = H3; 414d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->H[4] = H4; 415d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 416d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 417d9e397b599b13d642138480a28c14db7a136bf0Adam Langleystatic size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, 418d9e397b599b13d642138480a28c14db7a136bf0Adam Langley size_t bytes) { 4194969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); 4204969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); 4214969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); 422d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 423d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_power *p; 424d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi H0, H1, H2, H3, H4; 425d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi M0, M1, M2, M3, M4; 426d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi T0, T1, T2, T3, T4, T5, T6; 427d9e397b599b13d642138480a28c14db7a136bf0Adam Langley xmmi C1, C2; 428d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 429d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t r0, r1, r2; 430d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t t0, t1, t2, t3, t4; 431d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t c; 432d9e397b599b13d642138480a28c14db7a136bf0Adam Langley size_t consumed = 0; 433d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 434d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H0 = st->H[0]; 435d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H1 = st->H[1]; 436d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H2 = st->H[2]; 437d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H3 = st->H[3]; 438d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H4 = st->H[4]; 439d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 4408f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // p = [r^2,r^2] 441d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p = &st->P[1]; 442d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 443d9e397b599b13d642138480a28c14db7a136bf0Adam Langley if (bytes >= 32) { 4448f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H *= [r^2,r^2] 445d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_mul_epu32(H0, p->R20.v); 446d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_mul_epu32(H0, p->R21.v); 447d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_mul_epu32(H0, p->R22.v); 448d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_mul_epu32(H0, p->R23.v); 449d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_mul_epu32(H0, p->R24.v); 450d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H1, p->S24.v); 451d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H1, p->R20.v); 452d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 453d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 454d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H2, p->S23.v); 455d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H2, p->S24.v); 456d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 457d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 458d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H3, p->S22.v); 459d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H3, p->S23.v); 460d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 461d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 462d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H4, p->S21.v); 463d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H4, p->S22.v); 464d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 465d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 466d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H1, p->R21.v); 467d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H1, p->R22.v); 468d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 469d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 470d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H2, p->R20.v); 471d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H2, p->R21.v); 472d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 473d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 474d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H3, p->S24.v); 475d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H3, p->R20.v); 476d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 477d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 478d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H4, p->S23.v); 479d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H4, p->S24.v); 480d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 481d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 482d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H1, p->R23.v); 483d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 484d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H2, p->R22.v); 485d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 486d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H3, p->R21.v); 487d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 488d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H4, p->R20.v); 489d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 490d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 4918f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H += [Mx,My] 4924969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), 4934969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin _mm_loadl_epi64((const xmmi *)(m + 16))); 4944969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), 4954969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin _mm_loadl_epi64((const xmmi *)(m + 24))); 496d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M0 = _mm_and_si128(MMASK, T5); 497d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); 498d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); 499d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M2 = _mm_and_si128(MMASK, T5); 500d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); 501d9e397b599b13d642138480a28c14db7a136bf0Adam Langley M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); 502d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 503d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, M0); 504d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, M1); 505d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, M2); 506d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, M3); 507d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, M4); 508d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 5098f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // reduce 510d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T0, 26); 511d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C2 = _mm_srli_epi64(T3, 26); 512d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_and_si128(T0, MMASK); 513d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_and_si128(T3, MMASK); 514d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, C1); 515d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, C2); 516d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T1, 26); 517d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C2 = _mm_srli_epi64(T4, 26); 518d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_and_si128(T1, MMASK); 519d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_and_si128(T4, MMASK); 520d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, C1); 521d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); 522d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T2, 26); 523d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C2 = _mm_srli_epi64(T0, 26); 524d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_and_si128(T2, MMASK); 525d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_and_si128(T0, MMASK); 526d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, C1); 527d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, C2); 528d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T3, 26); 529d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_and_si128(T3, MMASK); 530d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, C1); 531d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 5328f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H = (H*[r^2,r^2] + [Mx,My]) 533d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H0 = T0; 534d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H1 = T1; 535d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H2 = T2; 536d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H3 = T3; 537d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H4 = T4; 538d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 539d9e397b599b13d642138480a28c14db7a136bf0Adam Langley consumed = 32; 540d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } 541d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 5428f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // finalize, H *= [r^2,r] 543d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; 544d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; 545d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; 546d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 547d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R20.d[2] = (uint32_t)(r0)&0x3ffffff; 548d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; 549d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff; 550d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; 551d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->R24.d[2] = (uint32_t)((r2 >> 16)); 552d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->S21.d[2] = p->R21.d[2] * 5; 553d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->S22.d[2] = p->R22.d[2] * 5; 554d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->S23.d[2] = p->R23.d[2] * 5; 555d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p->S24.d[2] = p->R24.d[2] * 5; 556d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 5578f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H *= [r^2,r] 558d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_mul_epu32(H0, p->R20.v); 559d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_mul_epu32(H0, p->R21.v); 560d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_mul_epu32(H0, p->R22.v); 561d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_mul_epu32(H0, p->R23.v); 562d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_mul_epu32(H0, p->R24.v); 563d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H1, p->S24.v); 564d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H1, p->R20.v); 565d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 566d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 567d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H2, p->S23.v); 568d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H2, p->S24.v); 569d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 570d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 571d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H3, p->S22.v); 572d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H3, p->S23.v); 573d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 574d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 575d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H4, p->S21.v); 576d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H4, p->S22.v); 577d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, T5); 578d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, T6); 579d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H1, p->R21.v); 580d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H1, p->R22.v); 581d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 582d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 583d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H2, p->R20.v); 584d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H2, p->R21.v); 585d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 586d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 587d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H3, p->S24.v); 588d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H3, p->R20.v); 589d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 590d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 591d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H4, p->S23.v); 592d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T6 = _mm_mul_epu32(H4, p->S24.v); 593d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, T5); 594d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, T6); 595d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H1, p->R23.v); 596d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 597d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H2, p->R22.v); 598d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 599d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H3, p->R21.v); 600d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 601d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T5 = _mm_mul_epu32(H4, p->R20.v); 602d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, T5); 603d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 604d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T0, 26); 605d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C2 = _mm_srli_epi64(T3, 26); 606d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_and_si128(T0, MMASK); 607d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_and_si128(T3, MMASK); 608d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, C1); 609d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, C2); 610d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T1, 26); 611d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C2 = _mm_srli_epi64(T4, 26); 612d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_and_si128(T1, MMASK); 613d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_and_si128(T4, MMASK); 614d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_add_epi64(T2, C1); 615d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); 616d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T2, 26); 617d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C2 = _mm_srli_epi64(T0, 26); 618d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T2 = _mm_and_si128(T2, MMASK); 619d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T0 = _mm_and_si128(T0, MMASK); 620d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_add_epi64(T3, C1); 621d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T1 = _mm_add_epi64(T1, C2); 622d9e397b599b13d642138480a28c14db7a136bf0Adam Langley C1 = _mm_srli_epi64(T3, 26); 623d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T3 = _mm_and_si128(T3, MMASK); 624d9e397b599b13d642138480a28c14db7a136bf0Adam Langley T4 = _mm_add_epi64(T4, C1); 625d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 6268f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // H = H[0]+H[1] 627d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8)); 628d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8)); 629d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8)); 630d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8)); 631d9e397b599b13d642138480a28c14db7a136bf0Adam Langley H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8)); 632d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 633d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 = _mm_cvtsi128_si32(H0); 634d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (t0 >> 26); 635d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 &= 0x3ffffff; 636d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t1 = _mm_cvtsi128_si32(H1) + c; 637d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (t1 >> 26); 638d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t1 &= 0x3ffffff; 639d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t2 = _mm_cvtsi128_si32(H2) + c; 640d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (t2 >> 26); 641d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t2 &= 0x3ffffff; 642d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t3 = _mm_cvtsi128_si32(H3) + c; 643d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (t3 >> 26); 644d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t3 &= 0x3ffffff; 645d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t4 = _mm_cvtsi128_si32(H4) + c; 646d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (t4 >> 26); 647d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t4 &= 0x3ffffff; 648d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 = t0 + (c * 5); 649d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (t0 >> 26); 650d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 &= 0x3ffffff; 651d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t1 = t1 + c; 652d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 6534969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff); 6544969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff); 6554969cc9b0ab2905ec478277f50ed3849b37a6c6bDavid Benjamin st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff); 656d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 657d9e397b599b13d642138480a28c14db7a136bf0Adam Langley return consumed; 658d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 659d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 660d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyvoid CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m, 661d9e397b599b13d642138480a28c14db7a136bf0Adam Langley size_t bytes) { 662d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_state_internal *st = poly1305_aligned_state(state); 663d9e397b599b13d642138480a28c14db7a136bf0Adam Langley size_t want; 664d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 6658f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // need at least 32 initial bytes to start the accelerated branch 666d9e397b599b13d642138480a28c14db7a136bf0Adam Langley if (!st->started) { 667d9e397b599b13d642138480a28c14db7a136bf0Adam Langley if ((st->leftover == 0) && (bytes > 32)) { 668d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_first_block(st, m); 669d9e397b599b13d642138480a28c14db7a136bf0Adam Langley m += 32; 670d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bytes -= 32; 671d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } else { 672d9e397b599b13d642138480a28c14db7a136bf0Adam Langley want = poly1305_min(32 - st->leftover, bytes); 673d5c2215355e1ae960be386b0d69aed228102cdaeRobert Sloan OPENSSL_memcpy(st->buffer + st->leftover, m, want); 674d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bytes -= want; 675d9e397b599b13d642138480a28c14db7a136bf0Adam Langley m += want; 676d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->leftover += want; 677e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley if ((st->leftover < 32) || (bytes == 0)) { 678d9e397b599b13d642138480a28c14db7a136bf0Adam Langley return; 679e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley } 680d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_first_block(st, st->buffer); 681d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->leftover = 0; 682d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } 683d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->started = 1; 684d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } 685d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 6868f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // handle leftover 687d9e397b599b13d642138480a28c14db7a136bf0Adam Langley if (st->leftover) { 688d9e397b599b13d642138480a28c14db7a136bf0Adam Langley want = poly1305_min(64 - st->leftover, bytes); 689d5c2215355e1ae960be386b0d69aed228102cdaeRobert Sloan OPENSSL_memcpy(st->buffer + st->leftover, m, want); 690d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bytes -= want; 691d9e397b599b13d642138480a28c14db7a136bf0Adam Langley m += want; 692d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->leftover += want; 693e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley if (st->leftover < 64) { 694d9e397b599b13d642138480a28c14db7a136bf0Adam Langley return; 695e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley } 696d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_blocks(st, st->buffer, 64); 697d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->leftover = 0; 698d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } 699d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 7008f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // process 64 byte blocks 701d9e397b599b13d642138480a28c14db7a136bf0Adam Langley if (bytes >= 64) { 702d9e397b599b13d642138480a28c14db7a136bf0Adam Langley want = (bytes & ~63); 703d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_blocks(st, m, want); 704d9e397b599b13d642138480a28c14db7a136bf0Adam Langley m += want; 705d9e397b599b13d642138480a28c14db7a136bf0Adam Langley bytes -= want; 706d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } 707d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 708d9e397b599b13d642138480a28c14db7a136bf0Adam Langley if (bytes) { 709d5c2215355e1ae960be386b0d69aed228102cdaeRobert Sloan OPENSSL_memcpy(st->buffer + st->leftover, m, bytes); 710d9e397b599b13d642138480a28c14db7a136bf0Adam Langley st->leftover += bytes; 711d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } 712d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 713d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 714d9e397b599b13d642138480a28c14db7a136bf0Adam Langleyvoid CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) { 715d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_state_internal *st = poly1305_aligned_state(state); 716d9e397b599b13d642138480a28c14db7a136bf0Adam Langley size_t leftover = st->leftover; 717d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint8_t *m = st->buffer; 718d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint128_t d[3]; 719d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t h0, h1, h2; 720d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t t0, t1; 721d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t g0, g1, g2, c, nc; 722d9e397b599b13d642138480a28c14db7a136bf0Adam Langley uint64_t r0, r1, r2, s1, s2; 723d9e397b599b13d642138480a28c14db7a136bf0Adam Langley poly1305_power *p; 724d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 725d9e397b599b13d642138480a28c14db7a136bf0Adam Langley if (st->started) { 726d9e397b599b13d642138480a28c14db7a136bf0Adam Langley size_t consumed = poly1305_combine(st, m, leftover); 727d9e397b599b13d642138480a28c14db7a136bf0Adam Langley leftover -= consumed; 728d9e397b599b13d642138480a28c14db7a136bf0Adam Langley m += consumed; 729d9e397b599b13d642138480a28c14db7a136bf0Adam Langley } 730d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 7318f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // st->HH will either be 0 or have the combined result 732d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 = st->HH[0]; 733d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h1 = st->HH[1]; 734d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h2 = st->HH[2]; 735d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 736d9e397b599b13d642138480a28c14db7a136bf0Adam Langley p = &st->P[1]; 737d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; 738d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; 739d9e397b599b13d642138480a28c14db7a136bf0Adam Langley r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; 740d9e397b599b13d642138480a28c14db7a136bf0Adam Langley s1 = r1 * (5 << 2); 741d9e397b599b13d642138480a28c14db7a136bf0Adam Langley s2 = r2 * (5 << 2); 742d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 743e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley if (leftover < 16) { 744d9e397b599b13d642138480a28c14db7a136bf0Adam Langley goto poly1305_donna_atmost15bytes; 745e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley } 746d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 747d9e397b599b13d642138480a28c14db7a136bf0Adam Langleypoly1305_donna_atleast16bytes: 748d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 = U8TO64_LE(m + 0); 749d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t1 = U8TO64_LE(m + 8); 750d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 += t0 & 0xfffffffffff; 751d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 = shr128_pair(t1, t0, 44); 752d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h1 += t0 & 0xfffffffffff; 753d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h2 += (t1 >> 24) | ((uint64_t)1 << 40); 754d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 755d9e397b599b13d642138480a28c14db7a136bf0Adam Langleypoly1305_donna_mul: 756d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)), 757d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mul64x64_128(h2, s1)); 758d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)), 759d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mul64x64_128(h2, s2)); 760d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)), 761d9e397b599b13d642138480a28c14db7a136bf0Adam Langley mul64x64_128(h2, r0)); 762d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 = lo128(d[0]) & 0xfffffffffff; 763d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = shr128(d[0], 44); 764d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[1] = add128_64(d[1], c); 765d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h1 = lo128(d[1]) & 0xfffffffffff; 766d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = shr128(d[1], 44); 767d9e397b599b13d642138480a28c14db7a136bf0Adam Langley d[2] = add128_64(d[2], c); 768d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h2 = lo128(d[2]) & 0x3ffffffffff; 769d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = shr128(d[2], 42); 770d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 += c * 5; 771d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 772d9e397b599b13d642138480a28c14db7a136bf0Adam Langley m += 16; 773d9e397b599b13d642138480a28c14db7a136bf0Adam Langley leftover -= 16; 774e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley if (leftover >= 16) { 775d9e397b599b13d642138480a28c14db7a136bf0Adam Langley goto poly1305_donna_atleast16bytes; 776e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley } 777d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 7788f860b133896bf655e4342ecefe692d52df81d48Robert Sloan// final bytes 779d9e397b599b13d642138480a28c14db7a136bf0Adam Langleypoly1305_donna_atmost15bytes: 780e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley if (!leftover) { 781d9e397b599b13d642138480a28c14db7a136bf0Adam Langley goto poly1305_donna_finish; 782e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley } 783d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 784d9e397b599b13d642138480a28c14db7a136bf0Adam Langley m[leftover++] = 1; 785d5c2215355e1ae960be386b0d69aed228102cdaeRobert Sloan OPENSSL_memset(m + leftover, 0, 16 - leftover); 786d9e397b599b13d642138480a28c14db7a136bf0Adam Langley leftover = 16; 787d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 788d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 = U8TO64_LE(m + 0); 789d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t1 = U8TO64_LE(m + 8); 790d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 += t0 & 0xfffffffffff; 791d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 = shr128_pair(t1, t0, 44); 792d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h1 += t0 & 0xfffffffffff; 793d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h2 += (t1 >> 24); 794d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 795d9e397b599b13d642138480a28c14db7a136bf0Adam Langley goto poly1305_donna_mul; 796d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 797d9e397b599b13d642138480a28c14db7a136bf0Adam Langleypoly1305_donna_finish: 798d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (h0 >> 44); 799d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 &= 0xfffffffffff; 800d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h1 += c; 801d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (h1 >> 44); 802d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h1 &= 0xfffffffffff; 803d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h2 += c; 804d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (h2 >> 42); 805d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h2 &= 0x3ffffffffff; 806d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 += c * 5; 807d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 808d9e397b599b13d642138480a28c14db7a136bf0Adam Langley g0 = h0 + 5; 809d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (g0 >> 44); 810d9e397b599b13d642138480a28c14db7a136bf0Adam Langley g0 &= 0xfffffffffff; 811d9e397b599b13d642138480a28c14db7a136bf0Adam Langley g1 = h1 + c; 812d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (g1 >> 44); 813d9e397b599b13d642138480a28c14db7a136bf0Adam Langley g1 &= 0xfffffffffff; 814d9e397b599b13d642138480a28c14db7a136bf0Adam Langley g2 = h2 + c - ((uint64_t)1 << 42); 815d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 816d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (g2 >> 63) - 1; 817d9e397b599b13d642138480a28c14db7a136bf0Adam Langley nc = ~c; 818d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 = (h0 & nc) | (g0 & c); 819d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h1 = (h1 & nc) | (g1 & c); 820d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h2 = (h2 & nc) | (g2 & c); 821d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 8228f860b133896bf655e4342ecefe692d52df81d48Robert Sloan // pad 823d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; 824d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; 825d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 += (t0 & 0xfffffffffff); 826d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (h0 >> 44); 827d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h0 &= 0xfffffffffff; 828d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t0 = shr128_pair(t1, t0, 44); 829d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h1 += (t0 & 0xfffffffffff) + c; 830d9e397b599b13d642138480a28c14db7a136bf0Adam Langley c = (h1 >> 44); 831d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h1 &= 0xfffffffffff; 832d9e397b599b13d642138480a28c14db7a136bf0Adam Langley t1 = (t1 >> 24); 833d9e397b599b13d642138480a28c14db7a136bf0Adam Langley h2 += (t1)+c; 834d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 835d9e397b599b13d642138480a28c14db7a136bf0Adam Langley U64TO8_LE(mac + 0, ((h0) | (h1 << 44))); 836d9e397b599b13d642138480a28c14db7a136bf0Adam Langley U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24))); 837d9e397b599b13d642138480a28c14db7a136bf0Adam Langley} 838d9e397b599b13d642138480a28c14db7a136bf0Adam Langley 8398f860b133896bf655e4342ecefe692d52df81d48Robert Sloan#endif // !OPENSSL_WINDOWS && OPENSSL_X86_64 840