/*
 * version 20110505
 * D. J. Bernstein
 * Public domain.
 *
 * Based on crypto_core/salsa208/armneon/core.c from SUPERCOP 20130419
 */

#define ROUNDS 8

/*
 * Apply ROUNDS rounds of the Salsa20 core to the 64-byte block at `input`,
 * in place, using NEON intrinsics.
 *
 * input: pointer to 64 readable and writable bytes, treated as sixteen
 *        32-bit words x0..x15. The block is read and written with byte-wise
 *        vector loads/stores (vld1q_u8 / vst1q_u8).
 *
 * The result written back is (state after the rounds) + (original state),
 * word by word, in the same x0..x15 order as the input.
 *
 * Requires <arm_neon.h> to be included by the translation unit.
 */
static void
salsa20_8_intrinsic(void * input)
{
	/* Lane mask for vbslq_u32: lanes 0 and 2 come from the first operand,
	 * lanes 1 and 3 from the second. */
	static const uint32_t abab_lanes[4] = {
		0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u
	};
	const uint32x4_t abab = vld1q_u32(abab_lanes);

	uint8_t *block = (uint8_t *) input;
	int i;

	/*
	 * This is modified from the SUPERCOP version since we only have one
	 * argument. Usually you'd rearrange the constant, key, and input bytes,
	 * but here there is just one linear array to rearrange.
	 */

	/* Load the four rows of the 4x4 word matrix. Explicit reinterprets are
	 * used so the code does not rely on lax vector conversions. */
	uint32x4_t x0x1x2x3 = vreinterpretq_u32_u8(vld1q_u8(block));
	uint32x4_t x4x5x6x7 = vreinterpretq_u32_u8(vld1q_u8(block + 16));
	uint32x4_t x8x9x10x11 = vreinterpretq_u32_u8(vld1q_u8(block + 32));
	uint32x4_t x12x13x14x15 = vreinterpretq_u32_u8(vld1q_u8(block + 48));

	/*
	 * Change the input to be diagonals as if it's a 4x4 matrix of 32-bit
	 * values: first swap the high halves between rows two apart, then
	 * blend alternate lanes.
	 */
	uint32x4_t x0x1x10x11 = vcombine_u32(vget_low_u32(x0x1x2x3),
	    vget_high_u32(x8x9x10x11));
	uint32x4_t x4x5x14x15 = vcombine_u32(vget_low_u32(x4x5x6x7),
	    vget_high_u32(x12x13x14x15));
	uint32x4_t x8x9x2x3 = vcombine_u32(vget_low_u32(x8x9x10x11),
	    vget_high_u32(x0x1x2x3));
	uint32x4_t x12x13x6x7 = vcombine_u32(vget_low_u32(x12x13x14x15),
	    vget_high_u32(x4x5x6x7));

	uint32x4_t x0x5x10x15 = vbslq_u32(abab, x0x1x10x11, x4x5x14x15);
	uint32x4_t x8x13x2x7 = vbslq_u32(abab, x8x9x2x3, x12x13x6x7);
	uint32x4_t x4x9x14x3 = vbslq_u32(abab, x4x5x14x15, x8x9x2x3);
	uint32x4_t x12x1x6x11 = vbslq_u32(abab, x12x13x6x7, x0x1x10x11);

	/* Keep the starting state for the final feed-forward addition. */
	const uint32x4_t start0 = x0x5x10x15;
	const uint32x4_t start1 = x12x1x6x11;
	const uint32x4_t start2 = x8x13x2x7;
	const uint32x4_t start3 = x4x9x14x3;

	/* From here on this should be the same as the SUPERCOP version. */

	uint32x4_t diag0 = start0;
	uint32x4_t diag1 = start1;
	uint32x4_t diag2 = start2;
	uint32x4_t diag3 = start3;

	uint32x4_t a0;
	uint32x4_t a1;
	uint32x4_t a2;
	uint32x4_t a3;

	/*
	 * Each iteration performs two rounds. vshlq_n_u32(a, n) followed by
	 * vsriq_n_u32(..., a, 32 - n) forms a 32-bit left rotation of a by n.
	 */
	for (i = ROUNDS; i > 0; i -= 2) {
		/* First round of the pair. */
		a0 = vaddq_u32(diag1, diag0);
		diag3 = veorq_u32(diag3, vsriq_n_u32(vshlq_n_u32(a0, 7), a0, 25));
		a1 = vaddq_u32(diag0, diag3);
		diag2 = veorq_u32(diag2, vsriq_n_u32(vshlq_n_u32(a1, 9), a1, 23));
		a2 = vaddq_u32(diag3, diag2);
		diag1 = veorq_u32(diag1, vsriq_n_u32(vshlq_n_u32(a2, 13), a2, 19));
		a3 = vaddq_u32(diag2, diag1);
		diag0 = veorq_u32(diag0, vsriq_n_u32(vshlq_n_u32(a3, 18), a3, 14));

		/* Rotate lanes so the second round lines up lane-wise. */
		diag3 = vextq_u32(diag3, diag3, 3);
		diag2 = vextq_u32(diag2, diag2, 2);
		diag1 = vextq_u32(diag1, diag1, 1);

		/* Second round of the pair (diag1 and diag3 swap roles). */
		a0 = vaddq_u32(diag3, diag0);
		diag1 = veorq_u32(diag1, vsriq_n_u32(vshlq_n_u32(a0, 7), a0, 25));
		a1 = vaddq_u32(diag0, diag1);
		diag2 = veorq_u32(diag2, vsriq_n_u32(vshlq_n_u32(a1, 9), a1, 23));
		a2 = vaddq_u32(diag1, diag2);
		diag3 = veorq_u32(diag3, vsriq_n_u32(vshlq_n_u32(a2, 13), a2, 19));
		a3 = vaddq_u32(diag2, diag3);
		diag0 = veorq_u32(diag0, vsriq_n_u32(vshlq_n_u32(a3, 18), a3, 14));

		/* Undo the lane rotation before the next iteration. */
		diag1 = vextq_u32(diag1, diag1, 3);
		diag2 = vextq_u32(diag2, diag2, 2);
		diag3 = vextq_u32(diag3, diag3, 1);
	}

	/* Feed-forward: add the starting state back in. */
	x0x5x10x15 = vaddq_u32(diag0, start0);
	x12x1x6x11 = vaddq_u32(diag1, start1);
	x8x13x2x7 = vaddq_u32(diag2, start2);
	x4x9x14x3 = vaddq_u32(diag3, start3);

	/* Convert the diagonals back into rows (inverse of the setup above). */
	x0x1x10x11 = vbslq_u32(abab, x0x5x10x15, x12x1x6x11);
	x12x13x6x7 = vbslq_u32(abab, x12x1x6x11, x8x13x2x7);
	x8x9x2x3 = vbslq_u32(abab, x8x13x2x7, x4x9x14x3);
	x4x5x14x15 = vbslq_u32(abab, x4x9x14x3, x0x5x10x15);

	x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),
	    vget_high_u32(x8x9x2x3));
	x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),
	    vget_high_u32(x12x13x6x7));
	x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),
	    vget_high_u32(x0x1x10x11));
	x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),
	    vget_high_u32(x4x5x14x15));

	/* Write the result back over the input block. */
	vst1q_u8(block, vreinterpretq_u8_u32(x0x1x2x3));
	vst1q_u8(block + 16, vreinterpretq_u8_u32(x4x5x6x7));
	vst1q_u8(block + 32, vreinterpretq_u8_u32(x8x9x10x11));
	vst1q_u8(block + 48, vreinterpretq_u8_u32(x12x13x14x15));
}