/*
 * version 20110505
 * D. J. Bernstein
 * Public domain.
 *
 * Based on crypto_core/salsa208/armneon/core.c from SUPERCOP 20130419
 */
8187f492d9d783d53893c3dea07c0b14841ba61f8Kenny Root
#define ROUNDS 8

/*
 * Apply the Salsa20/8 core, in place, to the 64-byte block at `input`,
 * using ARM NEON intrinsics.
 *
 * input: pointer to 64 readable and writable bytes. They are accessed with
 *        byte-wise vector loads/stores (vld1q_u8 / vst1q_u8) and are
 *        overwritten with the result.
 *
 * Only documented NEON intrinsics are used (no vector-extension operators,
 * no implicit or C-style conversions between vector types), so this does not
 * depend on lax vector conversions being enabled in the compiler.
 */
static void
salsa20_8_intrinsic(void * input)
{
  uint8_t *block = (uint8_t *) input;

  /*
   * Select mask for vbslq_u32: lanes 0 and 2 are taken from the first data
   * operand, lanes 1 and 3 from the second.
   */
  static const uint32_t abab_lanes[4] = {0xffffffffu, 0, 0xffffffffu, 0};
  const uint32x4_t abab = vld1q_u32(abab_lanes);

  /*
   * This is modified since we only have one argument. Usually you'd rearrange
   * the constant, key, and input bytes, but we just have one linear array to
   * rearrange which is a bit easier.
   */

  /* Load the block as four rows of four 32-bit words. */
  uint32x4_t x0x1x2x3 = vreinterpretq_u32_u8(vld1q_u8(block));
  uint32x4_t x4x5x6x7 = vreinterpretq_u32_u8(vld1q_u8(block + 16));
  uint32x4_t x8x9x10x11 = vreinterpretq_u32_u8(vld1q_u8(block + 32));
  uint32x4_t x12x13x14x15 = vreinterpretq_u32_u8(vld1q_u8(block + 48));

  /*
   * Change the input to be diagonals as if it's a 4x4 matrix of 32-bit values.
   * First swap 64-bit halves between rows, then blend alternating lanes.
   */
  uint32x4_t x0x1x10x11 = vcombine_u32(vget_low_u32(x0x1x2x3), vget_high_u32(x8x9x10x11));
  uint32x4_t x4x5x14x15 = vcombine_u32(vget_low_u32(x4x5x6x7), vget_high_u32(x12x13x14x15));
  uint32x4_t x8x9x2x3 = vcombine_u32(vget_low_u32(x8x9x10x11), vget_high_u32(x0x1x2x3));
  uint32x4_t x12x13x6x7 = vcombine_u32(vget_low_u32(x12x13x14x15), vget_high_u32(x4x5x6x7));

  uint32x4_t x0x5x10x15 = vbslq_u32(abab, x0x1x10x11, x4x5x14x15);
  uint32x4_t x8x13x2x7 = vbslq_u32(abab, x8x9x2x3, x12x13x6x7);
  uint32x4_t x4x9x14x3 = vbslq_u32(abab, x4x5x14x15, x8x9x2x3);
  uint32x4_t x12x1x6x11 = vbslq_u32(abab, x12x13x6x7, x0x1x10x11);

  /* Keep the starting diagonals for the feed-forward addition at the end. */
  const uint32x4_t start0 = x0x5x10x15;
  const uint32x4_t start1 = x12x1x6x11;
  const uint32x4_t start2 = x8x13x2x7;
  const uint32x4_t start3 = x4x9x14x3;

  /* From here on the computation follows the SUPERCOP version. */

  uint32x4_t diag0 = start0;
  uint32x4_t diag1 = start1;
  uint32x4_t diag2 = start2;
  uint32x4_t diag3 = start3;

  uint32x4_t a0;
  uint32x4_t a1;
  uint32x4_t a2;
  uint32x4_t a3;

  /*
   * Each iteration performs two rounds. vsriq_n_u32(vshlq_n_u32(a, k), a, 32 - k)
   * is a left rotation of every 32-bit lane of `a` by k bits.
   */
  for (int i = ROUNDS; i > 0; i -= 2) {
    a0 = vaddq_u32(diag1, diag0);
    diag3 = veorq_u32(diag3, vsriq_n_u32(vshlq_n_u32(a0, 7), a0, 25));
    a1 = vaddq_u32(diag0, diag3);
    diag2 = veorq_u32(diag2, vsriq_n_u32(vshlq_n_u32(a1, 9), a1, 23));
    a2 = vaddq_u32(diag3, diag2);
    diag1 = veorq_u32(diag1, vsriq_n_u32(vshlq_n_u32(a2, 13), a2, 19));
    a3 = vaddq_u32(diag2, diag1);
    diag0 = veorq_u32(diag0, vsriq_n_u32(vshlq_n_u32(a3, 18), a3, 14));

    /* Rotate lanes so the second round can reuse the same lane-wise pattern. */
    diag3 = vextq_u32(diag3, diag3, 3);
    diag2 = vextq_u32(diag2, diag2, 2);
    diag1 = vextq_u32(diag1, diag1, 1);

    a0 = vaddq_u32(diag3, diag0);
    diag1 = veorq_u32(diag1, vsriq_n_u32(vshlq_n_u32(a0, 7), a0, 25));
    a1 = vaddq_u32(diag0, diag1);
    diag2 = veorq_u32(diag2, vsriq_n_u32(vshlq_n_u32(a1, 9), a1, 23));
    a2 = vaddq_u32(diag1, diag2);
    diag3 = veorq_u32(diag3, vsriq_n_u32(vshlq_n_u32(a2, 13), a2, 19));
    a3 = vaddq_u32(diag2, diag3);
    diag0 = veorq_u32(diag0, vsriq_n_u32(vshlq_n_u32(a3, 18), a3, 14));

    /* Undo the lane rotation applied before the second round. */
    diag1 = vextq_u32(diag1, diag1, 3);
    diag2 = vextq_u32(diag2, diag2, 2);
    diag3 = vextq_u32(diag3, diag3, 1);
  }

  /* Feed-forward: add the starting diagonals to the mixed state. */
  x0x5x10x15 = vaddq_u32(diag0, start0);
  x12x1x6x11 = vaddq_u32(diag1, start1);
  x8x13x2x7 = vaddq_u32(diag2, start2);
  x4x9x14x3 = vaddq_u32(diag3, start3);

  /* Convert the diagonals back to rows: reverse the blend, then the half swap. */
  x0x1x10x11 = vbslq_u32(abab, x0x5x10x15, x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab, x12x1x6x11, x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab, x8x13x2x7, x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab, x4x9x14x3, x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11), vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15), vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3), vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7), vget_high_u32(x4x5x14x15));

  /* Write the result back over the caller's block. */
  vst1q_u8(block, vreinterpretq_u8_u32(x0x1x2x3));
  vst1q_u8(block + 16, vreinterpretq_u8_u32(x4x5x6x7));
  vst1q_u8(block + 32, vreinterpretq_u8_u32(x8x9x10x11));
  vst1q_u8(block + 48, vreinterpretq_u8_u32(x12x13x14x15));
}
121