1
2/*
3gcc -o v8crypto v8crypto.c -march=armv8-a -mfpu=crypto-neon-fp-armv8
4gcc -o v8crypto v8crypto.c -mfpu=crypto-neon-fp-armv8
5*/
6
7#include <stdio.h>
8#include <assert.h>
9#include <malloc.h>  // memalign
10#include <string.h>  // memset
11#include "tests/malloc.h"
12#include <math.h>    // isnormal
13
/* Short aliases for the basic scalar types used by the tests below. */
typedef  unsigned char           UChar;
typedef  unsigned short int      UShort;
typedef  unsigned int            UInt;
typedef  signed int              Int;
typedef  unsigned long long int  ULong;
typedef  signed long long int    Long;
typedef  double                  Double;
typedef  float                   Float;

/* Minimal boolean: an unsigned char whose two constants already carry
   the Bool type. */
typedef  unsigned char           Bool;
#define False ((Bool)0)
#define True  ((Bool)1)
27
28
/* Number of times each generated test_* function repeats its
   randomise / execute / print sequence per call. */
#define ITERS 1
30
/* Lane-type tag taken by every test_* function and forwarded to
   randV128().  Numbering starts at 1234 and goes up by one per
   enumerator.
   NOTE(review): the names presumably stand for half/single/double
   float and byte/half/word/doubleword lanes; nothing in this file
   depends on that, so confirm before relying on it. */
typedef enum {
   TyHF = 1234,
   TySF,
   TyDF,
   TyB,
   TyH,
   TyS,
   TyD,
   TyNONE
} LaneTy;
34
35union _V128 {
36   UChar  u8[16];
37   UShort u16[8];
38   UInt   u32[4];
39   ULong  u64[2];
40   Float  f32[4];
41   Double f64[2];
42};
43typedef  union _V128   V128;
44
45static inline UChar randUChar ( void )
46{
47   static UInt seed = 80021;
48   seed = 1103515245 * seed + 12345;
49   return (seed >> 17) & 0xFF;
50}
51
52//static ULong randULong ( LaneTy ty )
53//{
54//   Int i;
55//   ULong r = 0;
56//   for (i = 0; i < 8; i++) {
57//      r = (r << 8) | (ULong)(0xFF & randUChar());
58//   }
59//   return r;
60//}
61
62/* Generates a random V128.  Ensures that that it contains normalised
63   FP numbers when viewed as either F32x4 or F64x2, so that it is
64   reasonable to use in FP test cases. */
65static void randV128 ( /*OUT*/V128* v, LaneTy ty )
66{
67   static UInt nCalls = 0, nIters = 0;
68   Int i;
69   nCalls++;
70   while (1) {
71      nIters++;
72      for (i = 0; i < 16; i++) {
73         v->u8[i] = randUChar();
74      }
75      if (isnormal(v->f32[0]) && isnormal(v->f32[1]) && isnormal(v->f32[2])
76          && isnormal(v->f32[3]) && isnormal(v->f64[0]) && isnormal(v->f64[1]))
77        break;
78   }
79   if (0 == (nCalls & 0xFF))
80      printf("randV128: %u calls, %u iters\n", nCalls, nIters);
81}
82
83static void showV128 ( V128* v )
84{
85   Int i;
86   for (i = 15; i >= 0; i--)
87      printf("%02x", (Int)v->u8[i]);
88}
89
90//static void showBlock ( const char* msg, V128* block, Int nBlock )
91//{
92//   Int i;
93//   printf("%s\n", msg);
94//   for (i = 0; i < nBlock; i++) {
95//      printf("  ");
96//      showV128(&block[i]);
97//      printf("\n");
98//   }
99//}
100
101
102/* ---------------------------------------------------------------- */
103/* -- Parameterisable test macros                                -- */
104/* ---------------------------------------------------------------- */
105
/* Executes _action fifty times in a row.  The expansion is a single
   statement, so it can be used as the body of an unbraced `if`. */
#define DO50(_action) \
   do { \
      Int _left = 50; \
      while (_left-- > 0) { _action ; } \
   } while (0)
110
111
/* GEN_TWOVEC_TEST(TESTNAME, INSN, VECREG1NO, VECREG2NO)

   Defines a non-inlined function  static void test_<TESTNAME>(LaneTy ty)
   that exercises INSN, an instruction string involving the two vector
   registers q<VECREG1NO> and q<VECREG2NO>, with no bias as to which of
   them is input or output.

   Each of the ITERS iterations:
     - fills a 5-element V128 block with 0x55 bytes, then overwrites
       block[0..3] with randV128() values (ty is only forwarded there);
     - clears FPSCR, loads q<VECREG1NO> from byte offset 0 and
       q<VECREG2NO> from byte offset 16 of the block, and runs INSN;
     - stores the two registers back at byte offsets 32 and 48 (block[2]
       and block[3] for a 16-byte V128), replacing the random values
       placed there, and stores the resulting FPSCR word at byte
       offset 64 (block[4]);
     - prints INSN, then block[0..3] in hex, then the FPSCR word.

   r8 and r9 are both declared clobbered.  The harness itself only uses
   r9 (for addressing and for moving FPSCR), so it's OK for INSN to use
   r8 as scratch. */
#define GEN_TWOVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO) \
  __attribute__((noinline)) \
  static void test_##TESTNAME ( LaneTy ty ) { \
     Int i; \
     for (i = 0; i < ITERS; i++) { \
        V128 block[4+1]; \
        memset(block, 0x55, sizeof(block)); \
        randV128(&block[0], ty); \
        randV128(&block[1], ty); \
        randV128(&block[2], ty); \
        randV128(&block[3], ty); \
        __asm__ __volatile__( \
           "mov r9, #0 ; vmsr fpscr, r9 ; " \
           "add r9, %0, #0  ; vld1.8 { q"#VECREG1NO" }, [r9] ; " \
           "add r9, %0, #16 ; vld1.8 { q"#VECREG2NO" }, [r9] ; " \
           INSN " ; " \
           "add r9, %0, #32 ; vst1.8 { q"#VECREG1NO" }, [r9] ; " \
           "add r9, %0, #48 ; vst1.8 { q"#VECREG2NO" }, [r9] ; " \
           "vmrs r9, fpscr ; str r9, [%0, #64] " \
           : : "r"(&block[0]) \
             : "cc", "memory", "q"#VECREG1NO, "q"#VECREG2NO, "r8", "r9" \
        ); \
        printf(INSN   "   "); \
        UInt fpscr = 0xFFFFFFFF & block[4].u32[0]; \
        showV128(&block[0]); printf("  "); \
        showV128(&block[1]); printf("  "); \
        showV128(&block[2]); printf("  "); \
        showV128(&block[3]); printf(" fpscr=%08x\n", fpscr); \
     } \
  }
145
146
/* GEN_THREEVEC_TEST(TESTNAME, INSN, VECREG1NO, VECREG2NO, VECREG3NO)

   Defines a non-inlined function  static void test_<TESTNAME>(LaneTy ty)
   that exercises INSN, an instruction string involving the three vector
   registers q<VECREG1NO>, q<VECREG2NO> and q<VECREG3NO>, with no bias
   as to which of them is input or output.

   Each of the ITERS iterations:
     - fills a 7-element V128 block with 0x55 bytes, then overwrites
       block[0..5] with randV128() values (ty is only forwarded there);
     - clears FPSCR, loads the three registers from byte offsets 0, 16
       and 32 of the block, and runs INSN;
     - stores the three registers back at byte offsets 48, 64 and 80
       (block[3..5] for a 16-byte V128), replacing the random values
       placed there, and stores the resulting FPSCR word at byte
       offset 96 (block[6]);
     - prints INSN, then block[0..5] in hex, then the FPSCR word.

   r8 and r9 are both declared clobbered.  The harness itself only uses
   r9 (for addressing and for moving FPSCR), so it's also OK for INSN to
   use r8 as scratch. */
#define GEN_THREEVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO,VECREG3NO)  \
  __attribute__((noinline)) \
  static void test_##TESTNAME ( LaneTy ty ) { \
     Int i; \
     for (i = 0; i < ITERS; i++) { \
        V128 block[6+1]; \
        memset(block, 0x55, sizeof(block)); \
        randV128(&block[0], ty); \
        randV128(&block[1], ty); \
        randV128(&block[2], ty); \
        randV128(&block[3], ty); \
        randV128(&block[4], ty); \
        randV128(&block[5], ty); \
        __asm__ __volatile__( \
           "mov r9, #0 ; vmsr fpscr, r9 ; " \
           "add r9, %0, #0  ; vld1.8 { q"#VECREG1NO" }, [r9] ; " \
           "add r9, %0, #16 ; vld1.8 { q"#VECREG2NO" }, [r9] ; " \
           "add r9, %0, #32 ; vld1.8 { q"#VECREG3NO" }, [r9] ; " \
           INSN " ; " \
           "add r9, %0, #48 ; vst1.8 { q"#VECREG1NO" }, [r9] ; " \
           "add r9, %0, #64 ; vst1.8 { q"#VECREG2NO" }, [r9] ; " \
           "add r9, %0, #80 ; vst1.8 { q"#VECREG3NO" }, [r9] ; " \
           "vmrs r9, fpscr ; str r9, [%0, #96] " \
           : : "r"(&block[0]) \
           : "cc", "memory", "q"#VECREG1NO, "q"#VECREG2NO, "q"#VECREG3NO, \
             "r8", "r9" \
        ); \
        printf(INSN   "   "); \
        UInt fpscr = 0xFFFFFFFF & block[6].u32[0]; \
        showV128(&block[0]); printf("  "); \
        showV128(&block[1]); printf("  "); \
        showV128(&block[2]); printf("  "); \
        showV128(&block[3]); printf("  "); \
        showV128(&block[4]); printf("  "); \
        showV128(&block[5]); printf(" fpscr=%08x\n", fpscr); \
     } \
  }
187
// ======================== CRYPTO ========================

// Each line below expands to a static function test_<name>(LaneTy).
// The trailing numbers are the Q-register numbers that appear in the
// instruction string, listed in the same order.

// AES instructions.
GEN_TWOVEC_TEST(aesd_q_q,   "aesd.8 q3, q4",     3,  4)
GEN_TWOVEC_TEST(aese_q_q,   "aese.8 q12, q13",  12, 13)
GEN_TWOVEC_TEST(aesimc_q_q, "aesimc.8 q15, q0", 15,  0)
GEN_TWOVEC_TEST(aesmc_q_q,  "aesmc.8 q1, q9",    1,  9)

// SHA1 instructions.
GEN_THREEVEC_TEST(sha1c_q_q_q,   "sha1c.32 q11, q10, q2",   11, 10, 2)
GEN_TWOVEC_TEST(sha1h_q_q,       "sha1h.32 q6, q7",         6, 7)
GEN_THREEVEC_TEST(sha1m_q_q_q,   "sha1m.32 q2, q8, q13",    2, 8, 13)
GEN_THREEVEC_TEST(sha1p_q_q_q,   "sha1p.32 q3, q9, q14",    3, 9, 14)
GEN_THREEVEC_TEST(sha1su0_q_q_q, "sha1su0.32 q4, q10, q15", 4, 10, 15)
GEN_TWOVEC_TEST(sha1su1_q_q,     "sha1su1.32 q11, q2",      11, 2)

// SHA256 instructions.
GEN_THREEVEC_TEST(sha256h2_q_q_q,  "sha256h2.32 q9, q8, q7",     9, 8, 7)
GEN_THREEVEC_TEST(sha256h_q_q_q,   "sha256h.32 q10, q9, q8",     10, 9, 8)
GEN_TWOVEC_TEST(sha256su0_q_q,     "sha256su0.32 q11, q10",      11, 10)
GEN_THREEVEC_TEST(sha256su1_q_q_q, "sha256su1.32 q12, q11, q10", 12, 11, 10)
206
// This is a bit complex.  The insn really mentions three registers, so
// it should really be a THREEVEC variant.  But the two source registers
// are D registers.  So we say it is just a TWOVEC insn, producing a Q
// (q13) and taking a single Q (q7); q7 is the d14-d15 register pair,
// which is why the insn itself mentions d14 and d15 whereas the
// numbers that follow mention q7.  The result (q13) is 128 bits wide and
// so is unaffected by these shenanigans.
GEN_TWOVEC_TEST(pmull_q_d_d,  "vmull.p64 q13, d14, d15", 13, 7)
215
216int main ( void )
217{
218   // ======================== CRYPTO ========================
219
220   // aesd.8     q_q (aes single round decryption)
221   // aese.8     q_q (aes single round encryption)
222   // aesimc.8   q_q (aes inverse mix columns)
223   // aesmc.8    q_q (aes mix columns)
224   if (1) DO50( test_aesd_q_q(TyNONE) );
225   if (1) DO50( test_aese_q_q(TyNONE) );
226   if (1) DO50( test_aesimc_q_q(TyNONE) );
227   if (1) DO50( test_aesmc_q_q(TyNONE) );
228
229   // sha1c.32   q_q_q
230   // sha1h.32   q_q
231   // sha1m.32   q_q_q
232   // sha1p.32   q_q_q
233   // sha1su0.32 q_q_q
234   // sha1su1.32 q_q
235   if (1) DO50( test_sha1c_q_q_q(TyNONE) );
236   if (1) DO50( test_sha1h_q_q(TyNONE) );
237   if (1) DO50( test_sha1m_q_q_q(TyNONE) );
238   if (1) DO50( test_sha1p_q_q_q(TyNONE) );
239   if (1) DO50( test_sha1su0_q_q_q(TyNONE) );
240   if (1) DO50( test_sha1su1_q_q(TyNONE) );
241
242   // sha256h2.32  q_q_q
243   // sha256h.32   q_q_q
244   // sha256su0.32 q_q
245   // sha256su1.32 q_q_q
246   if (1) DO50( test_sha256h2_q_q_q(TyNONE) );
247   if (1) DO50( test_sha256h_q_q_q(TyNONE) );
248   if (1) DO50( test_sha256su0_q_q(TyNONE) );
249   if (1) DO50( test_sha256su1_q_q_q(TyNONE) );
250
251   // vmull.64  q_d_d
252   if (1) DO50( test_pmull_q_d_d(TyD) );
253
254   return 0;
255}
256