/*

Copyright (c) 2013 STMicroelectronics
Written by Christophe Lyon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

*/

26#if defined(__arm__) || defined(__aarch64__)
27#include <arm_neon.h>
28#else
29#include "stm-arm-neon.h"
30#endif
31#include "stm-arm-neon-ref.h"
32
/* Initialization helpers.  Each one expands to an INIT_TAB*_<N> element
   list instantiated with the element type T<W>_t (for instance
   MY_INIT_TAB(int,8,8) uses INIT_TAB_8 with int8_t).  The table name is
   built with xNAME, which comes from the reference header and is
   expected to paste its two arguments with an underscore.
   4 slices (INIT_TAB, INIT_TAB2, INIT_TAB3 and INIT_TAB4) are needed
   for vld2, vld3 and vld4.  */
#define MY_INIT_TAB(T,W,N) xNAME(INIT_TAB,N)(T##W##_t)
#define MY_INIT_TAB2(T,W,N) xNAME(INIT_TAB2,N)(T##W##_t)
#define MY_INIT_TAB3(T,W,N) xNAME(INIT_TAB3,N)(T##W##_t)
#define MY_INIT_TAB4(T,W,N) xNAME(INIT_TAB4,N)(T##W##_t)

/* Initialized input buffers.
   Defines the array declared by VECT_VAR_DECL(V,T,W,N) and fills it
   with the MY_INIT_TAB(T,W,N) element list.  The macro deliberately
   does not end with a semicolon: every invocation supplies its own,
   so no empty declaration is left at file scope.  */
#define VECT_VAR_DECL_INIT(V, T, W, N)			\
  VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,N) }

/* Specialized initializer with 4 entries, as used by vldX_dup and
   vdup tests, which iterate 4 times on input buffers.  N is only used
   to name the variable; the element list is always the 4-entry one.
   As with VECT_VAR_DECL_INIT, the invocation supplies the terminating
   semicolon.  */
#define VECT_VAR_DECL_INIT4(V, T, W, N)			\
  VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,4) }

/* Initializers for arrays of vectors.
   VECT_ARRAY_INIT<L>(V,T,W,N) defines the T<W>_t array named by
   VECT_ARRAY_VAR(V,T,W,N,L) and fills it with the first L slices
   (MY_INIT_TAB, MY_INIT_TAB2, ...) for N elements.  The macros do not
   end with a semicolon: each invocation supplies its own, so no empty
   declaration is left at file scope.  */
#define VECT_ARRAY_INIT2(V, T, W, N)		\
  T##W##_t VECT_ARRAY_VAR(V,T,W,N,2)[] =	\
  { MY_INIT_TAB(T,W,N)				\
    MY_INIT_TAB2(T,W,N) }

#define VECT_ARRAY_INIT3(V, T, W, N)					\
  T##W##_t VECT_ARRAY_VAR(V,T,W,N,3)[] =				\
  { MY_INIT_TAB(T,W,N)							\
    MY_INIT_TAB2(T,W,N)							\
    MY_INIT_TAB3(T,W,N) }

#define VECT_ARRAY_INIT4(V, T, W, N)					\
  T##W##_t VECT_ARRAY_VAR(V,T,W,N,4)[] =				\
  { MY_INIT_TAB(T,W,N)							\
    MY_INIT_TAB2(T,W,N)							\
    MY_INIT_TAB3(T,W,N)							\
    MY_INIT_TAB4(T,W,N) }

/* Sample initialization vectors.
   INIT_TAB<k>_<N>(T) expands to N consecutive integer values, each cast
   to T and followed by a comma, so that several slices can be
   concatenated inside one brace-enclosed initializer.  For a given N,
   INIT_TAB starts at -16 and INIT_TAB2, INIT_TAB3 and INIT_TAB4
   continue the sequence where the previous slice stopped.  */
#define INIT_TAB_1(T)				\
  (T)-16,
#define INIT_TAB2_1(T)				\
  (T)-15,
#define INIT_TAB3_1(T)				\
  (T)-14,
#define INIT_TAB4_1(T)				\
  (T)-13,

#define INIT_TAB_2(T)				\
  (T)-16, (T)-15,
#define INIT_TAB2_2(T)				\
  (T)-14, (T)-13,
#define INIT_TAB3_2(T)				\
  (T)-12, (T)-11,
#define INIT_TAB4_2(T)				\
  (T)-10, (T)-9,

/* Initializer for vld3_lane tests (only the first slice exists).  */
#define INIT_TAB_3(T)				\
  (T)-16, (T)-15, (T)-14,

#define INIT_TAB_4(T)				\
  (T)-16, (T)-15, (T)-14, (T)-13,
#define INIT_TAB2_4(T)				\
  (T)-12, (T)-11, (T)-10, (T)-9,
#define INIT_TAB3_4(T)				\
  (T)-8, (T)-7, (T)-6, (T)-5,
#define INIT_TAB4_4(T)				\
  (T)-4, (T)-3, (T)-2, (T)-1,

#define INIT_TAB_8(T)							\
  (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9,
#define INIT_TAB2_8(T)							\
  (T)-8, (T)-7, (T)-6, (T)-5, (T)-4, (T)-3, (T)-2, (T)-1,
#define INIT_TAB3_8(T)							\
  (T)0, (T)1, (T)2, (T)3, (T)4, (T)5, (T)6, (T)7,
#define INIT_TAB4_8(T)							\
  (T)8, (T)9, (T)10, (T)11, (T)12, (T)13, (T)14, (T)15,

#define INIT_TAB_16(T)							\
  (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9,	\
  (T)-8, (T)-7, (T)-6, (T)-5, (T)-4, (T)-3, (T)-2, (T)-1,
#define INIT_TAB2_16(T)							\
  (T)0, (T)1, (T)2, (T)3, (T)4, (T)5, (T)6, (T)7,			\
  (T)8, (T)9, (T)10, (T)11, (T)12, (T)13, (T)14, (T)15,
#define INIT_TAB3_16(T)							\
  (T)16, (T)17, (T)18, (T)19, (T)20, (T)21, (T)22, (T)23,		\
  (T)24, (T)25, (T)26, (T)27, (T)28, (T)29, (T)30, (T)31,
#define INIT_TAB4_16(T)							\
  (T)32, (T)33, (T)34, (T)35, (T)36, (T)37, (T)38, (T)39,		\
  (T)40, (T)41, (T)42, (T)43, (T)44, (T)45, (T)46, (T)47,

122/* Input buffers, one of each size.  */
123/* Insert some padding to try to exhibit out of bounds accesses.  */
124VECT_VAR_DECL_INIT(buffer, int, 8, 8);
125PAD(buffer_pad, int, 8, 8);
126VECT_VAR_DECL_INIT(buffer, int, 16, 4);
127PAD(buffer_pad, int, 16, 4);
128VECT_VAR_DECL_INIT(buffer, int, 32, 2);
129PAD(buffer_pad, int, 32, 2);
130VECT_VAR_DECL_INIT(buffer, int, 64, 1);
131PAD(buffer_pad, int, 64, 1);
132VECT_VAR_DECL_INIT(buffer, uint, 8, 8);
133PAD(buffer_pad, uint, 8, 8);
134VECT_VAR_DECL_INIT(buffer, poly, 8, 8);
135PAD(buffer_pad, poly, 8, 8);
136VECT_VAR_DECL_INIT(buffer, poly, 16, 4);
137PAD(buffer_pad, poly, 16, 4);
138VECT_VAR_DECL_INIT(buffer, uint, 16, 4);
139PAD(buffer_pad, uint, 16, 4);
140VECT_VAR_DECL_INIT(buffer, uint, 32, 2);
141PAD(buffer_pad, uint, 32, 2);
142VECT_VAR_DECL_INIT(buffer, uint, 64, 1);
143PAD(buffer_pad, uint, 64, 1);
144VECT_VAR_DECL_INIT(buffer, float, 32, 2);
145PAD(buffer_pad, float, 32, 2);
146#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
147/* We need a different initialization for ARMCC, because the compiler
148   performs the conversion to half-precision internal
149   representation.  */
150#ifdef __ARMCC_VERSION
151__fp16 buffer_float16x4[4] = {-16, -15, -14, -13};
152#else
153VECT_VAR_DECL(buffer, float, 16, 4) [] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
154					  0xcb00 /* -14 */, 0xca80 /* -13 */};
155#endif
156PAD(buffer_pad, float, 16, 4);
157#endif
158VECT_VAR_DECL_INIT(buffer, int, 8, 16);
159PAD(buffer_pad, int, 8, 16);
160VECT_VAR_DECL_INIT(buffer, int, 16, 8);
161PAD(buffer_pad, int, 16, 8);
162VECT_VAR_DECL_INIT(buffer, int, 32, 4);
163PAD(buffer_pad, int, 32, 4);
164VECT_VAR_DECL_INIT(buffer, int, 64, 2);
165PAD(buffer_pad, int, 64, 2);
166VECT_VAR_DECL_INIT(buffer, uint, 8, 16);
167PAD(buffer_pad, uint, 8, 16);
168VECT_VAR_DECL_INIT(buffer, uint, 16, 8);
169PAD(buffer_pad, uint, 16, 8);
170VECT_VAR_DECL_INIT(buffer, uint, 32, 4);
171PAD(buffer_pad, uint, 32, 4);
172VECT_VAR_DECL_INIT(buffer, uint, 64, 2);
173PAD(buffer_pad, uint, 64, 2);
174VECT_VAR_DECL_INIT(buffer, poly, 8, 16);
175PAD(buffer_pad, poly, 8, 16);
176VECT_VAR_DECL_INIT(buffer, poly, 16, 8);
177PAD(buffer_pad, poly, 16, 8);
178VECT_VAR_DECL_INIT(buffer, float, 32, 4);
179PAD(buffer_pad, float, 32, 4);
180#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
181#ifdef __ARMCC_VERSION
182__fp16 buffer_float16x8[8] = {-16, -15, -14, -13, -12, -11, -10, -9};
183#else
184VECT_VAR_DECL(buffer, float, 16, 8) [] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
185					  0xcb00 /* -14 */, 0xca80 /* -13 */,
186					  0xca00 /* -12 */, 0xc980 /* -11 */,
187					  0xc900 /* -10 */, 0xc880 /* -9 */};
188#endif
189PAD(buffer_pad, float, 16, 8);
190#endif
191
192/* The tests for vld1_dup and vdup expect at least 4 entries in the
193   input buffer, so force 1- and 2-elements initializers to have 4
194   entries.  */
195VECT_VAR_DECL_INIT(buffer_dup, int, 8, 8);
196VECT_VAR_DECL(buffer_dup_pad, int, 8, 8);
197VECT_VAR_DECL_INIT(buffer_dup, int, 16, 4);
198VECT_VAR_DECL(buffer_dup_pad, int, 16, 4);
199VECT_VAR_DECL_INIT4(buffer_dup, int, 32, 2);
200VECT_VAR_DECL(buffer_dup_pad, int, 32, 2);
201VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 1);
202VECT_VAR_DECL(buffer_dup_pad, int, 64, 1);
203VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 8);
204VECT_VAR_DECL(buffer_dup_pad, uint, 8, 8);
205VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 4);
206VECT_VAR_DECL(buffer_dup_pad, uint, 16, 4);
207VECT_VAR_DECL_INIT4(buffer_dup, uint, 32, 2);
208VECT_VAR_DECL(buffer_dup_pad, uint, 32, 2);
209VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 1);
210VECT_VAR_DECL(buffer_dup_pad, uint, 64, 1);
211VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 8);
212VECT_VAR_DECL(buffer_dup_pad, poly, 8, 8);
213VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 4);
214VECT_VAR_DECL(buffer_dup_pad, poly, 16, 4);
215VECT_VAR_DECL_INIT4(buffer_dup, float, 32, 2);
216VECT_VAR_DECL(buffer_dup_pad, float, 32, 2);
217#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
218#ifdef __ARMCC_VERSION
219__fp16 buffer_dup_float16x4[4] = {-16, -15, -14, -13};
220#else
221VECT_VAR_DECL(buffer_dup, float, 16, 4)[] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
222					     0xcb00 /* -14 */, 0xca80 /* -13 */};
223#endif
224PAD(buffer_dup_pad, float, 16, 4);
225#endif
226VECT_VAR_DECL_INIT(buffer_dup, int, 8, 16);
227VECT_VAR_DECL(buffer_dup_pad, int, 8, 16);
228VECT_VAR_DECL_INIT(buffer_dup, int, 16, 8);
229VECT_VAR_DECL(buffer_dup_pad, int, 16, 8);
230VECT_VAR_DECL_INIT(buffer_dup, int, 32, 4);
231VECT_VAR_DECL(buffer_dup_pad, int, 32, 4);
232VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 2);
233VECT_VAR_DECL(buffer_dup_pad, int, 64, 2);
234VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 16);
235VECT_VAR_DECL(buffer_dup_pad, uint, 8, 16);
236VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 8);
237VECT_VAR_DECL(buffer_dup_pad, uint, 16, 8);
238VECT_VAR_DECL_INIT(buffer_dup, uint, 32, 4);
239VECT_VAR_DECL(buffer_dup_pad, uint, 32, 4);
240VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 2);
241VECT_VAR_DECL(buffer_dup_pad, uint, 64, 2);
242VECT_VAR_DECL_INIT(buffer_dup, poly, 8, 16);
243VECT_VAR_DECL(buffer_dup_pad, poly, 8, 16);
244VECT_VAR_DECL_INIT(buffer_dup, poly, 16, 8);
245VECT_VAR_DECL(buffer_dup_pad, poly, 16, 8);
246VECT_VAR_DECL_INIT(buffer_dup, float, 32, 4);
247VECT_VAR_DECL(buffer_dup_pad, float, 32, 4);
248#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
249#ifdef __ARMCC_VERSION
250__fp16 buffer_dup_float16x8[8] = {-16, -15, -14, -13, -12, -11, -10, -9};
251#else
252VECT_VAR_DECL(buffer_dup, float, 16, 8)[] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
253					     0xcb00 /* -14 */, 0xca80 /* -13 */,
254					     0xca00 /* -12 */, 0xc980 /* -11 */,
255					     0xc900 /* -10 */, 0xc880 /* -9 */};
256#endif
257PAD(buffer_dup_pad, float, 16, 8);
258#endif
259
260/* Input buffers for vld2, 1 of each size */
261VECT_ARRAY_INIT2(buffer_vld2, int, 8, 8);
262PAD(buffer_vld2_pad, int, 8, 8);
263VECT_ARRAY_INIT2(buffer_vld2, int, 16, 4);
264PAD(buffer_vld2_pad, int, 16, 4);
265VECT_ARRAY_INIT2(buffer_vld2, int, 32, 2);
266PAD(buffer_vld2_pad, int, 32, 2);
267VECT_ARRAY_INIT2(buffer_vld2, int, 64, 1);
268PAD(buffer_vld2_pad, int, 64, 1);
269VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 8);
270PAD(buffer_vld2_pad, uint, 8, 8);
271VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 4);
272PAD(buffer_vld2_pad, uint, 16, 4);
273VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 2);
274PAD(buffer_vld2_pad, uint, 32, 2);
275VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 1);
276PAD(buffer_vld2_pad, uint, 64, 1);
277VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 8);
278PAD(buffer_vld2_pad, poly, 8, 8);
279VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
280PAD(buffer_vld2_pad, poly, 16, 4);
281VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2);
282PAD(buffer_vld2_pad, float, 32, 2);
283#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
284#ifdef __ARMCC_VERSION
285__fp16 buffer_vld2_float16x4x2[4*2] = {-16, -15, -14, -13, -12, -11, -10, -9};
286#else
287float16_t buffer_vld2_float16x4x2[4*2] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
288					  0xcb00 /* -14 */, 0xca80 /* -13 */,
289					  0xca00 /* -12 */, 0xc980 /* -11 */,
290					  0xc900 /* -10 */, 0xc880 /* -9 */};
291#endif
292PAD(buffer_vld2_pad, float, 16, 4);
293#endif
294VECT_ARRAY_INIT2(buffer_vld2, int, 8, 16);
295PAD(buffer_vld2_pad, int, 8, 16);
296VECT_ARRAY_INIT2(buffer_vld2, int, 16, 8);
297PAD(buffer_vld2_pad, int, 16, 8);
298VECT_ARRAY_INIT2(buffer_vld2, int, 32, 4);
299PAD(buffer_vld2_pad, int, 32, 4);
300VECT_ARRAY_INIT2(buffer_vld2, int, 64, 2);
301PAD(buffer_vld2_pad, int, 64, 2);
302VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 16);
303PAD(buffer_vld2_pad, uint, 8, 16);
304VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 8);
305PAD(buffer_vld2_pad, uint, 16, 8);
306VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 4);
307PAD(buffer_vld2_pad, uint, 32, 4);
308VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 2);
309PAD(buffer_vld2_pad, uint, 64, 2);
310VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 16);
311PAD(buffer_vld2_pad, poly, 8, 16);
312VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
313PAD(buffer_vld2_pad, poly, 16, 8);
314VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4);
315PAD(buffer_vld2_pad, float, 32, 4);
316#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
317#ifdef __ARMCC_VERSION
318__fp16 buffer_vld2_float16x8x2[8*2] = {-16, -15, -14, -13, -12, -11, -10, -9,
319				       -8, -7, -6, -5, -4, -3, -2, -1};
320#else
321float16_t buffer_vld2_float16x8x2[8*2] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
322					  0xcb00 /* -14 */, 0xca80 /* -13 */,
323					  0xca00 /* -12 */, 0xc980 /* -11 */,
324					  0xc900 /* -10 */, 0xc880 /* -9 */,
325					  0xc800 /* -8 */, 0xc700 /* -7 */,
326					  0xc600 /* -6 */, 0xc500 /* -5 */,
327					  0xc400 /* -4 */, 0xc200 /* -3 */,
328					  0xc000 /* -2 */, 0xbc00 /* -1 */};
329#endif
330PAD(buffer_vld2_pad, float, 16, 8);
331#endif
332
333/* Input buffers for vld3, 1 of each size */
334VECT_ARRAY_INIT3(buffer_vld3, int, 8, 8);
335PAD(buffer_vld3_pad, int, 8, 8);
336VECT_ARRAY_INIT3(buffer_vld3, int, 16, 4);
337PAD(buffer_vld3_pad, int, 16, 4);
338VECT_ARRAY_INIT3(buffer_vld3, int, 32, 2);
339PAD(buffer_vld3_pad, int, 32, 2);
340VECT_ARRAY_INIT3(buffer_vld3, int, 64, 1);
341PAD(buffer_vld3_pad, int, 64, 1);
342VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 8);
343PAD(buffer_vld3_pad, uint, 8, 8);
344VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 4);
345PAD(buffer_vld3_pad, uint, 16, 4);
346VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 2);
347PAD(buffer_vld3_pad, uint, 32, 2);
348VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 1);
349PAD(buffer_vld3_pad, uint, 64, 1);
350VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 8);
351PAD(buffer_vld3_pad, poly, 8, 8);
352VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
353PAD(buffer_vld3_pad, poly, 16, 4);
354VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2);
355PAD(buffer_vld3_pad, float, 32, 2);
356#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
357#ifdef __ARMCC_VERSION
358__fp16 buffer_vld3_float16x4x3[4*3] = {-16, -15, -14, -13, -12, -11, -10, -9,
359				       -8, -7, -6, -5};
360#else
361float16_t buffer_vld3_float16x4x3[4*3] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
362					  0xcb00 /* -14 */, 0xca80 /* -13 */,
363					  0xca00 /* -12 */, 0xc980 /* -11 */,
364					  0xc900 /* -10 */, 0xc880 /* -9 */,
365					  0xc800 /* -8 */, 0xc700 /* -7 */,
366					  0xc600 /* -6 */, 0xc500 /* -5 */};
367#endif
368PAD(buffer_vld3_pad, float, 16, 4);
369#endif
370VECT_ARRAY_INIT3(buffer_vld3, int, 8, 16);
371PAD(buffer_vld3_pad, int, 8, 16);
372VECT_ARRAY_INIT3(buffer_vld3, int, 16, 8);
373PAD(buffer_vld3_pad, int, 16, 8);
374VECT_ARRAY_INIT3(buffer_vld3, int, 32, 4);
375PAD(buffer_vld3_pad, int, 32, 4);
376VECT_ARRAY_INIT3(buffer_vld3, int, 64, 2);
377PAD(buffer_vld3_pad, int, 64, 2);
378VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 16);
379PAD(buffer_vld3_pad, uint, 8, 16);
380VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 8);
381PAD(buffer_vld3_pad, uint, 16, 8);
382VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 4);
383PAD(buffer_vld3_pad, uint, 32, 4);
384VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 2);
385PAD(buffer_vld3_pad, uint, 64, 2);
386VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 16);
387PAD(buffer_vld3_pad, poly, 8, 16);
388VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
389PAD(buffer_vld3_pad, poly, 16, 8);
390VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4);
391PAD(buffer_vld3_pad, float, 32, 4);
392#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
393#ifdef __ARMCC_VERSION
394__fp16 buffer_vld3_float16x8x3[8*3] = {-16, -15, -14, -13, -12, -11, -10, -9,
395				       -8, -7, -6, -5, -4, -3, -2, -1,
396				       0, 1, 2, 3, 4, 5, 6, 7};
397#else
398float16_t buffer_vld3_float16x8x3[8*3] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
399					  0xcb00 /* -14 */, 0xca80 /* -13 */,
400					  0xca00 /* -12 */, 0xc980 /* -11 */,
401					  0xc900 /* -10 */, 0xc880 /* -9 */,
402					  0xc800 /* -8 */, 0xc700 /* -7 */,
403					  0xc600 /* -6 */, 0xc500 /* -6 */,
404					  0xc400 /* -4 */, 0xc200 /* -3 */,
405					  0xc000 /* -2 */, 0xbc00 /* -1 */,
406					  0, 0x3c00 /* 1 */,
407					  0x4000 /* 2 */, 0x4200 /* 3 */,
408					  0x4400 /* 4 */, 0x4500 /* 5 */,
409					  0x4600 /* 6 */, 0x4700 /* 7 */};
410#endif
411PAD(buffer_vld3_pad, float, 16, 8);
412#endif
413
414/* Input buffers for vld4, 1 of each size */
415VECT_ARRAY_INIT4(buffer_vld4, int, 8, 8);
416PAD(buffer_vld4_pad, int, 8, 8);
417VECT_ARRAY_INIT4(buffer_vld4, int, 16, 4);
418PAD(buffer_vld4_pad, int, 16, 4);
419VECT_ARRAY_INIT4(buffer_vld4, int, 32, 2);
420PAD(buffer_vld4_pad, int, 32, 2);
421VECT_ARRAY_INIT4(buffer_vld4, int, 64, 1);
422PAD(buffer_vld4_pad, int, 64, 1);
423VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 8);
424PAD(buffer_vld4_pad, uint, 8, 8);
425VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 4);
426PAD(buffer_vld4_pad, uint, 16, 4);
427VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 2);
428PAD(buffer_vld4_pad, uint, 32, 2);
429VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 1);
430PAD(buffer_vld4_pad, uint, 64, 1);
431VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 8);
432PAD(buffer_vld4_pad, poly, 8, 8);
433VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
434PAD(buffer_vld4_pad, poly, 16, 4);
435VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2);
436PAD(buffer_vld4_pad, float, 32, 2);
437#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
438#ifdef __ARMCC_VERSION
439__fp16 buffer_vld4_float16x4x4[4*4] = {-16, -15, -14, -13, -12, -11, -10, -9,
440				       -8, -7, -6, -5, -4, -3, -2, -1};
441#else
442float16_t buffer_vld4_float16x4x4[4*4] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
443					  0xcb00 /* -14 */, 0xca80 /* -13 */,
444					  0xca00 /* -12 */, 0xc980 /* -11 */,
445					  0xc900 /* -10 */, 0xc880 /* -9 */,
446					  0xc800 /* -8 */, 0xc700 /* -7 */,
447					  0xc600 /* -6 */, 0xc500 /* -5 */,
448					  0xc400 /* -4 */, 0xc200 /* -3 */,
449					  0xc000 /* -2 */, 0xbc00 /* -1 */};
450#endif
451PAD(buffer_vld4_pad, float, 16, 4);
452#endif
453VECT_ARRAY_INIT4(buffer_vld4, int, 8, 16);
454PAD(buffer_vld4_pad, int, 8, 16);
455VECT_ARRAY_INIT4(buffer_vld4, int, 16, 8);
456PAD(buffer_vld4_pad, int, 16, 8);
457VECT_ARRAY_INIT4(buffer_vld4, int, 32, 4);
458PAD(buffer_vld4_pad, int, 32, 4);
459VECT_ARRAY_INIT4(buffer_vld4, int, 64, 2);
460PAD(buffer_vld4_pad, int, 64, 2);
461VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 16);
462PAD(buffer_vld4_pad, uint, 8, 16);
463VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 8);
464PAD(buffer_vld4_pad, uint, 16, 8);
465VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 4);
466PAD(buffer_vld4_pad, uint, 32, 4);
467VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 2);
468PAD(buffer_vld4_pad, uint, 64, 2);
469VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 16);
470PAD(buffer_vld4_pad, poly, 8, 16);
471VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
472PAD(buffer_vld4_pad, poly, 16, 8);
473VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4);
474PAD(buffer_vld4_pad, float, 32, 4);
475#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
476#ifdef __ARMCC_VERSION
477__fp16 buffer_vld4_float16x8x4[8*4] = {-16, -15, -14, -13, -12, -11, -10, -9,
478				       -8, -7, -6, -5, -4, -3, -2, -1,
479				       0, 1, 2, 3, 4, 5, 6, 7,
480				       8, 9, 10, 11, 12, 13, 14, 15};
481#else
482float16_t buffer_vld4_float16x8x4[8*4] = {0xcc00 /* -16 */, 0xcb80 /* -15 */,
483					  0xcb00 /* -14 */, 0xca80 /* -13 */,
484					  0xca00 /* -12 */, 0xc980 /* -11 */,
485					  0xc900 /* -10 */, 0xc880 /* -9 */,
486					  0xc800 /* -8 */, 0xc700 /* -7 */,
487					  0xc600 /* -6 */, 0xc500 /* -6 */,
488					  0xc400 /* -4 */, 0xc200 /* -3 */,
489					  0xc000 /* -2 */, 0xbc00 /* -1 */,
490					  0, 0x3c00 /* 1 */,
491					  0x4000 /* 2 */, 0x4200 /* 3 */,
492					  0x4400 /* 4 */, 0x4500 /* 5 */,
493					  0x4600 /* 6 */, 0x4700 /* 7 */,
494					  0x4800 /* 8 */, 0x4880 /* 9 */,
495					  0x4900 /* 10 */, 0x4980 /* 11 */,
496					  0x4a00 /* 12 */, 0x4a80 /* 13 */,
497					  0x4b00 /* 14 */, 0x04b80 /* 15 */};
498#endif
499PAD(buffer_vld4_pad, float, 16, 8);
500#endif
501
502/* Input buffers for vld2_lane */
503VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 8, 2);
504VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 16, 2);
505VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 32, 2);
506VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 64, 2);
507VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 8, 2);
508VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 16, 2);
509VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
510VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
511VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
512VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
513VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2);
514#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
515#ifdef __ARMCC_VERSION
516__fp16 buffer_vld2_lane_float16x2[2] = {-16, -15};
517#else
518VECT_VAR_DECL(buffer_vld2_lane, float, 16, 2) [] = {0xcc00 /* -16 */,
519						    0xcb80 /* -15 */};
520#endif
521#endif
522
523/* Input buffers for vld3_lane */
524VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 8, 3);
525VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 16, 3);
526VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 32, 3);
527VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 64, 3);
528VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 8, 3);
529VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 16, 3);
530VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
531VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
532VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
533VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
534VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3);
535#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
536#ifdef __ARMCC_VERSION
537__fp16 buffer_vld3_lane_float16x3[3] = {-16, -15, -14};
538#else
539VECT_VAR_DECL(buffer_vld3_lane, float, 16, 3) [] = {0xcc00 /* -16 */,
540						    0xcb80 /* -15 */,
541						    0xcb00 /* -14 */};
542#endif
543#endif
544
545/* Input buffers for vld4_lane */
546VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 8, 4);
547VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 16, 4);
548VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 32, 4);
549VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 64, 4);
550VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 8, 4);
551VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 16, 4);
552VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
553VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
554VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
555VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
556VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4);
557#if defined(__ARM_FP16_FORMAT_IEEE) && ( ((__ARM_FP & 0x2) != 0) || ((__ARM_NEON_FP16_INTRINSICS & 1) != 0) )
558#ifdef __ARMCC_VERSION
559__fp16 buffer_vld4_lane_float16x4[4] = {-16, -15, -14, -13};
560#else
561VECT_VAR_DECL(buffer_vld4_lane, float, 16, 4) [] = {0xcc00 /* -16 */,
562						    0xcb80 /* -15 */,
563						    0xcb00 /* -14 */,
564						    0xca80 /* -13 */};
565#endif
566#endif
567