macros_msa.h revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#ifndef VPX_DSP_MIPS_MACROS_MSA_H_
12#define VPX_DSP_MIPS_MACROS_MSA_H_
13
14#include <msa.h>
15
16#include "./vpx_config.h"
17#include "vpx/vpx_integer.h"
18
/* Description : Whole-vector load / store primitives
   Arguments   : LD_x - RTYPE, psrc      (yields one RTYPE read from 'psrc')
                 ST_x - RTYPE, in, pdst  (writes 'in' as one RTYPE to 'pdst')
   Details     : The B / H / W variants expand identically; the suffixed
                 aliases only fix RTYPE to a particular vector type.
                 Every expansion is wrapped in parentheses so it behaves as
                 a single expression when combined with other operators
                 (e.g. LD_B(T, p).field or ST_B(T, v, p) + 1).
*/
#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
39
40#if (__mips_isa_rev >= 6)
/* Description : Load one halfword from memory
   Arguments   : Input  - psrc
                 Return - uint16_t
   Details     : Issues a single 'lh' instruction whose memory operand is
                 the byte 'psrc' points at, and yields the destination
                 register as a uint16_t. Written as a GNU statement
                 expression so it can be used as a value.
*/
#define LH(psrc) \
  ({ \
    const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \
    uint16_t val_lh_m; \
    \
    __asm__ __volatile__("lh  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_lh_m) \
                         : [psrc_m] "m"(*psrc_lh_m)); \
    \
    val_lh_m; \
  })
54
/* Description : Load one word from memory
   Arguments   : Input  - psrc
                 Return - uint32_t
   Details     : Issues a single 'lw' instruction whose memory operand is
                 the byte 'psrc' points at, and yields the loaded value.
                 The locals carry an '_lw_m' suffix on purpose: a caller
                 that passes its own variable named 'psrc_m' (as the 32-bit
                 LD built on top of LW does) would otherwise expand to
                 'psrc_m = (...)(psrc_m)', i.e. the new local initialised
                 from itself instead of from the caller's pointer.
*/
#define LW(psrc) \
  ({ \
    const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
    uint32_t val_lw_m; \
    \
    __asm__ __volatile__("lw  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_lw_m) \
                         : [psrc_m] "m"(*psrc_lw_m)); \
    \
    val_lw_m; \
  })
68
69#if (__mips == 64)
/* Description : Load one double word from memory (64-bit build)
   Arguments   : Input  - psrc
                 Return - uint64_t
   Details     : Issues a single 'ld' instruction whose memory operand is
                 the byte 'psrc' points at, and yields the loaded value.
*/
#define LD(psrc) \
  ({ \
    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
    uint64_t val_ld_m = 0; \
    \
    __asm__ __volatile__("ld  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_ld_m) \
                         : [psrc_m] "m"(*psrc_ld_m)); \
    \
    val_ld_m; \
  })
83#else  // !(__mips == 64)
/* Description : Load one double word from memory (32-bit build)
   Arguments   : Input  - psrc
                 Return - uint64_t
   Details     : Performs two LW loads, at (psrc) and (psrc + 4). The word
                 from (psrc) becomes bits 0..31 of the result and the word
                 from (psrc + 4) becomes bits 32..63.
                 The pointer local is deliberately NOT called 'psrc_m':
                 LW declares a local of that name, so passing 'psrc_m' to
                 it would make LW initialise its pointer from itself and
                 read through an indeterminate address.
*/
#define LD(psrc) \
  ({ \
    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
    const uint32_t lo_ld_m = LW(psrc_ld_m); \
    const uint32_t hi_ld_m = LW(psrc_ld_m + 4); \
    \
    ((uint64_t)hi_ld_m << 32) | (uint64_t)lo_ld_m; \
  })
98#endif  // (__mips == 64)
99
/* Description : Store one halfword to memory
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint16_t and written with a single
                 'sh' instruction to the location 'pdst' points at.
                 Expands to a brace block, so no trailing ';' is required.
*/
#define SH(val, pdst) \
  { \
    uint8_t *pdst_sh_m = (uint8_t *)(pdst); \
    const uint16_t val_sh_m = (val); \
    \
    __asm__ __volatile__("sh  %[val_m],  %[pdst_m]  \n\t" \
                         : [pdst_m] "=m"(*pdst_sh_m) \
                         : [val_m] "r"(val_sh_m)); \
  }
111
/* Description : Store one word to memory
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint32_t and written with a single
                 'sw' instruction to the location 'pdst' points at.
                 Expands to a brace block, so no trailing ';' is required.
*/
#define SW(val, pdst) \
  { \
    uint8_t *pdst_sw_m = (uint8_t *)(pdst); \
    const uint32_t val_sw_m = (val); \
    \
    __asm__ __volatile__("sw  %[val_m],  %[pdst_m]  \n\t" \
                         : [pdst_m] "=m"(*pdst_sw_m) \
                         : [val_m] "r"(val_sw_m)); \
  }
123
/* Description : Store one double word to memory
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint64_t and written with a single
                 'sd' instruction to the location 'pdst' points at.
                 Expands to a brace block, so no trailing ';' is required.
*/
#define SD(val, pdst) \
  { \
    uint8_t *pdst_sd_m = (uint8_t *)(pdst); \
    const uint64_t val_sd_m = (val); \
    \
    __asm__ __volatile__("sd  %[val_m],  %[pdst_m]  \n\t" \
                         : [pdst_m] "=m"(*pdst_sd_m) \
                         : [val_m] "r"(val_sd_m)); \
  }
135#else  // !(__mips_isa_rev >= 6)
/* Description : Load one halfword from memory (pre-R6 variant)
   Arguments   : Input  - psrc
                 Return - uint16_t
   Details     : Uses the 'ulh' mnemonic (presumably the assembler's
                 unaligned-load form - confirm against the toolchain docs)
                 on the location 'psrc' points at and yields the result.
*/
#define LH(psrc) \
  ({ \
    const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \
    uint16_t val_lh_m; \
    \
    __asm__ __volatile__("ulh  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_lh_m) \
                         : [psrc_m] "m"(*psrc_lh_m)); \
    \
    val_lh_m; \
  })
149
/* Description : Load one word from memory (pre-R6 variant)
   Arguments   : Input  - psrc
                 Return - uint32_t
   Details     : Uses the 'ulw' mnemonic (presumably the assembler's
                 unaligned-load form - confirm against the toolchain docs)
                 on the location 'psrc' points at and yields the result.
*/
#define LW(psrc) \
  ({ \
    const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
    uint32_t val_lw_m; \
    \
    __asm__ __volatile__("ulw  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_lw_m) \
                         : [psrc_m] "m"(*psrc_lw_m)); \
    \
    val_lw_m; \
  })
163
164#if (__mips == 64)
/* Description : Load one double word from memory (pre-R6, 64-bit build)
   Arguments   : Input  - psrc
                 Return - uint64_t
   Details     : Uses the 'uld' mnemonic (presumably the assembler's
                 unaligned-load form - confirm against the toolchain docs)
                 on the location 'psrc' points at and yields the result.
*/
#define LD(psrc) \
  ({ \
    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
    uint64_t val_ld_m = 0; \
    \
    __asm__ __volatile__("uld  %[val_m],  %[psrc_m]  \n\t" \
                         : [val_m] "=r"(val_ld_m) \
                         : [psrc_m] "m"(*psrc_ld_m)); \
    \
    val_ld_m; \
  })
178#else  // !(__mips == 64)
/* Description : Load one double word from memory (pre-R6, 32-bit build)
   Arguments   : Input  - psrc
                 Return - uint64_t
   Details     : Performs two LW loads, at (psrc) and (psrc + 4). The word
                 from (psrc) becomes bits 0..31 of the result and the word
                 from (psrc + 4) becomes bits 32..63.
*/
#define LD(psrc) \
  ({ \
    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
    uint32_t lo_ld_m, hi_ld_m; \
    uint64_t val_ld_m; \
    \
    lo_ld_m = LW(psrc_ld_m); \
    hi_ld_m = LW(psrc_ld_m + 4); \
    \
    val_ld_m = ((uint64_t)hi_ld_m << 32) | (uint64_t)lo_ld_m; \
    val_ld_m; \
  })
193#endif  // (__mips == 64)
194
/* Description : Store one halfword to memory (pre-R6 variant)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint16_t and written to the location
                 'pdst' points at using the 'ush' mnemonic (presumably the
                 assembler's unaligned-store form - confirm).
                 Expands to a brace block, so no trailing ';' is required.
*/
#define SH(val, pdst) \
  { \
    uint8_t *pdst_sh_m = (uint8_t *)(pdst); \
    const uint16_t val_sh_m = (val); \
    \
    __asm__ __volatile__("ush  %[val_m],  %[pdst_m]  \n\t" \
                         : [pdst_m] "=m"(*pdst_sh_m) \
                         : [val_m] "r"(val_sh_m)); \
  }
206
/* Description : Store one word to memory (pre-R6 variant)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint32_t and written to the location
                 'pdst' points at using the 'usw' mnemonic (presumably the
                 assembler's unaligned-store form - confirm).
                 Expands to a brace block, so no trailing ';' is required.
*/
#define SW(val, pdst) \
  { \
    uint8_t *pdst_sw_m = (uint8_t *)(pdst); \
    const uint32_t val_sw_m = (val); \
    \
    __asm__ __volatile__("usw  %[val_m],  %[pdst_m]  \n\t" \
                         : [pdst_m] "=m"(*pdst_sw_m) \
                         : [val_m] "r"(val_sw_m)); \
  }
218
/* Description : Store one double word to memory (pre-R6 variant)
   Arguments   : Inputs - val, pdst
   Details     : Bits 0..31 of 'val' are stored with SW to (pdst) and bits
                 32..63 are stored with SW to (pdst + 4).
                 'val' is captured once in a uint64_t local before it is
                 split, so an argument with side effects is evaluated a
                 single time and a 32-bit argument is widened before the
                 '>> 32' (shifting a 32-bit value by 32 is undefined).
                 Expands to a brace block, so no trailing ';' is required.
*/
#define SD(val, pdst) \
  { \
    uint8_t *pdst_sd_m = (uint8_t *)(pdst); \
    const uint64_t val_sd_m = (val); \
    const uint32_t lo_sd_m = (uint32_t)(val_sd_m & 0xFFFFFFFF); \
    const uint32_t hi_sd_m = (uint32_t)(val_sd_m >> 32); \
    \
    SW(lo_sd_m, pdst_sd_m); \
    SW(hi_sd_m, pdst_sd_m + 4); \
  }
229#endif  // (__mips_isa_rev >= 6)
230
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized at every use so that an
                 expression argument (e.g. a + b) is scaled as a whole.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
  { \
    out0 = LW((psrc)); \
    out1 = LW((psrc) + (stride)); \
    out2 = LW((psrc) + 2 * (stride)); \
    out3 = LW((psrc) + 3 * (stride)); \
  }
245
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD2) / out0..out3 (LD4)
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 LD4 continues with 'out2' from (psrc + 2 * stride) and
                 'out3' from (psrc + 3 * stride).
                 'stride' is parenthesized at every use so that an
                 expression argument is added / scaled as a whole.
*/
#define LD2(psrc, stride, out0, out1) \
  { \
    out0 = LD((psrc)); \
    out1 = LD((psrc) + (stride)); \
  }
#define LD4(psrc, stride, out0, out1, out2, out3) \
  { \
    LD2((psrc), (stride), out0, out1); \
    LD2((psrc) + 2 * (stride), (stride), out2, out3); \
  }
260
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized at every use so that an
                 expression argument is scaled as a whole.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
  { \
    SW(in0, (pdst)); \
    SW(in1, (pdst) + (stride)); \
    SW(in2, (pdst) + 2 * (stride)); \
    SW(in3, (pdst) + 3 * (stride)); \
  }
274
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized at every use so that an
                 expression argument is scaled as a whole.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
  { \
    SD(in0, (pdst)); \
    SD(in1, (pdst) + (stride)); \
    SD(in2, (pdst) + 2 * (stride)); \
    SD(in3, (pdst) + 3 * (stride)); \
  }
288
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, ... (2, 3, 4, 5, 7 or 8 outputs)
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 In general 'outN' is loaded from (psrc + N * stride).
                 'stride' is parenthesized at every use so that an
                 expression argument is scaled as a whole.
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  { \
    out0 = LD_B(RTYPE, (psrc)); \
    out1 = LD_B(RTYPE, (psrc) + (stride)); \
  }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
  { \
    LD_B2(RTYPE, (psrc), (stride), out0, out1); \
    out2 = LD_B(RTYPE, (psrc) + 2 * (stride)); \
  }
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
  { \
    LD_B2(RTYPE, (psrc), (stride), out0, out1); \
    LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
  { \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
    out4 = LD_B(RTYPE, (psrc) + 4 * (stride)); \
  }
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
  { \
    LD_B5(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4); \
    LD_B2(RTYPE, (psrc) + 5 * (stride), (stride), out5, out6); \
  }
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7) \
  { \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
    LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
  }
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
337
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, ... (2, 4, 8 or 16 outputs)
                 Return Type - as per RTYPE
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
                 In general 'outN' is loaded from (psrc + N * stride).
                 'stride' is parenthesized at every use so that an
                 expression argument is scaled as a whole.
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) \
  { \
    out0 = LD_H(RTYPE, (psrc)); \
    out1 = LD_H(RTYPE, (psrc) + (stride)); \
  }
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
  { \
    LD_H2(RTYPE, (psrc), (stride), out0, out1); \
    LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  }
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7) \
  { \
    LD_H4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
    LD_H4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
  }
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, \
               out6, out7, out8, out9, out10, out11, out12, out13, out14, \
               out15) \
  { \
    LD_H8(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4, out5, \
          out6, out7); \
    LD_H8(RTYPE, (psrc) + 8 * (stride), (stride), out8, out9, out10, out11, \
          out12, out13, out14, out15); \
  }
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
372
/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Input   - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8).
                 'out1' and 'out3' are produced by __msa_ilvl_d with
                 'out0' (resp. 'out2') passed as both operands.
                 'psrc' is parenthesized so that an expression argument is
                 offset as a whole.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3) \
  { \
    out0 = LD_SH((psrc)); \
    out2 = LD_SH((psrc) + 8); \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
  }
384
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : 'out0' is loaded from (psrc) and 'out1' from
                 (psrc + stride). 'stride' is parenthesized so that an
                 expression argument is added as a whole.
*/
#define LD_SW2(psrc, stride, out0, out1) \
  { \
    out0 = LD_SW((psrc)); \
    out1 = LD_SW((psrc) + (stride)); \
  }
394
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, ... (2, 4 or 8 inputs), pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 In general 'inN' is stored to (pdst + N * stride).
                 'stride' is parenthesized at every use so that an
                 expression argument is scaled as a whole.
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  { \
    ST_B(RTYPE, in0, (pdst)); \
    ST_B(RTYPE, in1, (pdst) + (stride)); \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
  { \
    ST_B2(RTYPE, in0, in1, (pdst), (stride)); \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  { \
    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
  }
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
418
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, ... (2, 4 or 8 inputs), pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
                 In general 'inN' is stored to (pdst + N * stride).
                 'stride' is parenthesized at every use so that an
                 expression argument is scaled as a whole.
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  { \
    ST_H(RTYPE, in0, (pdst)); \
    ST_H(RTYPE, in1, (pdst) + (stride)); \
  }
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
  { \
    ST_H2(RTYPE, in0, in1, (pdst), (stride)); \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  }
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  { \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
  }
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
441
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized so that an expression argument
                 is added as a whole.
*/
#define ST_SW2(in0, in1, pdst, stride) \
  { \
    ST_SW(in0, (pdst)); \
    ST_SW(in1, (pdst) + (stride)); \
  }
451
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'in', 'stidx' and 'stride' are parenthesized at every use so
                 that expression arguments are cast / offset / scaled as a
                 whole.
*/
#define ST2x4_UB(in, stidx, pdst, stride) \
  { \
    uint16_t out0_m, out1_m, out2_m, out3_m; \
    uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
    \
    out0_m = __msa_copy_u_h((v8i16)(in), (stidx)); \
    out1_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 1)); \
    out2_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 2)); \
    out3_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 3)); \
    \
    SH(out0_m, pblk_2x4_m); \
    SH(out1_m, pblk_2x4_m + (stride)); \
    SH(out2_m, pblk_2x4_m + 2 * (stride)); \
    SH(out3_m, pblk_2x4_m + 3 * (stride)); \
  }
477
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to the GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to the GP
                 register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized so that expression
                 arguments are cast / added as a whole.
*/
#define ST4x2_UB(in, pdst, stride) \
  { \
    uint32_t out0_m, out1_m; \
    uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
    \
    out0_m = __msa_copy_u_w((v4i32)(in), 0); \
    out1_m = __msa_copy_u_w((v4i32)(in), 1); \
    \
    SW(out0_m, pblk_4x2_m); \
    SW(out1_m, pblk_4x2_m + (stride)); \
  }
495
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 ST4x8_UB stores word elements 0..3 of 'in0' to rows 0..3 and
                 word elements 0..3 of 'in1' to rows 4..7, rows being
                 'stride' bytes apart starting at (pdst).
                 Vector, index and stride arguments are parenthesized so that
                 expression arguments are used as a whole.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
  { \
    uint32_t out0_m, out1_m, out2_m, out3_m; \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
    \
    out0_m = __msa_copy_u_w((v4i32)(in0), (idx0)); \
    out1_m = __msa_copy_u_w((v4i32)(in0), (idx1)); \
    out2_m = __msa_copy_u_w((v4i32)(in1), (idx2)); \
    out3_m = __msa_copy_u_w((v4i32)(in1), (idx3)); \
    \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride)); \
  }
#define ST4x8_UB(in0, in1, pdst, stride) \
  { \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
    \
    ST4x4_UB((in0), (in0), 0, 1, 2, 3, pblk_4x8, (stride)); \
    ST4x4_UB((in1), (in1), 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride)); \
  }
524
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' and 'pdst' are parenthesized so that expression
                 arguments are used as a whole.
*/
#define ST8x1_UB(in, pdst) \
  { \
    uint64_t out0_m; \
    \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    SD(out0_m, (pdst)); \
  }
536
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized so that expression
                 arguments are cast / added as a whole.
*/
#define ST8x2_UB(in, pdst, stride) \
  { \
    uint64_t out0_m, out1_m; \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
    \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    out1_m = __msa_copy_u_d((v2i64)(in), 1); \
    \
    SD(out0_m, pblk_8x2_m); \
    SD(out1_m, pblk_8x2_m + (stride)); \
  }
554
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'in0', 'in1' and 'stride' are parenthesized so that
                 expression arguments are used as a whole.
*/
#define ST8x4_UB(in0, in1, pdst, stride) \
  { \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
    \
    out0_m = __msa_copy_u_d((v2i64)(in0), 0); \
    out1_m = __msa_copy_u_d((v2i64)(in0), 1); \
    out2_m = __msa_copy_u_d((v2i64)(in1), 0); \
    out3_m = __msa_copy_u_d((v2i64)(in1), 1); \
    \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
  }
578
/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3 (AVER_UB4: in0 .. in7)
                 Outputs - out0, out1         (AVER_UB4: out0 .. out3)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' vector is added with
                 each unsigned byte element from 'in1' vector. Then the average
                 with rounding is calculated and written to 'out0'.
                 Likewise 'out1' from ('in2', 'in3'), and for AVER_UB4
                 'out2' from ('in4', 'in5') and 'out3' from ('in6', 'in7').
                 Inputs are parenthesized so the cast applies to the whole
                 argument expression.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1)); \
    out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3)); \
  }
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3) \
  { \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1); \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
599
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1 (SLDI_B4_0: in0 .. in3), slide_val
                 Outputs - out0, out1 (SLDI_B4_0: out0 .. out3)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val'; the same is done for
                 each remaining input / output pair.
                 Inputs are parenthesized so the cast applies to the whole
                 argument expression.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
  { \
    v16i8 zero_m = { 0 }; \
    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), (slide_val)); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), (slide_val)); \
  }
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
                  slide_val) \
  { \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
  }
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
620
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                           (SLDI_B3 additionally takes in0_2, in1_2)
                 Outputs - out0, out1 (SLDI_B3: out0, out1, out2)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val'; 'outN' is produced the
                 same way from the pair ('in0_N', 'in1_N').
                 Inputs are parenthesized so the cast applies to the whole
                 argument expression.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
  { \
    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val)); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val)); \
  }
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val) \
  { \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val); \
    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
  }
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
642
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                           (VSHF_B4: in0, in1, mask0 .. mask3)
                 Outputs - out0, out1 (VSHF_B4: out0 .. out3)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'; 'out1' is built from
                 'in2' & 'in3' as per 'mask1'.
                 VSHF_B4 shuffles the same pair 'in0' & 'in1' four times,
                 once per mask, into 'out0' .. 'out3'.
                 Inputs are parenthesized so the cast applies to the whole
                 argument expression.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, \
                out2, out3) \
  { \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
  }
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
665
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                           (DOTP_UB4: mult0 .. mult3, cnst0 .. cnst3)
                 Outputs - out0, out1 (DOTP_UB4: out0 .. out3)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'outN' is produced the same way from ('multN', 'cnstN').
                 Inputs are parenthesized so the cast applies to the whole
                 argument expression.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3) \
  { \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
689
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'out1' is computed the same way from 'mult1' and 'cnst1'.
                 DOTP_SB4 applies the operation to four mult/cnst pairs.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0));    \
  out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1));    \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
  DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
  DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
712
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'out1' is computed the same way from 'mult1' and 'cnst1'.
                 DOTP_SH4 applies the operation to four mult/cnst pairs.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0));    \
  out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1));    \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,         \
                 cnst0, cnst1, cnst2, cnst3,                \
                 out0, out1, out2, out3) {                  \
  DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
  DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
736
/* Description : Dot product of word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'out1' is computed the same way from 'mult1' and 'cnst1'.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0));    \
  out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1));    \
}
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
752
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'out1' accumulates 'mult1' x 'cnst1' the same way.
                 DPADD_SB4 applies the operation to four mult/cnst pairs.
                 Operands are parenthesized before casting so that
                 expression arguments are cast as a whole.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0),      \
                                (v16i8)(cnst0));                    \
  out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1),      \
                                (v16i8)(cnst1));                    \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
  DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
  DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
775
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'out1' accumulates 'mult1' x 'cnst1' the same way.
                 Operands are parenthesized before casting so that
                 expression arguments are cast as a whole.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0),      \
                                (v8i16)(cnst0));                    \
  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1),      \
                                (v8i16)(cnst1));                    \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
791
/* Description : Dot product & addition of word vector elements, each input
                 vector being multiplied with itself
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with itself
                 producing an intermediate result twice the size of input
                 i.e. signed double word
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'out1' accumulates 'mult1' the same way.
                 Note: 'mult0' and 'mult1' are each expanded twice, so pass
                 expressions without side effects.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) {            \
  out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0),  \
                                (v4i32)(mult0));                \
  out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1),  \
                                (v4i32)(mult1));                \
}
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
807
/* Description : Element-wise unsigned halfword minimum against a common
                 vector, computed in place
   Arguments   : Inputs  - in0, in1 (in2, in3 for MIN_UH4), min_vec
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each of 'in0', 'in1' (and 'in2', 'in3') is replaced by the
                 result of __msa_min_u_h applied to itself and 'min_vec',
                 cast to RTYPE.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)             \
  {                                                   \
    in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \
    in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);  \
  }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)   \
  {                                                   \
    in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \
    in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);  \
    in2 = (RTYPE)__msa_min_u_h((v8u16)in2, min_vec);  \
    in3 = (RTYPE)__msa_min_u_h((v8u16)in3, min_vec);  \
  }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
827
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
   Details     : 'in' is first passed through __msa_maxi_s_h with 0 and the
                 result through __msa_min_s_h with a vector of 255.
                 CLIP_SH2_0_255 / CLIP_SH4_0_255 clip two / four vectors
                 in place.
                 'in' is parenthesized before casting so that an expression
                 argument is cast as a whole.
*/
#define CLIP_SH_0_255(in) ({                          \
  v8i16 max_m = __msa_ldi_h(255);                     \
  v8i16 out_m;                                        \
                                                      \
  out_m = __msa_maxi_s_h((v8i16)(in), 0);             \
  out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
  out_m;                                              \
})
#define CLIP_SH2_0_255(in0, in1) {  \
  in0 = CLIP_SH_0_255(in0);         \
  in1 = CLIP_SH_0_255(in1);         \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3) {  \
  CLIP_SH2_0_255(in0, in1);                   \
  CLIP_SH2_0_255(in2, in3);                   \
}
850
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 'in' is evaluated exactly once (captured in a local), so
                 expression arguments are safe.
*/
#define HADD_SW_S32(in) ({                           \
  const v4i32 hadd_in_m = (v4i32)(in);               \
  v2i64 res0_m, res1_m;                              \
  int32_t sum_m;                                     \
                                                     \
  res0_m = __msa_hadd_s_d(hadd_in_m, hadd_in_m);     \
  res1_m = __msa_splati_d(res0_m, 1);                \
  res0_m = res0_m + res1_m;                          \
  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);          \
  sum_m;                                             \
})
868
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned.
                 'in' is evaluated exactly once (captured in a local), so
                 expression arguments are safe.
*/
#define HADD_UH_U32(in) ({                           \
  const v8u16 hadd_in_m = (v8u16)(in);               \
  v4u32 res_m;                                       \
  v2u64 res0_m, res1_m;                              \
  uint32_t sum_m;                                    \
                                                     \
  res_m = __msa_hadd_u_w(hadd_in_m, hadd_in_m);      \
  res0_m = __msa_hadd_u_d(res_m, res_m);             \
  res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);  \
  res0_m = res0_m + res1_m;                          \
  sum_m = __msa_copy_u_w((v4i32)res0_m, 0);          \
  sum_m;                                             \
})
888
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'.
                 'out1' is computed the same way from 'in1'.
                 HADD_UB4 applies the operation to four vectors.
                 Note: each input is expanded twice, so pass expressions
                 without side effects.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) {              \
  out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0));  \
  out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1));  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
  HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
  HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
}
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
908
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Adjacent odd/even unsigned byte elements of 'in0' are
                 subtracted pairwise by __msa_hsub_u_h and the halfword
                 result is written to 'out0'; 'out1' is computed the same
                 way from 'in1'.
                 NOTE(review): the previous wording said the odd element is
                 subtracted from the even one; the MSA HSUB_U definition
                 appears to be odd minus even - confirm against the ISA
                 manual.
                 Note: each input is expanded twice, so pass expressions
                 without side effects.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) {              \
  out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));  \
  out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));  \
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
922
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m                 (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0'. Then even-odd
                 pairs are added together to generate 8 halfword results.
                 The same is done for 'in1'/'ref1' and both sets of halfword
                 results are accumulated into the returned vector.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) ({                 \
  v16u8 diff0_m, diff1_m;                                   \
  v8u16 sad_m = { 0 };                                      \
                                                            \
  diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0));    \
  diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1));    \
                                                            \
  sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m);  \
  sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m);  \
                                                            \
  sad_m;                                                    \
})
943
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Adjacent odd/even halfword elements of 'in0' are subtracted
                 pairwise by __msa_hsub_s_w and the word result is written
                 to 'out0'; 'out1' is computed the same way from 'in1'.
                 Despite the 'UH' in the macro name, the inputs are cast to
                 signed halfword and the signed intrinsic is used.
                 Note: each input is expanded twice, so pass expressions
                 without side effects.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1) {              \
  out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0));  \
  out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1));  \
}
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
957
/* Description : Set consecutive elements of a vector from GPR values
   Arguments   : Inputs - in0, in1 (plus in2, in3 for INSERT_W4)
                 Output - out (updated in place)
                 Return Type - as per RTYPE
   Details     : INSERT_W2 writes 'in0' and 'in1' to word elements 0 and 1
                 of 'out'; INSERT_W4 writes 'in0'..'in3' to word elements
                 0..3; INSERT_D2 writes 'in0' and 'in1' to double word
                 elements 0 and 1.
*/
#define INSERT_W2(RTYPE, in0, in1, out)               \
  {                                                   \
    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
  }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)     \
  {                                                   \
    INSERT_W2(RTYPE, in0, in1, out);                  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);  \
    out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);  \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

#define INSERT_D2(RTYPE, in0, in1, out)               \
  {                                                   \
    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
985
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'. The intrinsic receives the pair as (in1, in0).
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0));  \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2));  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
999
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'. The intrinsic receives the pair as (in1, in0).
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0));  \
  out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2));  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1014
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'. The intrinsic receives the pair as (in1, in0).
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0));  \
  out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2));  \
}
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1027
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'. The intrinsic receives the pair as (in1, in0).
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0));  \
  out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2));  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1040
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'. ILVL_B4 processes four input pairs.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3));  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1064
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3'
                 likewise produce 'out1'.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3));  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
1077
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' likewise produce
                 'out1'.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3));  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1091
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to out0; 'in2' and 'in3' likewise produce
                 'out1'. ILVR_B4 and ILVR_B8 process four and eight input
                 pairs respectively.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3));  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                in8, in9, in10, in11, in12, in13, in14, in15,      \
                out0, out1, out2, out3, out4, out5, out6, out7) {  \
  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,           \
          out0, out1, out2, out3);                                 \
  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,     \
          out4, out5, out6, out7);                                 \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1127
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3'
                 likewise produce 'out1'. ILVR_H4 processes four pairs.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3));  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1147
/* ILVR_W2 / ILVR_W4 : word-element counterparts of ILVR_H2 / ILVR_H4.
   'out0' receives __msa_ilvr_w applied to ('in0', 'in1') and 'out1' the
   same for ('in2', 'in3'); ILVR_W4 processes four input pairs.
   Inputs are parenthesized before casting so that expression arguments
   are cast as a whole.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1));  \
  out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3));  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1161
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (up to in7 for ILVR_D4)
                 Outputs - out0, out1 (up to out3 for ILVR_D4)
                 Return Type - as per RTYPE
   Details     : For every consecutive input pair, the right half double word
                 elements are interleaved with __msa_ilvr_d and written to
                 the matching output: ('in0', 'in1') -> 'out0',
                 ('in2', 'in3') -> 'out1', and so on.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                          \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
  {                                                                     \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));             \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));             \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));             \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
                out0, out1, out2, out3)                               \
  {                                                                   \
    ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2);   \
    out3 = (RTYPE)__msa_ilvr_d((v2i64)(in6), (v2i64)(in7));           \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1190
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; the left half is
                 interleaved and written to 'out1'.
                 ILVRL_H2 and ILVRL_W2 do the same on halfword and word
                 elements.
                 Note: each input is expanded twice, so pass expressions
                 without side effects. Inputs are parenthesized before
                 casting so that expression arguments are cast as a whole.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) {            \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1) {            \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
}
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1) {            \
  out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1));  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
}
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1220
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place; 'in1' (and 'in2', 'in3'
                 for SAT_UH4) are processed the same way.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val) {         \
  in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
  in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
  SAT_UH2(RTYPE, in0, in1, sat_val);                   \
  SAT_UH2(RTYPE, in2, in3, sat_val);                   \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1242
/* Description : Saturate the halfword element values to the
                 signed range of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 signed value range representable in (sat_val + 1) bits
                 The results are written in place
*/
1252*/
/* In-place signed halfword saturation via __msa_sat_s_h: SAT_SH2 updates
   two vectors, SAT_SH4 updates four, all with the same 'sat_val'. */
#define SAT_SH2(RTYPE, in0, in1, sat_val)             \
  {                                                   \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val);  \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)   \
  {                                                   \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val);  \
    in2 = (RTYPE)__msa_sat_s_h((v8i16)in2, sat_val);  \
    in3 = (RTYPE)__msa_sat_s_h((v8i16)in3, sat_val);  \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1264
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector; 'idx1' likewise fills 'out1'.
                  SPLATI_H4 does the same for four indices.
                  Valid index range for halfword operation is 0-7
                  Note: 'in' is expanded once per output, so pass an
                  expression without side effects. It is parenthesized
                  before casting so that an expression is cast as a whole.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) {  \
  out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0);      \
  out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1);      \
}
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3) {           \
  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);       \
  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);       \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1287
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'. 'out1' is packed the same way from 'in2'
                 and 'in3'. PCKEV_B4 processes four input pairs.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3));  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1312
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'. 'out1' is packed the same way from
                 'in2' and 'in3'. PCKEV_H4 processes four input pairs.
                 Inputs are parenthesized before casting so that expression
                 arguments are cast as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3));  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1334
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left half
                 of 'out0' & even double word elements of 'in1' are copied to
                 the right half of 'out0'. 'out1' is built the same way from
                 'in2' and 'in3'.
                 Each input is parenthesized before it is cast so that an
                 expression argument is converted as a whole.
                 PCKEV_D4 applies PCKEV_D2 to two groups of four inputs.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1));  \
  out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3));  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1356
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1 (in0 .. in2, in0 .. in3 or in0 .. in6 for
                           the _B3, _B4 and _B7 variants)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each byte element of every input vector is xor'ed with 128
                 and the result is stored back into the same vector.
                 The vector is parenthesized before it is cast so that the
                 whole argument expression is converted.
*/
#define XORI_B2_128(RTYPE, in0, in1) {           \
  in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128);  \
  in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2) {      \
  XORI_B2_128(RTYPE, in0, in1);                  \
  in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3) {  \
  XORI_B2_128(RTYPE, in0, in1);                   \
  XORI_B2_128(RTYPE, in2, in3);                   \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) {  \
  XORI_B4_128(RTYPE, in0, in1, in2, in3);                        \
  XORI_B3_128(RTYPE, in4, in5, in6);                             \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1389
/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'. The pairs ('in2', 'in3'),
                 ('in4', 'in5') and ('in6', 'in7') produce 'out1', 'out2' and
                 'out3' in the same way.
                 Each input is parenthesized before it is cast so that an
                 expression argument is converted as a whole.
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  out0 = (RTYPE)__msa_ave_s_h((v8i16)(in0), (v8i16)(in1));      \
  out1 = (RTYPE)__msa_ave_s_h((v8i16)(in2), (v8i16)(in3));      \
  out2 = (RTYPE)__msa_ave_s_h((v8i16)(in4), (v8i16)(in5));      \
  out3 = (RTYPE)__msa_ave_s_h((v8i16)(in6), (v8i16)(in7));      \
}
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
1407
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between halfword data type range and written to 'out0'.
                 'out1' is produced the same way from 'in2' and 'in3'.
                 Each input is parenthesized before it is cast so that an
                 expression argument is converted as a whole.
                 ADDS_SH4 applies ADDS_SH2 to two groups of four inputs.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) {    \
  out0 = (RTYPE)__msa_adds_s_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_adds_s_h((v8i16)(in2), (v8i16)(in3));  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1428
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in-place; 'in1', 'in2' and 'in3' are
                 handled the same way.
                 'shift' is parenthesized so that an expression such as
                 'a | b' is used as the complete shift amount instead of being
                 split by operator precedence.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) {  \
  in0 = (in0) << (shift);                     \
  in1 = (in1) << (shift);                     \
  in2 = (in2) << (shift);                     \
  in3 = (in3) << (shift);                     \
}
1442
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place; 'in1', 'in2' and 'in3' are
                 handled the same way. 'shift' is a GP variable.
                 'shift' is parenthesized so that an expression such as
                 'a | b' is used as the complete shift amount.
                 NOTE(review): the plain C '>>' operator is used, so the shift
                 is presumably arithmetic only for signed element types -
                 confirm no caller passes unsigned vectors.
*/
#define SRA_4V(in0, in1, in2, in3, shift) {  \
  in0 = (in0) >> (shift);                    \
  in1 = (in1) >> (shift);                    \
  in2 = (in2) >> (shift);                    \
  in3 = (in3) >> (shift);                    \
}
1457
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the number of bits in the corresponding element in the vector
                 'shift'. The last discarded bit is added to shifted value for
                 rounding and the result is written in-place. 'in1' is handled
                 the same way.
                 'shift' is a vector.
                 Arguments are parenthesized before they are cast so that an
                 expression argument is converted as a whole.
                 SRAR_W4 applies SRAR_W2 to ('in0', 'in1') and ('in2', 'in3').
*/
#define SRAR_W2(RTYPE, in0, in1, shift) {                   \
  in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift));  \
  in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift));  \
}

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) {  \
  SRAR_W2(RTYPE, in0, in1, shift);                   \
  SRAR_W2(RTYPE, in2, in3, shift);                   \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1478
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the value in 'shift'. The last discarded bit is added to the
                 shifted value for rounding and the result is written in-place.
                 'in1' is handled the same way.
                 'shift' is an immediate value.
                 The _H variants operate on halfword elements, the _W variants
                 on word elements; the *4 variants apply the *2 variant to
                 ('in0', 'in1') and ('in2', 'in3').
                 Vector arguments are parenthesized before they are cast so
                 that an expression argument is converted as a whole.
*/
#define SRARI_H2(RTYPE, in0, in1, shift) {            \
  in0 = (RTYPE)__msa_srari_h((v8i16)(in0), (shift));  \
  in1 = (RTYPE)__msa_srari_h((v8i16)(in1), (shift));  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) {  \
  SRARI_H2(RTYPE, in0, in1, shift);                   \
  SRARI_H2(RTYPE, in2, in3, shift);                   \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

#define SRARI_W2(RTYPE, in0, in1, shift) {            \
  in0 = (RTYPE)__msa_srari_w((v4i32)(in0), (shift));  \
  in1 = (RTYPE)__msa_srari_w((v4i32)(in1), (shift));  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) {  \
  SRARI_W2(RTYPE, in0, in1, shift);                   \
  SRARI_W2(RTYPE, in2, in3, shift);                   \
}
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1513
/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written to 'out0' (the inputs are not modified
                 unless an output names the same variable). 'in1', 'in2' and
                 'in3' produce 'out1', 'out2' and 'out3' the same way.
                 'shift' is an immediate value.
                 Each input is parenthesized before it is cast so that an
                 expression argument is converted as a whole.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) {  \
  out0 = (RTYPE)__msa_srli_h((v8i16)(in0), (shift));                         \
  out1 = (RTYPE)__msa_srli_h((v8i16)(in1), (shift));                         \
  out2 = (RTYPE)__msa_srli_h((v8i16)(in2), (shift));                         \
  out3 = (RTYPE)__msa_srli_h((v8i16)(in3), (shift));                         \
}
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
1528
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'; 'in2' * 'in3' is written
                 to 'out1'.
                 Operands are parenthesized so that an expression argument
                 (e.g. 'a + 1') is multiplied as a whole.
                 MUL4 applies MUL2 to two groups of four inputs.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) * (in1);                         \
  out1 = (in2) * (in3);                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  MUL2(in0, in1, in2, in3, out0, out1);               \
  MUL2(in4, in5, in6, in7, out2, out3);               \
}
1544
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'; 'in2' + 'in3' is written to 'out1'.
                 Operands are parenthesized so that an expression argument
                 built from lower-precedence operators (e.g. 'b << 1') is
                 added as a whole.
                 ADD4 applies ADD2 to two groups of four inputs.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) + (in1);                         \
  out1 = (in2) + (in3);                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  ADD2(in0, in1, in2, in3, out0, out1);               \
  ADD2(in4, in5, in6, in7, out2, out3);               \
}
1560
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'; 'in2' - 'in3' is written to 'out1'.
                 Operands are parenthesized so that an expression argument
                 (e.g. 'b - c' as the subtrahend) is subtracted as a whole.
                 SUB4 applies SUB2 to two groups of four inputs.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) - (in1);                         \
  out1 = (in2) - (in3);                         \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  SUB2(in0, in1, in2, in3, out0, out1);               \
  SUB2(in4, in5, in6, in7, out2, out3);               \
}
1578
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 4 word elements keeping sign intact.
                 'in' is parenthesized before it is cast so that an expression
                 argument is converted as a whole; note that 'in' is expanded
                 twice.
*/
#define UNPCK_R_SH_SW(in, out) {                   \
  v8i16 sign_m;                                    \
                                                   \
  /* all-ones lanes where the halfword is < 0 */   \
  sign_m = __msa_clti_s_h((v8i16)(in), 0);         \
  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));  \
}
1593
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (zero extended halfword vectors)
                 Return Type - signed halfword
   Details     : 'in' is interleaved with an all-zero byte vector through
                 ILVRL_B2_SH.
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1) {   \
  const v16i8 zero_m = { 0 };           \
                                        \
  ILVRL_B2_SH(zero_m, in, out0, out1);  \
}
1606
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
                 'in' is parenthesized before it is cast so that an expression
                 argument is converted as a whole.
*/
#define UNPCK_SH_SW(in, out0, out1) {      \
  v8i16 tmp_m;                             \
                                           \
  tmp_m = __msa_clti_s_h((v8i16)(in), 0);  \
  ILVRL_H2_SW(tmp_m, in, out0, out1);      \
}
1624
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation:
                   out0 = in0 + in3, out1 = in1 + in2,
                   out2 = in1 - in2, out3 = in0 - in3.
                 Operands are parenthesized so that expression arguments are
                 combined as a whole. Outputs are assigned in the order
                 out0 .. out3 while inputs are still being read, so an output
                 must not name an input that a later line still needs.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  out0 = (in0) + (in3);                                            \
  out1 = (in1) + (in2);                                            \
                                                                   \
  out2 = (in1) - (in2);                                            \
  out3 = (in0) - (in3);                                            \
}
1637
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation: for k = 0 .. 3,
                   out(k)     = in(k) + in(7 - k)
                   out(7 - k) = in(k) - in(7 - k)
                 Operands are parenthesized so that expression arguments are
                 combined as a whole. Outputs are assigned in the order
                 out0 .. out7 while inputs are still being read, so an output
                 must not name an input that a later line still needs.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,            \
                    out0, out1, out2, out3, out4, out5, out6, out7) {  \
  out0 = (in0) + (in7);                                                \
  out1 = (in1) + (in6);                                                \
  out2 = (in2) + (in5);                                                \
  out3 = (in3) + (in4);                                                \
                                                                       \
  out4 = (in3) - (in4);                                                \
  out5 = (in2) - (in5);                                                \
  out6 = (in1) - (in6);                                                \
  out7 = (in0) - (in7);                                                \
}
1655
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation: for k = 0 .. 7,
                   out(k)      = in(k) + in(15 - k)
                   out(15 - k) = in(k) - in(15 - k)
                 Operands are parenthesized so that expression arguments are
                 combined as a whole. Outputs are assigned in the order
                 out0 .. out15 while inputs are still being read, so an output
                 must not name an input that a later line still needs.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                  \
                     in8, in9,  in10, in11, in12, in13, in14, in15,           \
                     out0, out1, out2, out3, out4, out5, out6, out7,          \
                     out8, out9, out10, out11, out12, out13, out14, out15) {  \
  out0 = (in0) + (in15);                                                      \
  out1 = (in1) + (in14);                                                      \
  out2 = (in2) + (in13);                                                      \
  out3 = (in3) + (in12);                                                      \
  out4 = (in4) + (in11);                                                      \
  out5 = (in5) + (in10);                                                      \
  out6 = (in6) + (in9);                                                       \
  out7 = (in7) + (in8);                                                       \
                                                                              \
  out8 = (in7) - (in8);                                                       \
  out9 = (in6) - (in9);                                                       \
  out10 = (in5) - (in10);                                                     \
  out11 = (in4) - (in11);                                                     \
  out12 = (in3) - (in12);                                                     \
  out13 = (in2) - (in13);                                                     \
  out14 = (in1) - (in14);                                                     \
  out15 = (in0) - (in15);                                                     \
}
1683
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : All inputs are consumed by the first interleave step into
                 local temporaries. Even-numbered outputs come from the word
                 interleaves; each odd-numbered output is derived from the
                 preceding even-numbered output through SLDI_B2_0 with a slide
                 value of 8.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m;    \
                                                                           \
  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                       \
             tmp0_m, tmp1_m, tmp2_m, tmp3_m);                              \
  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                             \
  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                             \
                                                                           \
  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                             \
  SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                             \
                                                                           \
  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                             \
  SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                             \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1704
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : The outputs double as scratch storage: out7 .. out0 are
                 written by the first four interleaves, before all inputs
                 have been read, and out5 / out7 are overwritten again before
                 the final values are produced. Statement order therefore
                 matters, and outputs should not name the same variables as
                 inputs.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                            in8, in9, in10, in11, in12, in13, in14, in15,      \
                            out0, out1, out2, out3, out4, out5, out6, out7) {  \
  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                        \
  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                        \
                                                                               \
  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                                 \
  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                               \
  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                               \
  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                               \
                                                                               \
  tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                     \
  tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                     \
  tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                     \
  tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                     \
  out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                       \
  tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                     \
  out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                       \
  tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                     \
                                                                               \
  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                     \
  out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
  out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
                                                                               \
  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                 \
  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                     \
  out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
  out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
                                                                               \
  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);                 \
  out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
  out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
                                                                               \
  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);                 \
  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);                 \
  out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
  out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
}
1751
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : 'out0' and 'out2' come from the word interleave of the
                 halfword-interleaved input pairs; 'out1' and 'out3' are then
                 derived from 'out0' / 'out2' with a left double word
                 interleave.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  v8i16 s0_m;                                                             \
  v8i16 s1_m;                                                             \
                                                                          \
  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                             \
  ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                    \
                                                                          \
  out1 = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out0));               \
  out3 = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out2));               \
}
1765
/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : 'out0' .. 'out3' receive the interleaved data; 'out4' ..
                 'out7' are set to all-zero vectors. Every input is read by
                 the first interleave step, before any output is written.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
                           out0, out1, out2, out3, out4, out5, out6, out7) {  \
  v8i16 zero_m = { 0 };                                                       \
  v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                       \
  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                       \
                                                                              \
  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                          \
             tmp0_n, tmp1_n, tmp2_n, tmp3_n);                                 \
  ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                                \
  ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                                \
                                                                              \
  out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
  out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
  out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
  out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
                                                                              \
  out4 = zero_m;                                                              \
  out5 = zero_m;                                                              \
  out6 = zero_m;                                                              \
  out7 = zero_m;                                                              \
}
1792
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Both halves of the input pairs are halfword-interleaved into
                 temporaries first; the outputs are then formed from word
                 interleaves of those temporaries.
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  v8i16 tmp0_m, tmp1_m;                                                   \
  v8i16 tmp2_m, tmp3_m;                                                   \
                                                                          \
  ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                         \
  ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                         \
                                                                          \
  ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);                 \
  ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);                 \
}
1806
/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : The inputs are halfword-interleaved into eight temporaries
                 ('s0_m' / 's1_m' are reused as scratch for each step) before
                 any output is written. Even-numbered outputs pack the even
                 double words of the temporaries, odd-numbered outputs pack
                 the odd double words.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                       out0, out1, out2, out3, out4, out5, out6, out7) {  \
  v8i16 s0_m, s1_m;                                                       \
  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
                                                                          \
  /* rows 0 .. 3 -> tmp4_m .. tmp7_m */                                   \
  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
                                                                          \
  /* rows 4 .. 7 -> tmp0_m .. tmp3_m */                                   \
  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
                                                                          \
  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,         \
           tmp3_m, tmp7_m, out0, out2, out4, out6);                       \
  out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
  out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
  out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
  out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
}
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
1834
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : The input pairs ('in1', 'in0') and ('in3', 'in2') are
                 word-interleaved into temporaries; the outputs are the right
                 and left double word interleaves of those temporaries.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  v4i32 s0_m, s1_m;                                                       \
  v4i32 s2_m, s3_m;                                                       \
                                                                          \
  ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                      \
  ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                      \
                                                                          \
  out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);                   \
  out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);                   \
  out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);                   \
  out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);                   \
}
1851
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and stored.
                 Four words are loaded from 'pdst' (rows 'stride' apart),
                 widened to halfwords, added to the inputs, clipped, packed
                 back to bytes and stored to the same 'pdst' / 'stride'.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
  uint32_t src0_m, src1_m, src2_m, src3_m;                      \
  v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
  v16i8 dst0_m = { 0 };                                         \
  v16i8 dst1_m = { 0 };                                         \
  v16i8 zero_m = { 0 };                                         \
                                                                \
  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
  LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
  INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
  INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
  CLIP_SH2_0_255(res0_m, res1_m);                               \
  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
}
1874
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed with
                 128 to shift the range from signed to unsigned byte.
                 The macro is a statement expression whose value is the
                 resulting v16u8 vector. Inputs are parenthesized before they
                 are cast so that an expression argument is converted as a
                 whole.
*/
#define PCKEV_XORI128_UB(in0, in1) ({                        \
  v16u8 out_m;                                               \
                                                             \
  out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0));  \
  out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);            \
  out_m;                                                     \
})
1890
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                          pdst, stride
   Details     : 'in0'/'in1' and 'in2'/'in3' are each converted to one unsigned
                 byte vector with PCKEV_XORI128_UB. 'dst0'..'dst3' are combined
                 pairwise with ILVR_D2_UB. The converted vectors are averaged
                 with the combined destination vectors by AVER_UB2_UB, and the
                 two results are handed to ST8x4_UB together with 'pdst' and
                 'stride'. 'pdst' is evaluated once and used as a uint8_t
                 pointer.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \
                                dst0, dst1, dst2, dst3, pdst, stride) {  \
  uint8_t *pdst_m = (uint8_t *)(pdst);                                   \
  v16u8 tmp0_m = PCKEV_XORI128_UB(in0, in1);                             \
  v16u8 tmp1_m = PCKEV_XORI128_UB(in2, in3);                             \
  v16u8 tmp2_m, tmp3_m;                                                  \
                                                                         \
  ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                    \
  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);           \
  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                              \
}
1907
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : The even byte elements of 'in0' and 'in1' are packed into one
                 v16i8 vector with __msa_pckev_b, and that vector is written
                 to 'pdst' with ST_SB (a whole-vector store through a v16i8
                 pointer). Each argument is expanded once and parenthesized
                 before being cast, so expression arguments are cast as a
                 whole.
*/
#define PCKEV_ST_SB(in0, in1, pdst) {                 \
  v16i8 tmp_m;                                        \
                                                      \
  tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0));  \
  ST_SB(tmp_m, (pdst));                               \
}
1918
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword (v8u16)
   Details     : Bytes are selected from 'in0' and 'in1' according to 'mask'
                 (__msa_vshf_b). The selected bytes and the bytes of 'coeff'
                 are combined with an unsigned dot product into halfword
                 elements (__msa_dotp_u_h). Each halfword is then shifted
                 right by 'shift' with rounding (__msa_srari_h).
                 The macro is a statement expression whose value is the
                 filtered vector. Vector arguments are parenthesized before
                 being cast, so expression arguments are cast as a whole.
                 NOTE(review): the rounding shift is applied to the dot
                 product reinterpreted as signed halfwords (v8i16); this
                 presumably relies on the dot product staying below 32768 -
                 confirm against the coefficient range used by callers.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({          \
  v16i8 tmp0_m;                                                      \
  v8u16 tmp1_m;                                                      \
                                                                     \
  tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0));  \
  tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));            \
  tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);               \
                                                                     \
  tmp1_m;                                                            \
})
1932#endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
1933