/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifndef VPX_DSP_MIPS_MACROS_MSA_H_
127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define VPX_DSP_MIPS_MACROS_MSA_H_
137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include <msa.h>
157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "./vpx_config.h"
177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx/vpx_integer.h"
187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load one vector by dereferencing 'psrc' as 'const RTYPE *'
   Arguments   : Inputs  - RTYPE (type to load), psrc (source address)
                 Return Type - as per RTYPE
   Details     : The whole expansion is parenthesized so that the load
                 composes safely inside a larger expression.
                 LD_UB / LD_SB fix RTYPE to v16u8 / v16i8.
*/
#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load one vector by dereferencing 'psrc' as 'const RTYPE *'
   Arguments   : Inputs  - RTYPE (type to load), psrc (source address)
                 Return Type - as per RTYPE
   Details     : The whole expansion is parenthesized so that the load
                 composes safely inside a larger expression.
                 LD_UH / LD_SH fix RTYPE to v8u16 / v8i16.
*/
#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load one vector by dereferencing 'psrc' as 'const RTYPE *'
   Arguments   : Inputs  - RTYPE (type to load), psrc (source address)
                 Return Type - as per RTYPE
   Details     : The whole expansion is parenthesized so that the load
                 composes safely inside a larger expression.
                 LD_SW fixes RTYPE to v4i32.
*/
#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store one vector by assigning 'in' through 'pdst' viewed as
                 'RTYPE *'
   Arguments   : Inputs - RTYPE (type to store), in (value), pdst (destination)
   Details     : The assignment is parenthesized as a whole; without that, an
                 operator following the macro call would bind to 'in' before
                 the store happens.
                 ST_UB / ST_SB fix RTYPE to v16u8 / v16i8.
*/
#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store one vector by assigning 'in' through 'pdst' viewed as
                 'RTYPE *'
   Arguments   : Inputs - RTYPE (type to store), in (value), pdst (destination)
   Details     : The assignment is parenthesized as a whole; without that, an
                 operator following the macro call would bind to 'in' before
                 the store happens.
                 ST_SH fixes RTYPE to v8i16.
*/
#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store one vector by assigning 'in' through 'pdst' viewed as
                 'RTYPE *'
   Arguments   : Inputs - RTYPE (type to store), in (value), pdst (destination)
   Details     : The assignment is parenthesized as a whole; without that, an
                 operator following the macro call would bind to 'in' before
                 the store happens.
                 ST_SW fixes RTYPE to v4i32.
*/
#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if (__mips_isa_rev >= 6)
/* Description : Load a halfword from 'psrc' with the "lh" instruction
   Arguments   : Input  - psrc (source address, any pointer type)
                 Return Type - uint16_t
   Details     : Locals carry an "_lh_" infix so that an argument expression
                 naming a caller variable such as 'psrc_m' or 'val_m' is not
                 captured by this macro's own declarations.
*/
#define LH(psrc) ({                                    \
  const uint8_t *psrc_lh_m = (const uint8_t *)(psrc);  \
  uint16_t val_lh_m;                                   \
                                                       \
  __asm__ __volatile__ (                               \
      "lh  %[val_m],  %[psrc_m]  \n\t"                 \
                                                       \
      : [val_m] "=r" (val_lh_m)                        \
      : [psrc_m] "m" (*psrc_lh_m)                      \
  );                                                   \
                                                       \
  val_lh_m;                                            \
})
547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load a word from 'psrc' with the "lw" instruction
   Arguments   : Input  - psrc (source address, any pointer type)
                 Return Type - uint32_t
   Details     : Locals carry an "_lw_" infix. With a local called 'psrc_m',
                 a call such as LW(psrc_m) would expand to
                 'psrc_m = (const uint8_t *)(psrc_m)' and initialize the new
                 pointer from itself.
*/
#define LW(psrc) ({                                    \
  const uint8_t *psrc_lw_m = (const uint8_t *)(psrc);  \
  uint32_t val_lw_m;                                   \
                                                       \
  __asm__ __volatile__ (                               \
      "lw  %[val_m],  %[psrc_m]  \n\t"                 \
                                                       \
      : [val_m] "=r" (val_lw_m)                        \
      : [psrc_m] "m" (*psrc_lw_m)                      \
  );                                                   \
                                                       \
  val_lw_m;                                            \
})
687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load a double word from 'psrc'
   Arguments   : Input  - psrc (source address, any pointer type)
                 Return Type - uint64_t
   Details     : When __mips == 64 a single "ld" instruction is issued.
                 Otherwise two LW() loads are combined: the word at (psrc)
                 becomes bits 0..31 and the word at (psrc + 4) becomes
                 bits 32..63.
                 The 32-bit variant must not call its pointer 'psrc_m',
                 because that name would be shadowed inside the expansion of
                 LW() and the inner pointer would be initialized from itself.
*/
#if (__mips == 64)
#define LD(psrc) ({                                    \
  const uint8_t *psrc_ld_m = (const uint8_t *)(psrc);  \
  uint64_t val_ld_m = 0;                               \
                                                       \
  __asm__ __volatile__ (                               \
      "ld  %[val_m],  %[psrc_m]  \n\t"                 \
                                                       \
      : [val_m] "=r" (val_ld_m)                        \
      : [psrc_m] "m" (*psrc_ld_m)                      \
  );                                                   \
                                                       \
  val_ld_m;                                            \
})
#else  // !(__mips == 64)
#define LD(psrc) ({                                    \
  const uint8_t *psrc_ld_m = (const uint8_t *)(psrc);  \
  const uint32_t lo_ld_m = LW(psrc_ld_m);              \
  const uint32_t hi_ld_m = LW(psrc_ld_m + 4);          \
                                                       \
  ((uint64_t)hi_ld_m << 32) | (uint64_t)lo_ld_m;       \
})
#endif  // (__mips == 64)
997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store the halfword 'val' to 'pdst' with the "sh" instruction
   Arguments   : Inputs - val (converted to uint16_t), pdst (destination)
   Details     : The body stays a plain brace block rather than
                 do { } while (0), so invocations written without a trailing
                 semicolon keep compiling.
                 Locals carry an "_sh_" infix so argument expressions naming
                 'pdst_m' or 'val_m' are not captured.
*/
#define SH(val, pdst) {                    \
  uint8_t *pdst_sh_m = (uint8_t *)(pdst);  \
  const uint16_t val_sh_m = (val);         \
                                           \
  __asm__ __volatile__ (                   \
      "sh  %[val_m],  %[pdst_m]  \n\t"     \
                                           \
      : [pdst_m] "=m" (*pdst_sh_m)         \
      : [val_m] "r" (val_sh_m)             \
  );                                       \
}
1117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store the word 'val' to 'pdst' with the "sw" instruction
   Arguments   : Inputs - val (converted to uint32_t), pdst (destination)
   Details     : The body stays a plain brace block rather than
                 do { } while (0), so invocations written without a trailing
                 semicolon keep compiling.
                 Locals carry an "_sw_" infix so argument expressions naming
                 'pdst_m' or 'val_m' are not captured.
*/
#define SW(val, pdst) {                    \
  uint8_t *pdst_sw_m = (uint8_t *)(pdst);  \
  const uint32_t val_sw_m = (val);         \
                                           \
  __asm__ __volatile__ (                   \
      "sw  %[val_m],  %[pdst_m]  \n\t"     \
                                           \
      : [pdst_m] "=m" (*pdst_sw_m)         \
      : [val_m] "r" (val_sw_m)             \
  );                                       \
}
1237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store the double word 'val' to 'pdst'
   Arguments   : Inputs - val (converted to uint64_t), pdst (destination)
   Details     : When __mips == 64 a single "sd" instruction is issued.
                 Otherwise the value is split and written with two SW()
                 stores: bits 0..31 to (pdst) and bits 32..63 to (pdst + 4),
                 mirroring how the 32-bit LD() assembles its result.
                 'val' is evaluated exactly once in both variants.
                 The body stays a plain brace block rather than
                 do { } while (0), so invocations written without a trailing
                 semicolon keep compiling.
*/
#if (__mips == 64)
#define SD(val, pdst) {                    \
  uint8_t *pdst_sd_m = (uint8_t *)(pdst);  \
  const uint64_t val_sd_m = (val);         \
                                           \
  __asm__ __volatile__ (                   \
      "sd  %[val_m],  %[pdst_m]  \n\t"     \
                                           \
      : [pdst_m] "=m" (*pdst_sd_m)         \
      : [val_m] "r" (val_sd_m)             \
  );                                       \
}
#else  // !(__mips == 64)
#define SD(val, pdst) {                                       \
  uint8_t *pdst_sd_m = (uint8_t *)(pdst);                     \
  const uint64_t val_sd_m = (val);                            \
  const uint32_t lo_sd_m = (uint32_t)(val_sd_m & 0xFFFFFFFF); \
  const uint32_t hi_sd_m = (uint32_t)(val_sd_m >> 32);        \
                                                              \
  SW(lo_sd_m, pdst_sd_m);                                     \
  SW(hi_sd_m, pdst_sd_m + 4);                                 \
}
#endif  // (__mips == 64)
1357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#else  // !(__mips_isa_rev >= 6)
/* Description : Load a halfword from 'psrc' with the assembler's "ulh"
                 (unaligned load halfword) mnemonic
   Arguments   : Input  - psrc (source address, any pointer type)
                 Return Type - uint16_t
   Details     : Locals carry an "_lh_" infix so that an argument expression
                 naming a caller variable such as 'psrc_m' or 'val_m' is not
                 captured by this macro's own declarations.
*/
#define LH(psrc) ({                                    \
  const uint8_t *psrc_lh_m = (const uint8_t *)(psrc);  \
  uint16_t val_lh_m;                                   \
                                                       \
  __asm__ __volatile__ (                               \
      "ulh  %[val_m],  %[psrc_m]  \n\t"                \
                                                       \
      : [val_m] "=r" (val_lh_m)                        \
      : [psrc_m] "m" (*psrc_lh_m)                      \
  );                                                   \
                                                       \
  val_lh_m;                                            \
})
1497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load a word from 'psrc' with the assembler's "ulw"
                 (unaligned load word) mnemonic
   Arguments   : Input  - psrc (source address, any pointer type)
                 Return Type - uint32_t
   Details     : Locals carry an "_lw_" infix. With a local called 'psrc_m',
                 a call such as LW(psrc_m) would expand to
                 'psrc_m = (const uint8_t *)(psrc_m)' and initialize the new
                 pointer from itself.
*/
#define LW(psrc) ({                                    \
  const uint8_t *psrc_lw_m = (const uint8_t *)(psrc);  \
  uint32_t val_lw_m;                                   \
                                                       \
  __asm__ __volatile__ (                               \
      "ulw  %[val_m],  %[psrc_m]  \n\t"                \
                                                       \
      : [val_m] "=r" (val_lw_m)                        \
      : [psrc_m] "m" (*psrc_lw_m)                      \
  );                                                   \
                                                       \
  val_lw_m;                                            \
})
1637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load a double word from 'psrc'
   Arguments   : Input  - psrc (source address, any pointer type)
                 Return Type - uint64_t
   Details     : When __mips == 64 a single "uld" (unaligned load double
                 word) is issued. Otherwise two LW() loads are combined: the
                 word at (psrc) becomes bits 0..31 and the word at (psrc + 4)
                 becomes bits 32..63.
                 Locals use an "_ld_" infix so they cannot collide with the
                 locals declared inside the nested LW() expansion.
*/
#if (__mips == 64)
#define LD(psrc) ({                                    \
  const uint8_t *psrc_ld_m = (const uint8_t *)(psrc);  \
  uint64_t val_ld_m = 0;                               \
                                                       \
  __asm__ __volatile__ (                               \
      "uld  %[val_m],  %[psrc_m]  \n\t"                \
                                                       \
      : [val_m] "=r" (val_ld_m)                        \
      : [psrc_m] "m" (*psrc_ld_m)                      \
  );                                                   \
                                                       \
  val_ld_m;                                            \
})
#else  // !(__mips == 64)
#define LD(psrc) ({                                    \
  const uint8_t *psrc_ld_m = (const uint8_t *)(psrc);  \
  const uint32_t lo_ld_m = LW(psrc_ld_m);              \
  const uint32_t hi_ld_m = LW(psrc_ld_m + 4);          \
                                                       \
  ((uint64_t)hi_ld_m << 32) | (uint64_t)lo_ld_m;       \
})
#endif  // (__mips == 64)
1947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store the halfword 'val' to 'pdst' with the assembler's
                 "ush" (unaligned store halfword) mnemonic
   Arguments   : Inputs - val (converted to uint16_t), pdst (destination)
   Details     : The body stays a plain brace block rather than
                 do { } while (0), so invocations written without a trailing
                 semicolon keep compiling.
                 Locals carry an "_sh_" infix so argument expressions naming
                 'pdst_m' or 'val_m' are not captured.
*/
#define SH(val, pdst) {                    \
  uint8_t *pdst_sh_m = (uint8_t *)(pdst);  \
  const uint16_t val_sh_m = (val);         \
                                           \
  __asm__ __volatile__ (                   \
      "ush  %[val_m],  %[pdst_m]  \n\t"    \
                                           \
      : [pdst_m] "=m" (*pdst_sh_m)         \
      : [val_m] "r" (val_sh_m)             \
  );                                       \
}
2067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store the word 'val' to 'pdst' with the assembler's "usw"
                 (unaligned store word) mnemonic
   Arguments   : Inputs - val (converted to uint32_t), pdst (destination)
   Details     : The body stays a plain brace block rather than
                 do { } while (0), so invocations written without a trailing
                 semicolon keep compiling.
                 Locals carry an "_sw_" infix so argument expressions naming
                 'pdst_m' or 'val_m' are not captured.
*/
#define SW(val, pdst) {                    \
  uint8_t *pdst_sw_m = (uint8_t *)(pdst);  \
  const uint32_t val_sw_m = (val);         \
                                           \
  __asm__ __volatile__ (                   \
      "usw  %[val_m],  %[pdst_m]  \n\t"    \
                                           \
      : [pdst_m] "=m" (*pdst_sw_m)         \
      : [val_m] "r" (val_sw_m)             \
  );                                       \
}
2187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store the double word 'val' to 'pdst' as two words
   Arguments   : Inputs - val (converted to uint64_t), pdst (destination)
   Details     : Bits 0..31 are written with SW() to (pdst) and bits 32..63
                 to (pdst + 4).
                 'val' is copied into a uint64_t local first, so it is
                 evaluated exactly once and the 32-bit shift is always done
                 on a 64-bit operand.
                 The body stays a plain brace block rather than
                 do { } while (0), so invocations written without a trailing
                 semicolon keep compiling.
*/
#define SD(val, pdst) {                                       \
  uint8_t *pdst_sd_m = (uint8_t *)(pdst);                     \
  const uint64_t val_sd_m = (val);                            \
  const uint32_t lo_sd_m = (uint32_t)(val_sd_m & 0xFFFFFFFF); \
  const uint32_t hi_sd_m = (uint32_t)(val_sd_m >> 32);        \
                                                              \
  SW(lo_sd_m, pdst_sd_m);                                     \
  SW(hi_sd_m, pdst_sd_m + 4);                                 \
}
2297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // (__mips_isa_rev >= 6)
2307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in every use so that a compound
                 argument such as 'a + b' is multiplied as a whole.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) {  \
  out0 = LW((psrc));                                 \
  out1 = LW((psrc) + (stride));                      \
  out2 = LW((psrc) + 2 * (stride));                  \
  out3 = LW((psrc) + 3 * (stride));                  \
}
2457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 'stride' is parenthesized so that a compound argument is
                 added as a whole.
*/
#define LD2(psrc, stride, out0, out1) {  \
  out0 = LD((psrc));                     \
  out1 = LD((psrc) + (stride));          \
}
/* Description : Load 4 double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Two LD2() invocations: the first loads 'out0' and 'out1'
                 starting at (psrc), the second loads 'out2' and 'out3'
                 starting at (psrc + 2 * stride).
                 'stride' is parenthesized in every use so that a compound
                 argument such as 'a + b' is multiplied as a whole.
*/
#define LD4(psrc, stride, out0, out1, out2, out3) {    \
  LD2((psrc), (stride), out0, out1);                   \
  LD2((psrc) + 2 * (stride), (stride), out2, out3);    \
}
2607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in every use so that a compound
                 argument such as 'a + b' is multiplied as a whole.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) {  \
  SW(in0, (pdst));                               \
  SW(in1, (pdst) + (stride));                    \
  SW(in2, (pdst) + 2 * (stride));                \
  SW(in3, (pdst) + 3 * (stride));                \
}
2747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in every use so that a compound
                 argument such as 'a + b' is multiplied as a whole.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) {  \
  SD(in0, (pdst));                               \
  SD(in1, (pdst) + (stride));                    \
  SD(in2, (pdst) + 2 * (stride));                \
  SD(in3, (pdst) + 3 * (stride));                \
}
2887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 'stride' is parenthesized so that a compound argument is
                 added as a whole.
                 LD_UB2 / LD_SB2 fix RTYPE to v16u8 / v16i8.
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) {  \
  out0 = LD_B(RTYPE, (psrc));                     \
  out1 = LD_B(RTYPE, (psrc) + (stride));          \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
3027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 3 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : 'out0' and 'out1' are loaded through LD_B2 from (psrc) and
                 (psrc + stride); 'out2' is loaded from (psrc + 2 * stride)
*/
#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) {  \
  LD_B2(RTYPE, (psrc), stride, out0, out1);             \
  out2 = LD_B(RTYPE, (psrc) + 2 * (stride));            \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
3087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 4 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Two LD_B2 loads: 'out0', 'out1' start at (psrc) and
                 'out2', 'out3' start at (psrc + 2 * stride)
*/
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
  LD_B2(RTYPE, (psrc), stride, out0, out1);                   \
  LD_B2(RTYPE, (psrc) + 2 * (stride), stride, out2, out3);    \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
3157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 5 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4
                 Return Type - as per RTYPE
   Details     : 'out0'..'out3' are loaded through LD_B4 starting at (psrc);
                 'out4' is loaded from (psrc + 4 * stride)
*/
#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) {  \
  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);             \
  out4 = LD_B(RTYPE, (psrc) + 4 * (stride));                        \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
3227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 7 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4, out5, out6
                 Return Type - as per RTYPE
   Details     : 'out0'..'out4' are loaded through LD_B5 starting at (psrc);
                 'out5', 'out6' through LD_B2 starting at (psrc + 5 * stride)
*/
#define LD_B7(RTYPE, psrc, stride,                             \
              out0, out1, out2, out3, out4, out5, out6) {      \
  LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
  LD_B2(RTYPE, (psrc) + 5 * (stride), stride, out5, out6);     \
}
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
3297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 8 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Two LD_B4 loads: 'out0'..'out3' start at (psrc) and
                 'out4'..'out7' start at (psrc + 4 * stride)
*/
#define LD_B8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7) {         \
  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);                 \
  LD_B4(RTYPE, (psrc) + 4 * (stride), stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
3377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load two vectors of 8 halfword elements, 'stride' apart
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives the 8 halfword elements found at (psrc)
                 'out1' receives the 8 halfword elements found at
                 (psrc + stride), using pointer arithmetic on 'psrc'
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_H(RTYPE, (psrc));                \
    out1 = LD_H(RTYPE, (psrc) + (stride));     \
  }
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
3497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 4 vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Two LD_H2 loads: 'out0', 'out1' start at (psrc) and
                 'out2', 'out3' start at (psrc + 2 * stride)
*/
#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
  LD_H2(RTYPE, (psrc), stride, out0, out1);                   \
  LD_H2(RTYPE, (psrc) + 2 * (stride), stride, out2, out3);    \
}
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
3557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 8 vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Two LD_H4 loads: 'out0'..'out3' start at (psrc) and
                 'out4'..'out7' start at (psrc + 4 * stride)
*/
#define LD_H8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7) {         \
  LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);                 \
  LD_H4(RTYPE, (psrc) + 4 * (stride), stride, out4, out5, out6, out7);  \
}
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
3627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 16 vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out15
                 Return Type - as per RTYPE
   Details     : Two LD_H8 loads: 'out0'..'out7' start at (psrc) and
                 'out8'..'out15' start at (psrc + 8 * stride)
*/
#define LD_H16(RTYPE, psrc, stride,                                     \
               out0, out1, out2, out3, out4, out5, out6, out7,          \
               out8, out9, out10, out11, out12, out13, out14, out15) {  \
  LD_H8(RTYPE, (psrc), stride,                                          \
        out0, out1, out2, out3, out4, out5, out6, out7);                \
  LD_H8(RTYPE, (psrc) + 8 * (stride), stride,                           \
        out8, out9, out10, out11, out12, out13, out14, out15);          \
}
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
3727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Input   - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8)
                 'out1' is __msa_ilvl_d(out0, out0) and 'out3' is
                 __msa_ilvl_d(out2, out2), i.e. each is built from the left
                 (upper) doubleword of the vector just loaded
                 'psrc' is parenthesized so expression arguments are safe
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3) {         \
  out0 = LD_SH((psrc));                                  \
  out2 = LD_SH((psrc) + 8);                              \
  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);  \
  out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2);  \
}
3847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : 'out0' is loaded from (psrc) and 'out1' from
                 (psrc + stride); 'stride' is parenthesized so that
                 low-precedence expressions can be passed safely
*/
#define LD_SW2(psrc, stride, out0, out1) {  \
  out0 = LD_SW((psrc));                     \
  out1 = LD_SW((psrc) + (stride));          \
}
3947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 'stride' is added with plain pointer arithmetic on 'pdst'
                 and is parenthesized so that low-precedence expressions
                 can be passed safely
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
  ST_B(RTYPE, in0, (pdst));                     \
  ST_B(RTYPE, in1, (pdst) + (stride));          \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
4057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 4 vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Two ST_B2 stores: 'in0', 'in1' start at (pdst) and
                 'in2', 'in3' start at (pdst + 2 * stride)
*/
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
  ST_B2(RTYPE, in0, in1, (pdst), stride);                 \
  ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), stride);  \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
4117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 8 vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Two ST_B4 stores: 'in0'..'in3' start at (pdst) and
                 'in4'..'in7' start at (pdst + 4 * stride)
*/
#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
              pdst, stride) {                                       \
  ST_B4(RTYPE, in0, in1, in2, in3, (pdst), stride);                 \
  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), stride);  \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
4187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
                 'stride' is added with plain pointer arithmetic on 'pdst'
                 and is parenthesized so that low-precedence expressions
                 can be passed safely
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) {  \
  ST_H(RTYPE, in0, (pdst));                     \
  ST_H(RTYPE, in1, (pdst) + (stride));          \
}
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
4297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 4 vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Two ST_H2 stores: 'in0', 'in1' start at (pdst) and
                 'in2', 'in3' start at (pdst + 2 * stride)
*/
#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
  ST_H2(RTYPE, in0, in1, (pdst), stride);                 \
  ST_H2(RTYPE, in2, in3, (pdst) + 2 * (stride), stride);  \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
4357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 8 vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Two ST_H4 stores: 'in0'..'in3' start at (pdst) and
                 'in4'..'in7' start at (pdst + 4 * stride)
*/
#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) {  \
  ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                           \
  ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), stride);            \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
4417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'stride' is added with plain pointer arithmetic on 'pdst'
                 and is parenthesized so that low-precedence expressions
                 can be passed safely
*/
#define ST_SW2(in0, in1, pdst, stride) {  \
  ST_SW(in0, (pdst));                     \
  ST_SW(in1, (pdst) + (stride));          \
}
4517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to a byte pointer first, so 'stride' is
                 counted in bytes. 'in', 'stidx' and 'stride' are
                 parenthesized so expression arguments are handled correctly.
*/
#define ST2x4_UB(in, stidx, pdst, stride) {           \
  uint16_t out0_m, out1_m, out2_m, out3_m;            \
  uint8_t *pblk_2x4_m = (uint8_t *)(pdst);            \
                                                      \
  out0_m = __msa_copy_u_h((v8i16)(in), (stidx));      \
  out1_m = __msa_copy_u_h((v8i16)(in), (stidx) + 1);  \
  out2_m = __msa_copy_u_h((v8i16)(in), (stidx) + 2);  \
  out3_m = __msa_copy_u_h((v8i16)(in), (stidx) + 3);  \
                                                      \
  SH(out0_m, pblk_2x4_m);                             \
  SH(out1_m, pblk_2x4_m + (stride));                  \
  SH(out2_m, pblk_2x4_m + 2 * (stride));              \
  SH(out3_m, pblk_2x4_m + 3 * (stride));              \
}
4777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to the GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to the GP
                 register and stored to (pdst + stride)
                 'pdst' is converted to a byte pointer first, so 'stride' is
                 counted in bytes. 'in' and 'stride' are parenthesized so
                 expression arguments are handled correctly.
*/
#define ST4x2_UB(in, pdst, stride) {        \
  uint32_t out0_m, out1_m;                  \
  uint8_t *pblk_4x2_m = (uint8_t *)(pdst);  \
                                            \
  out0_m = __msa_copy_u_w((v4i32)(in), 0);  \
  out1_m = __msa_copy_u_w((v4i32)(in), 1);  \
                                            \
  SW(out0_m, pblk_4x2_m);                   \
  SW(out1_m, pblk_4x2_m + (stride));        \
}
4957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to a byte pointer before being handed to
                 SW4 together with 'stride'.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \
  uint32_t out0_m, out1_m, out2_m, out3_m;                          \
  uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                          \
                                                                    \
  out0_m = __msa_copy_u_w((v4i32)(in0), (idx0));                    \
  out1_m = __msa_copy_u_w((v4i32)(in0), (idx1));                    \
  out2_m = __msa_copy_u_w((v4i32)(in1), (idx2));                    \
  out3_m = __msa_copy_u_w((v4i32)(in1), (idx3));                    \
                                                                    \
  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);          \
}
/* Description : Store 4x8 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Word elements 0..3 of 'in0' are stored through ST4x4_UB to
                 the four rows starting at (pdst)
                 Word elements 0..3 of 'in1' are stored through ST4x4_UB to
                 the four rows starting at (pdst + 4 * stride)
                 'pdst' is converted to a byte pointer first, so 'stride' is
                 counted in bytes
*/
#define ST4x8_UB(in0, in1, pdst, stride) {                            \
  uint8_t *pblk_4x8_m = (uint8_t *)(pdst);                            \
                                                                      \
  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8_m, stride);                 \
  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8_m + 4 * (stride), stride);  \
}
5247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' is parenthesized so the cast applies to the whole
                 argument expression
*/
#define ST8x1_UB(in, pdst) {                \
  uint64_t out0_m;                          \
                                            \
  out0_m = __msa_copy_u_d((v2i64)(in), 0);  \
  SD(out0_m, (pdst));                       \
}
5367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'pdst' is converted to a byte pointer first, so 'stride' is
                 counted in bytes. 'in' and 'stride' are parenthesized so
                 expression arguments are handled correctly.
*/
#define ST8x2_UB(in, pdst, stride) {        \
  uint64_t out0_m, out1_m;                  \
  uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
                                            \
  out0_m = __msa_copy_u_d((v2i64)(in), 0);  \
  out1_m = __msa_copy_u_d((v2i64)(in), 1);  \
                                            \
  SD(out0_m, pblk_8x2_m);                   \
  SD(out1_m, pblk_8x2_m + (stride));        \
}
5547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to a byte pointer before being handed to
                 SD4 together with 'stride'. 'in0' and 'in1' are parenthesized
                 so the casts apply to the whole argument expression.
*/
#define ST8x4_UB(in0, in1, pdst, stride) {                  \
  uint64_t out0_m, out1_m, out2_m, out3_m;                  \
  uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                  \
                                                            \
  out0_m = __msa_copy_u_d((v2i64)(in0), 0);                 \
  out1_m = __msa_copy_u_d((v2i64)(in0), 1);                 \
  out2_m = __msa_copy_u_d((v2i64)(in1), 0);                 \
  out3_m = __msa_copy_u_d((v2i64)(in1), 1);                 \
                                                            \
  SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}
5787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Average with rounding, (a + b + 1) / 2, of unsigned bytes
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element of 'in0' is added to the matching
                 unsigned byte element of 'in1'; the average with rounding is
                 written to 'out0'. 'in2' and 'in3' are averaged the same way
                 into 'out1'.
                 Every input is parenthesized before it is cast, so an
                 expression argument is converted as a whole.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                           \
    out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1)); \
    out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3)); \
  }
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
5927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Four-output form of AVER_UB2: the pairs (in0, in1), (in2, in3), (in4, in5)
   and (in6, in7) are handed to AVER_UB2 two pairs at a time, producing
   out0 .. out3 in that order. */
#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
5997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from the all-zero 'zero_m' vector are slid into
                 'in0' by the value specified in 'slide_val' and the result is
                 written to 'out0'; 'in1' is handled the same way into 'out1'.
                 'slide_val' is passed straight to __msa_sldi_b.
                 Inputs are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)          \
  {                                                                \
    v16i8 zero_m = { 0 };                                          \
    out0 = (RTYPE)__msa_sldi_b(zero_m, (v16i8)(in0), (slide_val)); \
    out1 = (RTYPE)__msa_sldi_b(zero_m, (v16i8)(in1), (slide_val)); \
  }
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
6137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Four-output form of SLDI_B2_0: in0 .. in3 are each slid against a zero
   vector by 'slide_val', two at a time via SLDI_B2_0, into out0 .. out3. */
#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
                  slide_val)                                         \
  {                                                                  \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);               \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);               \
  }
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
6207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 the value specified in 'slide_val' and written to 'out0';
                 'in0_1' and 'in1_1' are combined the same way into 'out1'.
                 Inputs are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)    \
  {                                                                          \
    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val)); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val)); \
  }
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
6347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Three-output form of SLDI_B2: (in0_0, in1_0) and (in0_1, in1_1) go through
   SLDI_B2 into out0 and out1; (in0_2, in1_2) is slid here into out2 with the
   same 'slide_val'. 'in0_2' and 'in1_2' are parenthesized before the cast so
   an expression argument is converted as a whole. */
#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val)                                             \
  {                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);       \
    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
  }
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
6427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'; byte elements from
                 'in2' & 'in3' are copied selectively to 'out1' as per control
                 vector 'mask1'.
                 Note the intrinsic receives (mask, in1, in0) / (mask, in3,
                 in2), i.e. the second vector of each pair comes first.
                 Inputs are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
6577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Four shuffles of one source pair: 'in0' & 'in1' are passed to VSHF_B2
   twice, first with mask0/mask1 into out0/out1, then with mask2/mask3 into
   out2/out3. 'in0' and 'in1' therefore appear several times in the
   expansion. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, \
                out2, out3)                                              \
  {                                                                      \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);        \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);        \
  }
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
6657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Dot product of unsigned byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication results of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' the same way.
                 Inputs are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
6817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Four-output form of DOTP_UB2: (mult0, cnst0) .. (mult3, cnst3) are passed
   to DOTP_UB2 two pairs at a time, producing out0 .. out3. */
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
6897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Dot product of signed byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication results of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' the same way.
                 Inputs are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
7057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Four-output form of DOTP_SB2: (mult0, cnst0) .. (mult3, cnst3) are passed
   to DOTP_SB2 two pairs at a time, producing out0 .. out3. */
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
7127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Dot product of signed halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication results of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' the same way.
                 Inputs are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
  }
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
7287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Four-output form of DOTP_SH2: (mult0, cnst0) .. (mult3, cnst3) are passed
   to DOTP_SH2 two pairs at a time, producing out0 .. out3. */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
7367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Dot product of signed word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication results of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' the same way.
                 Inputs are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
7527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Dot product & addition of signed byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the running accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication results of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' and 'cnst1' are accumulated into 'out1' the same way.
                 Operands are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0), \
                                  (v16i8)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1), \
                                  (v16i8)(cnst1));               \
  }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
7687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Four-accumulator form of DPADD_SB2: (mult0, cnst0) .. (mult3, cnst3) are
   passed to DPADD_SB2 two pairs at a time and accumulated into
   out0 .. out3. */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3)                          \
  {                                                                       \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
7757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Dot product & addition of signed halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the running accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication results of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' and 'cnst1' are accumulated into 'out1' the same way.
                 Operands are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), \
                                  (v8i16)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), \
                                  (v8i16)(cnst1));               \
  }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
7917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Dot product of signed word vector elements with themselves,
                 accumulated into double word elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as the running accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with
                 itself producing an intermediate result twice the size of
                 input i.e. signed double word.
                 The multiplication results of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' is squared and accumulated into 'out1' the same way.
                 'mult0' and 'mult1' each appear twice in the expansion.
                 Operands are parenthesized before they are cast, so an
                 expression argument is converted as a whole.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)               \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), \
                                  (v4i32)(mult0));               \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), \
                                  (v4i32)(mult1));               \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
8077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'; 'in1' is
                 updated against the same 'min_vec'.
                 'min_vec' is cast to v8u16 like the other operand, so any
                 vector type accepted by that cast may be passed.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)                       \
  {                                                             \
    in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), (v8u16)(min_vec)); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), (v8u16)(min_vec)); \
  }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
8217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Four-vector form of MIN_UH2: in0 .. in3 are each replaced, two at a time
   via MIN_UH2, by their minimum against the shared 'min_vec'. */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
  {                                                 \
    MIN_UH2(RTYPE, in0, in1, min_vec);              \
    MIN_UH2(RTYPE, in2, in3, min_vec);              \
  }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
8277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
   Details     : 'in' is first raised to at least 0 with __msa_maxi_s_h and
                 then limited to at most 255 with __msa_min_s_h. The macro is
                 a statement expression whose value is the clipped vector.
                 'in' is parenthesized before the cast, so an expression
                 argument is converted as a whole.
*/
#define CLIP_SH_0_255(in)                   \
  ({                                        \
    v8i16 max_m = __msa_ldi_h(255);         \
    v8i16 out_m;                            \
                                            \
    out_m = __msa_maxi_s_h((v8i16)(in), 0); \
    out_m = __msa_min_s_h(max_m, out_m);    \
    out_m;                                  \
  })
8427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define CLIP_SH2_0_255(in0, in1) {  \
8437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  in0 = CLIP_SH_0_255(in0);         \
8447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  in1 = CLIP_SH_0_255(in1);         \
8457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
8467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define CLIP_SH4_0_255(in0, in1, in2, in3) {  \
8477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  CLIP_SH2_0_255(in0, in1);                   \
8487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  CLIP_SH2_0_255(in2, in3);                   \
8497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
8507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 'in' is expanded exactly once (captured into res0_m) so an
                 argument with side effects is evaluated a single time.
                 The pairwise sums are formed as doublewords; word element 0
                 of the final doubleword total is what gets returned.
*/
#define HADD_SW_S32(in) ({                                \
  v2i64 res0_m, res1_m;                                   \
  int32_t sum_m;                                          \
                                                          \
  res0_m = (v2i64)(in);                                   \
  res0_m = __msa_hadd_s_d((v4i32)res0_m, (v4i32)res0_m);  \
  res1_m = __msa_splati_d(res0_m, 1);                     \
  res0_m = res0_m + res1_m;                               \
  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);               \
  sum_m;                                                  \
})
8687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned.
                 'in' is expanded exactly once (captured into res_m) so an
                 argument with side effects is evaluated a single time.
                 Halfword pairs are summed to words, word pairs to
                 doublewords, the two doublewords are added and word
                 element 0 of that total is returned.
*/
#define HADD_UH_U32(in) ({                             \
  v4u32 res_m;                                         \
  v2u64 res0_m, res1_m;                                \
  uint32_t sum_m;                                      \
                                                       \
  res_m = (v4u32)(in);                                 \
  res_m = __msa_hadd_u_w((v8u16)res_m, (v8u16)res_m);  \
  res0_m = __msa_hadd_u_d(res_m, res_m);               \
  res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);    \
  res0_m = res0_m + res1_m;                            \
  sum_m = __msa_copy_u_w((v4i32)res0_m, 0);            \
  sum_m;                                               \
})
8887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1       (HADD_UB4: in0, in1, in2, in3)
                 Outputs - out0, out1     (HADD_UB4: out0, out1, out2, out3)
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'. Every further
                 input/output pair ('in1' -> 'out1', ...) is processed the
                 same way.
                 Each input is expanded twice, so it must not have side
                 effects; inputs are parenthesized before being cast.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) {              \
  out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0));  \
  out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1));  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* Four-vector form: HADD_UB2 applied to (in0, in1) and then to (in2, in3) */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
  HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
  HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
}
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
9087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned even byte element from 'in0' is subtracted
                 from the unsigned odd byte element of the same pair of 'in0'
                 (odd - even, as defined for the MSA HSUB_U instruction) and
                 the halfword result is written to 'out0'. 'in1' -> 'out1' is
                 processed the same way.
                 Each input is expanded twice, so it must not have side
                 effects; inputs are parenthesized before being cast.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) {              \
  out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));  \
  out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));  \
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
9227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m                 (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0_m' (likewise
                 'in1' with 'ref1' in 'diff1_m'). For each difference vector
                 the even-odd byte pairs are added together to generate
                 8 halfword results, and the results of both vectors are
                 accumulated into 'sad_m', which starts at zero.
                 Every argument is expanded exactly once and is parenthesized
                 before being cast.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) ({                 \
  v16u8 diff0_m, diff1_m;                                   \
  v8u16 sad_m = { 0 };                                      \
                                                            \
  diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0));    \
  diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1));    \
                                                            \
  sad_m += __msa_hadd_u_h(diff0_m, diff0_m);                \
  sad_m += __msa_hadd_u_h(diff1_m, diff1_m);                \
                                                            \
  sad_m;                                                    \
})
9437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed even halfword element from 'in0' is subtracted
                 from the signed odd halfword element of the same pair of
                 'in0' (odd - even, as defined for the MSA HSUB_S
                 instruction) and the word result is written to 'out0'.
                 'in1' -> 'out1' is processed the same way.
                 Note: despite the 'UH' in the macro name, the inputs are
                 cast to v8i16 and the signed intrinsic is used.
                 Each input is expanded twice, so it must not have side
                 effects; inputs are parenthesized before being cast.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1) {              \
  out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0));  \
  out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1));  \
}
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
9577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Set word elements 0 and 1 of a vector to GPR values
   Arguments   : Inputs - in0, in1
                 Output - out
                 Return Type - as per RTYPE
   Details     : Set element 0 in vector 'out' to value specified in 'in0'
                 and element 1 to the value specified in 'in1'.
                 'out' is read as well as written: the remaining elements
                 keep whatever 'out' held before, so 'out' must be an
                 initialized vector lvalue if those elements matter.
*/
#define INSERT_W2(RTYPE, in0, in1, out) {               \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 0, (in0));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 1, (in1));  \
}
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
9697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Set all four word elements of a vector to GPR values
   Arguments   : Inputs - in0, in1, in2, in3
                 Output - out
                 Return Type - as per RTYPE
   Details     : Word elements 0, 1, 2 and 3 of vector 'out' are set, in
                 that order, to 'in0', 'in1', 'in2' and 'in3'.
                 'out' is read as well as written and must be a vector
                 lvalue.
*/
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) {     \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 0, (in0));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 1, (in1));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 2, (in2));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 3, (in3));  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
9787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Set both doubleword elements of a vector to GPR values
   Arguments   : Inputs - in0, in1
                 Output - out
                 Return Type - as per RTYPE
   Details     : Doubleword element 0 of vector 'out' is set to 'in0' and
                 doubleword element 1 to 'in1'.
                 'out' is read as well as written and must be a vector
                 lvalue.
*/
#define INSERT_D2(RTYPE, in0, in1, out) {               \
  out = (RTYPE)__msa_insert_d((v2i64)(out), 0, (in0));  \
  out = (RTYPE)__msa_insert_d((v2i64)(out), 1, (in1));  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
9857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; even byte elements of 'in2' and
                 'in3' are interleaved and written to 'out1'.
                 Note the operand order: the second vector of each pair is
                 passed as the first intrinsic operand.
                 Inputs are parenthesized before being cast, so expression
                 arguments are cast as a whole.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
9997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; even halfword elements of 'in2' and
                 'in3' are interleaved and written to 'out1'.
                 Note the operand order: the second vector of each pair is
                 passed as the first intrinsic operand.
                 Inputs are parenthesized before being cast, so expression
                 arguments are cast as a whole.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
  out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
10147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; even word elements of 'in2' and
                 'in3' are interleaved and written to 'out1'.
                 Note the operand order: the second vector of each pair is
                 passed as the first intrinsic operand.
                 Inputs are parenthesized before being cast, so expression
                 arguments are cast as a whole.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
  out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
}
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
10277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; even double word elements of 'in2'
                 and 'in3' are interleaved and written to 'out1'.
                 Note the operand order: the second vector of each pair is
                 passed as the first intrinsic operand.
                 Inputs are parenthesized before being cast, so expression
                 arguments are cast as a whole.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
  out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
  out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
10407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3   (ILVL_B4: in0 .. in7)
                 Outputs - out0, out1           (ILVL_B4: out0 .. out3)
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'. Each following input pair
                 ('in2'/'in3' -> 'out1', ...) is processed the same way.
                 Inputs are parenthesized before being cast, so expression
                 arguments are cast as a whole.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3));  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* Four-output form: ILVL_B2 on (in0..in3) and then on (in4..in7) */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
10647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' are
                 processed the same way into 'out1'.
                 Inputs are parenthesized before being cast, so expression
                 arguments are cast as a whole.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3));  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
10777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' are processed the
                 same way into 'out1'.
                 Inputs are parenthesized before being cast, so expression
                 arguments are cast as a whole.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3));  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
10917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                           (ILVR_B4: in0 .. in7, ILVR_B8: in0 .. in15)
                 Outputs - out0, out1
                           (ILVR_B4: out0 .. out3, ILVR_B8: out0 .. out7)
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to out0. Each following input pair
                 ('in2'/'in3' -> 'out1', ...) is processed the same way.
                 Inputs are parenthesized before being cast, so expression
                 arguments are cast as a whole.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3));  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

/* Four-output form: ILVR_B2 on (in0..in3) and then on (in4..in7) */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

/* Eight-output form: ILVR_B4 on (in0..in7) and then on (in8..in15) */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                in8, in9, in10, in11, in12, in13, in14, in15,      \
                out0, out1, out2, out3, out4, out5, out6, out7) {  \
  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,           \
          out0, out1, out2, out3);                                 \
  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,     \
          out4, out5, out6, out7);                                 \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
11277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'. 'in2' and 'in3' are
                 processed the same way into 'out1'.
                 Every input is parenthesized before the v8i16 cast so that
                 an expression argument is cast as a whole, not just its
                 first operand.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
11407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-pair form of ILVR_H2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Invokes ILVR_H2 on in0..in3 (results in 'out0', 'out1') and
                 then on in4..in7 (results in 'out2', 'out3').
*/
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
  {                                                            \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
11477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_w applied to 'in0' and 'in1';
                 'out1' receives the same operation on 'in2' and 'in3'.
                 Every input is parenthesized before the v4i32 cast so that
                 an expression argument is cast as a whole.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
11547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-pair form of ILVR_W2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Invokes ILVR_W2 on in0..in3 (results in 'out0', 'out1') and
                 then on in4..in7 (results in 'out2', 'out3').
*/
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
  {                                                            \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
11617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' are
                 handled identically and written to 'out1'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
/* Convenience wrappers that fix RTYPE. */
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
11767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Three-pair form of ILVR_D2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : The first two pairs go through ILVR_D2 ('out0', 'out1');
                 the third pair ('in4', 'in5') is interleaved directly with
                 __msa_ilvr_d and written to 'out2'.
*/
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  {                                                                    \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
11827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-pair form of ILVR_D2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Invokes ILVR_D2 on in0..in3 (results in 'out0', 'out1') and
                 then on in4..in7 (results in 'out2', 'out3').
*/
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
  {                                                            \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
11907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'. Left half of byte
                 elements from 'in0' and 'in1' are interleaved and written
                 to 'out1'.
                 'in0' and 'in1' are each expanded twice, and 'out0' is
                 assigned before they are read for 'out1', so avoid arguments
                 with side effects and do not pass an input as 'out0'.
                 Inputs are parenthesized before the v16i8 cast so that an
                 expression argument is cast as a whole.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
12067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave both right and left half of halfword elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_h of 'in0' and 'in1' (right
                 half); 'out1' receives __msa_ilvl_h of the same inputs
                 (left half).
                 'in0' and 'in1' are each expanded twice, and 'out0' is
                 assigned before they are read for 'out1'.
                 Inputs are parenthesized before the v8i16 cast so that an
                 expression argument is cast as a whole.
*/
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
12137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Interleave both right and left half of word elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_w of 'in0' and 'in1' (right
                 half); 'out1' receives __msa_ilvl_w of the same inputs
                 (left half).
                 'in0' and 'in1' are each expanded twice, and 'out0' is
                 assigned before they are read for 'out1'.
                 Inputs are parenthesized before the v4i32 cast so that an
                 expression argument is cast as a whole.
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
  }
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
12207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' and from 'in1' is
                 saturated to the value generated with (sat_val + 1) bit
                 range. Both vectors are overwritten with their results.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), sat_val); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
12367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-vector form of SAT_UH2
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Invokes SAT_UH2 on ('in0', 'in1') and then on
                 ('in2', 'in3') with the same 'sat_val'.
                 Both invocations are terminated with ';' (matching SAT_SH4)
                 so the macro does not depend on SAT_UH2 expanding to a
                 brace block.
*/
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_UH2(RTYPE, in0, in1, sat_val);              \
    SAT_UH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
12427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Saturate the signed halfword element values to the signed
                 range of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' are each cast to v8i16, passed through
                 __msa_sat_s_h with 'sat_val', and overwritten with the
                 result.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
12587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-vector form of SAT_SH2
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Invokes SAT_SH2 on ('in0', 'in1') and then on
                 ('in2', 'in3') with the same 'sat_val'.
*/
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_SH2(RTYPE, in0, in1, sat_val);              \
    SAT_SH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
12647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector; 'idx1' element value is
                 replicated to all elements in 'out1' vector.
                 Valid index range for halfword operation is 0-7.
                 'in' is expanded twice and is parenthesized before the
                 v8i16 cast so that an expression argument is cast as a
                 whole.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
  {                                                  \
    out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0); \
    out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1); \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
12797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-index form of SPLATI_H2
   Arguments   : Inputs  - in, idx0, idx1, idx2, idx3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Invokes SPLATI_H2 on 'in' with ('idx0', 'idx1') for
                 'out0'/'out1' and then with ('idx2', 'idx3') for
                 'out2'/'out3'.
*/
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
                  out0, out1, out2, out3)            \
  {                                                  \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);    \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);    \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
12877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'. 'in2' and 'in3' are packed the same way
                 into 'out1'.
                 Every input is parenthesized before the v16i8 cast so that
                 an expression argument is cast as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
13037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-pair form of PCKEV_B2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Invokes PCKEV_B2 on in0..in3 (results in 'out0', 'out1')
                 and then on in4..in7 (results in 'out2', 'out3').
*/
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
  {                                                             \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
13127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half
                 of 'out0' & even halfword elements of 'in1' are copied to
                 the right half of 'out0'. 'in2' and 'in3' are packed the
                 same way into 'out1'.
                 Every input is parenthesized before the v8i16 cast so that
                 an expression argument is cast as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
13277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-pair form of PCKEV_H2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Invokes PCKEV_H2 on in0..in3 (results in 'out0', 'out1')
                 and then on in4..in7 (results in 'out2', 'out3').
*/
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
  {                                                             \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
13347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left
                 half of 'out0' & even double word elements of 'in1' are
                 copied to the right half of 'out0'. 'in2' and 'in3' are
                 packed the same way into 'out1'.
                 Every input is parenthesized before the v2i64 cast so that
                 an expression argument is cast as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
13497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-pair form of PCKEV_D2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Invokes PCKEV_D2 on in0..in3 (results in 'out0', 'out1')
                 and then on in4..in7 (results in 'out2', 'out3').
*/
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
  {                                                             \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
13567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vectors 'in0' and
                 'in1' is logically xor'ed with 128 and the result is
                 stored in-place.
*/
#define XORI_B2_128(RTYPE, in0, in1)              \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
13707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Three-vector form of XORI_B2_128
   Arguments   : Inputs  - in0, in1, in2
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' go through XORI_B2_128; 'in2' is xor'ed
                 with 128 directly via __msa_xori_b and stored in-place.
*/
#define XORI_B3_128(RTYPE, in0, in1, in2)         \
  {                                               \
    XORI_B2_128(RTYPE, in0, in1);                 \
    in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
13767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Four-vector form of XORI_B2_128
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Invokes XORI_B2_128 on ('in0', 'in1') and then on
                 ('in2', 'in3').
*/
#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  {                                            \
    XORI_B2_128(RTYPE, in0, in1);              \
    XORI_B2_128(RTYPE, in2, in3);              \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
13837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Seven-vector form of XORI_B2_128
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Invokes XORI_B4_128 on in0..in3 and then XORI_B3_128 on
                 in4..in6.
*/
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
  {                                                           \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                   \
    XORI_B3_128(RTYPE, in4, in5, in6);                        \
  }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
13897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'. The pairs ('in2', 'in3'),
                 ('in4', 'in5') and ('in6', 'in7') are averaged the same way
                 into 'out1', 'out2' and 'out3'.
                 Every input is parenthesized before the v8i16 cast so that
                 an expression argument is cast as a whole.
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
  {                                                            \
    out0 = (RTYPE)__msa_ave_s_h((v8i16)(in0), (v8i16)(in1));   \
    out1 = (RTYPE)__msa_ave_s_h((v8i16)(in2), (v8i16)(in3));   \
    out2 = (RTYPE)__msa_ave_s_h((v8i16)(in4), (v8i16)(in5));   \
    out3 = (RTYPE)__msa_ave_s_h((v8i16)(in6), (v8i16)(in7));   \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
14077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between halfword data type range and written to 'out0'.
                 'in2' and 'in3' are combined the same way into 'out1'.
                 ADDS_SH4 applies ADDS_SH2 twice: (in0..in3) -> out0, out1 and
                 (in4..in7) -> out2, out3.
                 Every input is parenthesized before the v8i16 cast so that an
                 expression argument is cast as a whole.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) {      \
  out0 = (RTYPE)__msa_adds_s_h((v8i16)(in0), (v8i16)(in1));    \
  out1 = (RTYPE)__msa_adds_s_h((v8i16)(in2), (v8i16)(in3));    \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
14287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in-place. 'in1', 'in2' and 'in3' are
                 shifted the same way.
                 Operands are parenthesized so that an expression passed as
                 'shift' (e.g. 'a | b') is evaluated before the shift is
                 applied. 'shift' is expanded once per vector.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) {  \
  in0 = (in0) << (shift);                     \
  in1 = (in1) << (shift);                     \
  in2 = (in2) << (shift);                     \
  in3 = (in3) << (shift);                     \
}
14427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is a GP variable.
                 'in1', 'in2' and 'in3' are shifted the same way.
                 The macro applies the C '>>' operator.
                 NOTE(review): '>>' is an arithmetic shift only for signed
                 element types - confirm callers pass signed vectors.
                 Operands are parenthesized so that an expression passed as
                 'shift' (e.g. 'a | b') is evaluated before the shift is
                 applied. 'shift' is expanded once per vector.
*/
#define SRA_4V(in0, in1, in2, in3, shift) {  \
  in0 = (in0) >> (shift);                    \
  in1 = (in1) >> (shift);                    \
  in2 = (in2) >> (shift);                    \
  in3 = (in3) >> (shift);                    \
}
14577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the number of bits in the corresponding element in the vector
                 'shift'. The last discarded bit is added to shifted value for
                 rounding and the result is written in-place.
                 'shift' is a vector.
                 'in1' is processed the same way. SRAR_W4 applies SRAR_W2 to
                 (in0, in1) and then to (in2, in3) with the same 'shift'.
                 Every input is parenthesized before the v4i32 cast so that an
                 expression argument is cast as a whole.
*/
#define SRAR_W2(RTYPE, in0, in1, shift) {                   \
  in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift));  \
  in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift));  \
}

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) {  \
  SRAR_W2(RTYPE, in0, in1, shift);                   \
  SRAR_W2(RTYPE, in2, in3, shift);                   \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
14787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the value in 'shift'. The last discarded bit is added to the
                 shifted value for rounding and the result is written in-place.
                 'shift' is an immediate value.
                 'in1' is processed the same way. SRARI_H4 applies SRARI_H2 to
                 (in0, in1) and then to (in2, in3) with the same 'shift'.
                 The vector inputs are parenthesized before the v8i16 cast so
                 that an expression argument is cast as a whole.
*/
#define SRARI_H2(RTYPE, in0, in1, shift) {          \
  in0 = (RTYPE)__msa_srari_h((v8i16)(in0), shift);  \
  in1 = (RTYPE)__msa_srari_h((v8i16)(in1), shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) {  \
  SRARI_H2(RTYPE, in0, in1, shift);                   \
  SRARI_H2(RTYPE, in2, in3, shift);                   \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
15017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Word-element variants of the rounded arithmetic right shift by an immediate.
   SRARI_W2 passes 'in0' and 'in1' (cast to v4i32) with 'shift' to
   __msa_srari_w and stores the results back in-place as RTYPE. SRARI_W4
   applies SRARI_W2 to (in0, in1) and then to (in2, in3).
   The vector inputs are parenthesized before the v4i32 cast so that an
   expression argument is cast as a whole.
*/
#define SRARI_W2(RTYPE, in0, in1, shift) {          \
  in0 = (RTYPE)__msa_srari_w((v4i32)(in0), shift);  \
  in1 = (RTYPE)__msa_srari_w((v4i32)(in1), shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) {  \
  SRARI_W2(RTYPE, in0, in1, shift);                   \
  SRARI_W2(RTYPE, in2, in3, shift);                   \
}
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
15137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written to 'out0'. 'in1', 'in2' and 'in3' are
                 shifted the same way into 'out1', 'out2' and 'out3'.
                 'shift' is an immediate value.
                 The vector inputs are parenthesized before the v8i16 cast so
                 that an expression argument is cast as a whole.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) {  \
  out0 = (RTYPE)__msa_srli_h((v8i16)(in0), shift);                           \
  out1 = (RTYPE)__msa_srli_h((v8i16)(in1), shift);                           \
  out2 = (RTYPE)__msa_srli_h((v8i16)(in2), shift);                           \
  out3 = (RTYPE)__msa_srli_h((v8i16)(in3), shift);                           \
}
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
15287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'. 'in2' and 'in3' are
                 multiplied the same way into 'out1'.
                 MUL4 applies MUL2 twice: (in0..in3) -> out0, out1 and
                 (in4..in7) -> out2, out3.
                 Operands are parenthesized so that an expression argument
                 (e.g. 'a + b') is multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) * (in1);                         \
  out1 = (in2) * (in3);                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  MUL2(in0, in1, in2, in3, out0, out1);               \
  MUL2(in4, in5, in6, in7, out2, out3);               \
}
15447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'. 'in2' and 'in3' are added the same way into 'out1'.
                 ADD4 applies ADD2 twice: (in0..in3) -> out0, out1 and
                 (in4..in7) -> out2, out3.
                 Operands are parenthesized so that an expression argument
                 using a lower-precedence operator (e.g. 'a << 1', 'a & b') is
                 added as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) + (in1);                         \
  out1 = (in2) + (in3);                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  ADD2(in0, in1, in2, in3, out0, out1);               \
  ADD2(in4, in5, in6, in7, out2, out3);               \
}
15607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'. 'in3' is subtracted from 'in2' the same way
                 into 'out1'.
                 SUB4 applies SUB2 twice: (in0..in3) -> out0, out1 and
                 (in4..in7) -> out2, out3.
                 Operands are parenthesized so that an expression argument
                 (e.g. 'a - b' as the subtrahend) is subtracted as a whole.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) - (in1);                         \
  out1 = (in2) - (in3);                         \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  SUB2(in0, in1, in2, in3, out0, out1);               \
  SUB2(in4, in5, in6, in7, out2, out3);               \
}
15787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 4 word elements keeping sign intact.
                 'in' is expanded twice, so it should be free of side effects.
                 It is parenthesized before the v8i16 cast so that an
                 expression argument is cast as a whole.
*/
#define UNPCK_R_SH_SW(in, out) {                   \
  v8i16 sign_m;                                    \
                                                   \
  sign_m = __msa_clti_s_h((v8i16)(in), 0);         \
  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));  \
}
15937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (zero extended halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 The extension is done by interleaving 'in' with an all-zero
                 byte vector through ILVRL_B2_SH. 'in' is forwarded in
                 parentheses so that an expression argument stays a single
                 operand inside that macro.
*/
#define UNPCK_UB_SH(in, out0, out1) {     \
  v16i8 zero_m = { 0 };                   \
                                          \
  ILVRL_B2_SH(zero_m, (in), out0, out1);  \
}
16067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
                 'in' is expanded more than once, so it should be free of side
                 effects. It is parenthesized before the v8i16 cast and when
                 forwarded to ILVRL_H2_SW so that an expression argument is
                 treated as a single operand.
*/
#define UNPCK_SH_SW(in, out0, out1) {      \
  v8i16 tmp_m;                             \
                                           \
  tmp_m = __msa_clti_s_h((v8i16)(in), 0);  \
  ILVRL_H2_SW(tmp_m, (in), out0, out1);    \
}
16247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
                 out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3.
                 The assignments run in that order and each input is expanded
                 twice, so an output that names an input still to be read will
                 feed the overwritten value into the later statement.
                 Operands are parenthesized so that an expression argument is
                 added or subtracted as a whole.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  out0 = (in0) + (in3);                                            \
  out1 = (in1) + (in2);                                            \
                                                                   \
  out2 = (in1) - (in2);                                            \
  out3 = (in0) - (in3);                                            \
}
16377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
                 out0..out3 hold the sums in0 + in7, in1 + in6, in2 + in5 and
                 in3 + in4; out4..out7 hold the differences in3 - in4,
                 in2 - in5, in1 - in6 and in0 - in7.
                 The assignments run in that order and each input is expanded
                 twice, so an output that names an input still to be read will
                 feed the overwritten value into the later statement.
                 Operands are parenthesized so that an expression argument is
                 added or subtracted as a whole.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,            \
                    out0, out1, out2, out3, out4, out5, out6, out7) {  \
  out0 = (in0) + (in7);                                                \
  out1 = (in1) + (in6);                                                \
  out2 = (in2) + (in5);                                                \
  out3 = (in3) + (in4);                                                \
                                                                       \
  out4 = (in3) - (in4);                                                \
  out5 = (in2) - (in5);                                                \
  out6 = (in1) - (in6);                                                \
  out7 = (in0) - (in7);                                                \
}
16557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
                 out0..out7 hold the sums in0 + in15, in1 + in14, ...,
                 in7 + in8; out8..out15 hold the differences in7 - in8,
                 in6 - in9, ..., in0 - in15.
                 The assignments run in that order and each input is expanded
                 twice, so an output that names an input still to be read will
                 feed the overwritten value into the later statement.
                 Operands are parenthesized so that an expression argument is
                 added or subtracted as a whole.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                  \
                     in8, in9,  in10, in11, in12, in13, in14, in15,           \
                     out0, out1, out2, out3, out4, out5, out6, out7,          \
                     out8, out9, out10, out11, out12, out13, out14, out15) {  \
  out0 = (in0) + (in15);                                                      \
  out1 = (in1) + (in14);                                                      \
  out2 = (in2) + (in13);                                                      \
  out3 = (in3) + (in12);                                                      \
  out4 = (in4) + (in11);                                                      \
  out5 = (in5) + (in10);                                                      \
  out6 = (in6) + (in9);                                                       \
  out7 = (in7) + (in8);                                                       \
                                                                              \
  out8 = (in7) - (in8);                                                       \
  out9 = (in6) - (in9);                                                       \
  out10 = (in5) - (in10);                                                     \
  out11 = (in4) - (in11);                                                     \
  out12 = (in3) - (in12);                                                     \
  out13 = (in2) - (in13);                                                     \
  out14 = (in1) - (in14);                                                     \
  out15 = (in0) - (in15);                                                     \
}
16837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Transpose an 8x8 block of byte elements
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : The input rows are read only by the first byte interleave;
                 every later step works on local temporaries. Each word
                 interleave is handed a pair of even numbered outputs, and
                 SLDI_B2_0 is then given that pair, the two neighbouring odd
                 numbered outputs and a shift of 8.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                        out1, out2, out3, out4, out5, out6, out7) \
  { \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    /* Last point at which in0..in7 are read. */ \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, \
               tmp2_m, tmp3_m); \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
    \
    /* Outputs 0..3 */ \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
    \
    /* Outputs 4..7 */ \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
  }
/* Unsigned byte flavour of TRANSPOSE8x8_UB. */
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
17047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : out0..out7 double as scratch storage: the first ILVEV_D2_UB
                 call is handed out7/out6 while in2..in7 and in10..in15 have
                 not been read yet.
                 NOTE(review): this presumably means the output variables
                 must not alias the inputs - confirm against callers.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1, \
                            out2, out3, out4, out5, out6, out7) \
  { \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    /* Pair row N with row N + 8; results are parked in the outputs. */ \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
    \
    /* Even/odd byte interleave of each parked pair. */ \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
    \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    \
    /* Each temporary is computed once; the earlier revision repeated */ \
    /* both assignments verbatim, which had no effect on the result.  */ \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
  }
17517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Rows are paired with a halfword interleave, the two pairs
                 are word interleaved into out0 and out2, and out1/out3 are
                 then formed from out0/out2 with __msa_ilvl_d.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    v8i16 rows01_m; \
    v8i16 rows23_m; \
    \
    ILVR_H2_SH(in1, in0, in3, in2, rows01_m, rows23_m); \
    ILVRL_W2_SH(rows23_m, rows01_m, out0, out2); \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
  }
17657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : out0..out3 receive the transposed data; out4..out7 are set
                 to all-zero vectors. The zeroing is done last, after every
                 input has been read.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                           out1, out2, out3, out4, out5, out6, out7) \
  { \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
    v8i16 zero_m = { 0 }; \
    \
    /* Halfword interleave of neighbouring rows. */ \
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, \
               tmp2_n, tmp3_n); \
    \
    /* Word interleave of the row pairs. */ \
    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
    \
    /* Doubleword interleave produces the four data-carrying outputs. */ \
    out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
    out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
    out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
    \
    /* Remaining outputs carry no data. */ \
    out4 = zero_m; \
    out5 = zero_m; \
    out6 = zero_m; \
    out7 = zero_m; \
  }
17927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Rows (in0, in1) and (in2, in3) are halfword interleaved with
                 both ILVR and ILVL; the four results are then word
                 interleaved, ILVR_W2_SH writing out0/out2 and ILVL_W2_SH
                 writing out1/out3.
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    v8i16 h01_r_m, h23_r_m; \
    v8i16 h01_l_m, h23_l_m; \
    \
    /* All reads of in0..in3 happen in these two interleaves. */ \
    ILVL_H2_SH(in1, in0, in3, in2, h01_l_m, h23_l_m); \
    ILVR_H2_SH(in1, in0, in3, in2, h01_r_m, h23_r_m); \
    \
    ILVR_W2_SH(h23_r_m, h01_r_m, h23_l_m, h01_l_m, out0, out2); \
    ILVL_W2_SH(h23_r_m, h01_r_m, h23_l_m, h01_l_m, out1, out3); \
  }
18067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : in0..in3 and in4..in7 are each reduced to four temporaries
                 through two halfword interleave stages. The outputs are
                 written only afterwards: even numbered ones by PCKEV_D4 and
                 odd numbered ones by __msa_pckod_d on the same operand
                 pairs.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                       out1, out2, out3, out4, out5, out6, out7) \
  { \
    v8i16 s0_m, s1_m; \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
    \
    /* in0..in3: ILVR pass fills tmp4/tmp5, ILVL pass fills tmp6/tmp7. */ \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
    \
    /* in4..in7: ILVR pass fills tmp0/tmp1, ILVL pass fills tmp2/tmp3. */ \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
    \
    /* Even numbered outputs. */ \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
             tmp7_m, out0, out2, out4, out6); \
    \
    /* Odd numbered outputs. */ \
    out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
    out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
    out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
    out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
  }
/* Signed halfword flavour of TRANSPOSE8x8_H. */
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
18347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : Rows (in0, in1) and (in2, in3) are word interleaved first;
                 the outputs are then assembled from those four temporaries
                 with __msa_ilvr_d / __msa_ilvl_d, so all inputs are read
                 before any output is written.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    v4i32 w01_r_m, w01_l_m; \
    v4i32 w23_r_m, w23_l_m; \
    \
    ILVRL_W2_SW(in1, in0, w01_r_m, w01_l_m); \
    ILVRL_W2_SW(in3, in2, w23_r_m, w23_l_m); \
    \
    out0 = (v4i32)__msa_ilvr_d((v2i64)w23_r_m, (v2i64)w01_r_m); \
    out1 = (v4i32)__msa_ilvl_d((v2i64)w23_r_m, (v2i64)w01_r_m); \
    out2 = (v4i32)__msa_ilvr_d((v2i64)w23_l_m, (v2i64)w01_l_m); \
    out3 = (v4i32)__msa_ilvl_d((v2i64)w23_l_m, (v2i64)w01_l_m); \
  }
18517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and stored.
                 Four 32-bit words are loaded from pdst (LW4 with 'stride'),
                 widened against a zero vector, added to the paired inputs,
                 clipped, packed back to bytes and written out with ST4x4_UB.
                 'pdst' is expanded more than once, so it should be free of
                 side effects.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
  { \
    uint32_t src0_m, src1_m, src2_m, src3_m; \
    v8i16 inp0_m, inp1_m, res0_m, res1_m; \
    v16i8 dst0_m = { 0 }; \
    v16i8 dst1_m = { 0 }; \
    v16i8 zero_m = { 0 }; \
    \
    /* Terminated with ';' like every other invocation in this macro, so */ \
    /* the code does not depend on ILVR_D2_SH expanding to a bare block. */ \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
    INSERT_W2_SB(src0_m, src1_m, dst0_m); \
    INSERT_W2_SB(src2_m, src3_m, dst1_m); \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
    CLIP_SH2_0_255(res0_m, res1_m); \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
    ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
  }
18747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed with
                 128 to shift the range from signed to unsigned byte.
                 This is a statement expression whose value is the resulting
                 v16u8 vector. Both arguments are parenthesized before being
                 cast, so expression arguments are cast as a whole.
*/
#define PCKEV_XORI128_UB(in0, in1) \
  ({ \
    v16u8 out_m; \
    \
    out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
    out_m; \
  })
18907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                          pdst, stride
   Details     : (in0, in1) and (in2, in3) are each packed to unsigned bytes
                 with PCKEV_XORI128_UB, the dst vectors are combined pairwise
                 with ILVR_D2_UB, the two sets are averaged with AVER_UB2_UB
                 and the result is stored through ST8x4_UB. 'pdst' is
                 evaluated once, into a local uint8_t pointer.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
                                pdst, stride) \
  { \
    v16u8 tmp0_m, tmp1_m; \
    v16u8 tmp2_m, tmp3_m; \
    uint8_t *pdst_m = (uint8_t *)(pdst); \
    \
    /* Existing destination pixels, two rows per vector. */ \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
    \
    /* New pixels converted to unsigned bytes. */ \
    tmp0_m = PCKEV_XORI128_UB(in0, in1); \
    tmp1_m = PCKEV_XORI128_UB(in2, in3); \
    \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
  }
19077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : 'in0' and 'in1' are cast to v16i8 and combined with
                 __msa_pckev_b; the packed vector is written to 'pdst' with
                 ST_SB. All three arguments are parenthesized at their point
                 of use so expression arguments are handled as a whole.
*/
#define PCKEV_ST_SB(in0, in1, pdst) \
  { \
    v16i8 tmp_m; \
    \
    tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    ST_SB(tmp_m, (pdst)); \
  }
19187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword (v8u16)
   Details     : Bytes of 'in0' and 'in1' are shuffled according to 'mask'
                 (__msa_vshf_b), the shuffled bytes are combined with 'coeff'
                 by an unsigned dot product (__msa_dotp_u_h) and the halfword
                 sums are passed to __msa_srari_h with 'shift'.
                 This is a statement expression whose value is the filtered
                 v8u16 vector. Vector arguments are parenthesized before
                 being cast so expression arguments are cast as a whole.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
  ({ \
    v16i8 tmp0_m; \
    v8u16 tmp1_m; \
    \
    tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)); \
    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff)); \
    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
    \
    tmp1_m; \
  })
19327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
1933