/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
100a39d0a697ff3603e8c100300fda363658e10b23James Zern#include <stdlib.h>
117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1268e1c830ade592be74773e249bf94e2bbfb50de7Johann#include "./vpx_dsp_rtcd.h"
137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_dsp/mips/macros_msa.h"
147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1568e1c830ade592be74773e249bf94e2bbfb50de7Johannuint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  uint32_t sum_out;
177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  v4u32 sum = { 0 };
207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum0 += sum4;
277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum = __msa_hadd_u_w(sum0, sum0);
297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum = __msa_hadd_u_w(sum0, sum0);
317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum_out = __msa_copy_u_w((v4i32)sum, 0);
337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  return sum_out;
357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3768e1c830ade592be74773e249bf94e2bbfb50de7Johannuint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  uint32_t sum_out;
397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  uint32_t src0, src1, src2, src3;
407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  v16u8 vec = { 0 };
417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  v8u16 sum0;
427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  v4u32 sum1;
437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  v2u64 sum2;
447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  LW4(src, src_stride, src0, src1, src2, src3);
467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  INSERT_W4_UB(src0, src1, src2, src3, vec);
477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum0 = __msa_hadd_u_h(vec, vec);
497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum1 = __msa_hadd_u_w(sum0, sum0);
507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum1 = __msa_hadd_u_w(sum0, sum0);
527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum2 = __msa_hadd_u_d(sum1, sum1);
537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  sum_out = __msa_copy_u_w((v4i32)sum1, 0);
557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  return sum_out;
577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
580a39d0a697ff3603e8c100300fda363658e10b23James Zern
59df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride,
60df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                          int16_t *dst) {
610a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
620a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
630a39d0a697ff3603e8c100300fda363658e10b23James Zern
640a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
650a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
660a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
670a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
680a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
690a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
700a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp4, tmp5, tmp1, tmp6, tmp2);
710a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
720a39d0a697ff3603e8c100300fda363658e10b23James Zern                     src2, src3, src4, src5, src6, src7);
730a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
740a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
750a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
760a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
770a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
780a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp4, tmp5, tmp1, tmp6, tmp2);
790a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
800a39d0a697ff3603e8c100300fda363658e10b23James Zern                     src2, src3, src4, src5, src6, src7);
810a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
820a39d0a697ff3603e8c100300fda363658e10b23James Zern}
830a39d0a697ff3603e8c100300fda363658e10b23James Zern
84df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride,
85df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                            int16_t *dst) {
860a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
870a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
880a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
890a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
900a39d0a697ff3603e8c100300fda363658e10b23James Zern
910a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src0, src8);
920a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
930a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src1, src9);
940a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
950a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src2, src10);
960a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
970a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src3, src11);
980a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
990a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src4, src12);
1000a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1010a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src5, src13);
1020a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1030a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src6, src14);
1040a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1050a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src7, src15);
1060a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1070a39d0a697ff3603e8c100300fda363658e10b23James Zern
1080a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
1090a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
1100a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
1110a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
1120a39d0a697ff3603e8c100300fda363658e10b23James Zern
1130a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
1140a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
1150a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
1160a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp4, tmp5, tmp1, tmp6, tmp2);
1170a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
1180a39d0a697ff3603e8c100300fda363658e10b23James Zern                     src2, src3, src4, src5, src6, src7);
1190a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
1200a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
1210a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
1220a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
1230a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
1240a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp4, tmp5, tmp1, tmp6, tmp2);
1250a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
1260a39d0a697ff3603e8c100300fda363658e10b23James Zern                     src2, src11, src4, src5, src6, src7);
1270a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8);
1280a39d0a697ff3603e8c100300fda363658e10b23James Zern
1290a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
1300a39d0a697ff3603e8c100300fda363658e10b23James Zern              src12, src13, src15, src14, src11, src10);
1310a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
1320a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
1330a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
1340a39d0a697ff3603e8c100300fda363658e10b23James Zern                     src9, src10, src11, src12, src13, src14, src15);
1350a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
1360a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
1370a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
1380a39d0a697ff3603e8c100300fda363658e10b23James Zern              src12, src13, src15, src14, src11, src10);
1390a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
1400a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
1410a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
1420a39d0a697ff3603e8c100300fda363658e10b23James Zern                     res1, res2, res3, res4, res5, res6, res7);
1430a39d0a697ff3603e8c100300fda363658e10b23James Zern
1440a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src0, src8);
1450a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1460a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src1, src9);
1470a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1480a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src2, src10);
1490a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1500a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src3, src11);
1510a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1520a39d0a697ff3603e8c100300fda363658e10b23James Zern
1530a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8);
1540a39d0a697ff3603e8c100300fda363658e10b23James Zern
1550a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src4, src12);
1560a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1570a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src5, src13);
1580a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1590a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src6, src14);
1600a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1610a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH2(src, 8, src7, src15);
1620a39d0a697ff3603e8c100300fda363658e10b23James Zern  src += src_stride;
1630a39d0a697ff3603e8c100300fda363658e10b23James Zern
1640a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
1650a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
1660a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
1670a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
1680a39d0a697ff3603e8c100300fda363658e10b23James Zern
1690a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
1700a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
1710a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
1720a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp4, tmp5, tmp1, tmp6, tmp2);
1730a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
1740a39d0a697ff3603e8c100300fda363658e10b23James Zern                     src2, src3, src4, src5, src6, src7);
1750a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
1760a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
1770a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
1780a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
1790a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
1800a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp4, tmp5, tmp1, tmp6, tmp2);
1810a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
1820a39d0a697ff3603e8c100300fda363658e10b23James Zern                     src2, src3, src4, src5, src6, src7);
1830a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8);
1840a39d0a697ff3603e8c100300fda363658e10b23James Zern
1850a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
1860a39d0a697ff3603e8c100300fda363658e10b23James Zern              src12, src13, src15, src14, src11, src10);
1870a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
1880a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
1890a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
1900a39d0a697ff3603e8c100300fda363658e10b23James Zern                     src9, src10, src11, src12, src13, src14, src15);
1910a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
1920a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
1930a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
1940a39d0a697ff3603e8c100300fda363658e10b23James Zern              src12, src13, src15, src14, src11, src10);
1950a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
1960a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
1970a39d0a697ff3603e8c100300fda363658e10b23James Zern  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
1980a39d0a697ff3603e8c100300fda363658e10b23James Zern                     res1, res2, res3, res4, res5, res6, res7);
1990a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8);
2000a39d0a697ff3603e8c100300fda363658e10b23James Zern
2010a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH4(dst, 64, src0, src1, src2, src3);
2020a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
2030a39d0a697ff3603e8c100300fda363658e10b23James Zern
2040a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
2050a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
2060a39d0a697ff3603e8c100300fda363658e10b23James Zern  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
2070a39d0a697ff3603e8c100300fda363658e10b23James Zern  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
2080a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
2090a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
2100a39d0a697ff3603e8c100300fda363658e10b23James Zern
2110a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH4(src0, src1, src2, src3, dst, 64);
2120a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
2130a39d0a697ff3603e8c100300fda363658e10b23James Zern  dst += 16;
2140a39d0a697ff3603e8c100300fda363658e10b23James Zern
2150a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH4(dst, 64, src0, src1, src2, src3);
2160a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
2170a39d0a697ff3603e8c100300fda363658e10b23James Zern
2180a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
2190a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
2200a39d0a697ff3603e8c100300fda363658e10b23James Zern  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
2210a39d0a697ff3603e8c100300fda363658e10b23James Zern  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
2220a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
2230a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
2240a39d0a697ff3603e8c100300fda363658e10b23James Zern
2250a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH4(src0, src1, src2, src3, dst, 64);
2260a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
2270a39d0a697ff3603e8c100300fda363658e10b23James Zern  dst += 16;
2280a39d0a697ff3603e8c100300fda363658e10b23James Zern
2290a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH4(dst, 64, src0, src1, src2, src3);
2300a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
2310a39d0a697ff3603e8c100300fda363658e10b23James Zern
2320a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
2330a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
2340a39d0a697ff3603e8c100300fda363658e10b23James Zern  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
2350a39d0a697ff3603e8c100300fda363658e10b23James Zern  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
2360a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
2370a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
2380a39d0a697ff3603e8c100300fda363658e10b23James Zern
2390a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH4(src0, src1, src2, src3, dst, 64);
2400a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
2410a39d0a697ff3603e8c100300fda363658e10b23James Zern  dst += 16;
2420a39d0a697ff3603e8c100300fda363658e10b23James Zern
2430a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH4(dst, 64, src0, src1, src2, src3);
2440a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
2450a39d0a697ff3603e8c100300fda363658e10b23James Zern
2460a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
2470a39d0a697ff3603e8c100300fda363658e10b23James Zern              tmp6, tmp7, tmp5, tmp3, tmp1);
2480a39d0a697ff3603e8c100300fda363658e10b23James Zern  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
2490a39d0a697ff3603e8c100300fda363658e10b23James Zern  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
2500a39d0a697ff3603e8c100300fda363658e10b23James Zern  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
2510a39d0a697ff3603e8c100300fda363658e10b23James Zern              src5, src7, src6, src3, src2);
2520a39d0a697ff3603e8c100300fda363658e10b23James Zern
2530a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH4(src0, src1, src2, src3, dst, 64);
2540a39d0a697ff3603e8c100300fda363658e10b23James Zern  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
2550a39d0a697ff3603e8c100300fda363658e10b23James Zern}
2560a39d0a697ff3603e8c100300fda363658e10b23James Zern
2570a39d0a697ff3603e8c100300fda363658e10b23James Zernint vpx_satd_msa(const int16_t *data, int length) {
2580a39d0a697ff3603e8c100300fda363658e10b23James Zern  int i, satd;
2590a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
2600a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 src8, src9, src10, src11, src12, src13, src14, src15;
2610a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 zero = { 0 };
2620a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h;
2630a39d0a697ff3603e8c100300fda363658e10b23James Zern  v4u32 tmp0_w = { 0 };
2640a39d0a697ff3603e8c100300fda363658e10b23James Zern
2650a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (16 == length) {
2660a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH2(data, 8, src0, src1);
2670a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
2680a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
2690a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
2700a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
2710a39d0a697ff3603e8c100300fda363658e10b23James Zern    satd = HADD_UW_U32(tmp0_w);
2720a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (64 == length) {
2730a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
2740a39d0a697ff3603e8c100300fda363658e10b23James Zern
2750a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
2760a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
2770a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
2780a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
2790a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
2800a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
2810a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
2820a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
2830a39d0a697ff3603e8c100300fda363658e10b23James Zern
2840a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
2850a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
2860a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
2870a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
2880a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
2890a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
2900a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
2910a39d0a697ff3603e8c100300fda363658e10b23James Zern    tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
2920a39d0a697ff3603e8c100300fda363658e10b23James Zern
2930a39d0a697ff3603e8c100300fda363658e10b23James Zern    satd = HADD_UW_U32(tmp0_w);
2940a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (256 == length) {
2950a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < 2; ++i) {
2960a39d0a697ff3603e8c100300fda363658e10b23James Zern      LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
2970a39d0a697ff3603e8c100300fda363658e10b23James Zern      data += 8 * 8;
2980a39d0a697ff3603e8c100300fda363658e10b23James Zern      LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
2990a39d0a697ff3603e8c100300fda363658e10b23James Zern      data += 8 * 8;
3000a39d0a697ff3603e8c100300fda363658e10b23James Zern
3010a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
3020a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
3030a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
3040a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
3050a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
3060a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
3070a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
3080a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
3090a39d0a697ff3603e8c100300fda363658e10b23James Zern
3100a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
3110a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
3120a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
3130a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
3140a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
3150a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
3160a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
3170a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
3180a39d0a697ff3603e8c100300fda363658e10b23James Zern
3190a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
3200a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
3210a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
3220a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
3230a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
3240a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
3250a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
3260a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
3270a39d0a697ff3603e8c100300fda363658e10b23James Zern
3280a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
3290a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
3300a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
3310a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
3320a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
3330a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
3340a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
3350a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
3360a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
3370a39d0a697ff3603e8c100300fda363658e10b23James Zern
3380a39d0a697ff3603e8c100300fda363658e10b23James Zern    satd = HADD_UW_U32(tmp0_w);
3390a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (1024 == length) {
3400a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < 8; ++i) {
3410a39d0a697ff3603e8c100300fda363658e10b23James Zern      LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
3420a39d0a697ff3603e8c100300fda363658e10b23James Zern      data += 8 * 8;
3430a39d0a697ff3603e8c100300fda363658e10b23James Zern      LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
3440a39d0a697ff3603e8c100300fda363658e10b23James Zern      data += 8 * 8;
3450a39d0a697ff3603e8c100300fda363658e10b23James Zern
3460a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
3470a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
3480a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
3490a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
3500a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
3510a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
3520a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
3530a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
3540a39d0a697ff3603e8c100300fda363658e10b23James Zern
3550a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
3560a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
3570a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
3580a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
3590a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
3600a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
3610a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
3620a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
3630a39d0a697ff3603e8c100300fda363658e10b23James Zern
3640a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
3650a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
3660a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
3670a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
3680a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
3690a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
3700a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
3710a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
3720a39d0a697ff3603e8c100300fda363658e10b23James Zern
3730a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
3740a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
3750a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
3760a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
3770a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
3780a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
3790a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
3800a39d0a697ff3603e8c100300fda363658e10b23James Zern      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
3810a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
3820a39d0a697ff3603e8c100300fda363658e10b23James Zern
3830a39d0a697ff3603e8c100300fda363658e10b23James Zern    satd = HADD_UW_U32(tmp0_w);
3840a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
3850a39d0a697ff3603e8c100300fda363658e10b23James Zern    satd = 0;
3860a39d0a697ff3603e8c100300fda363658e10b23James Zern
3870a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < length; ++i) {
3880a39d0a697ff3603e8c100300fda363658e10b23James Zern      satd += abs(data[i]);
3890a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
3900a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
3910a39d0a697ff3603e8c100300fda363658e10b23James Zern
3920a39d0a697ff3603e8c100300fda363658e10b23James Zern  return satd;
3930a39d0a697ff3603e8c100300fda363658e10b23James Zern}
3940a39d0a697ff3603e8c100300fda363658e10b23James Zern
3950a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
3960a39d0a697ff3603e8c100300fda363658e10b23James Zern                         const int ref_stride, const int height) {
3970a39d0a697ff3603e8c100300fda363658e10b23James Zern  int i;
3980a39d0a697ff3603e8c100300fda363658e10b23James Zern  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
3990a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 hbuf_r = { 0 };
4000a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 hbuf_l = { 0 };
4010a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l;
4020a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l;
4030a39d0a697ff3603e8c100300fda363658e10b23James Zern
4040a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (16 == height) {
4050a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 2; i--;) {
4060a39d0a697ff3603e8c100300fda363658e10b23James Zern      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
4070a39d0a697ff3603e8c100300fda363658e10b23James Zern      ref += 8 * ref_stride;
4080a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
4090a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
4100a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
4110a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
4120a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
4130a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
4140a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
4150a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
4160a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
4170a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4180a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
4190a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4200a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
4210a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4220a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
4230a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4240a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
4250a39d0a697ff3603e8c100300fda363658e10b23James Zern
4260a39d0a697ff3603e8c100300fda363658e10b23James Zern    SRA_2V(hbuf_r, hbuf_l, 3);
4270a39d0a697ff3603e8c100300fda363658e10b23James Zern    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
4280a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (32 == height) {
4290a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 2; i--;) {
4300a39d0a697ff3603e8c100300fda363658e10b23James Zern      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
4310a39d0a697ff3603e8c100300fda363658e10b23James Zern      ref += 8 * ref_stride;
4320a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
4330a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
4340a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
4350a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
4360a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
4370a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
4380a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
4390a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
4400a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
4410a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4420a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
4430a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4440a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
4450a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4460a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
4470a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4480a39d0a697ff3603e8c100300fda363658e10b23James Zern      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
4490a39d0a697ff3603e8c100300fda363658e10b23James Zern      ref += 8 * ref_stride;
4500a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
4510a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
4520a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
4530a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
4540a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
4550a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
4560a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
4570a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
4580a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
4590a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4600a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
4610a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4620a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
4630a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4640a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
4650a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4660a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
4670a39d0a697ff3603e8c100300fda363658e10b23James Zern
4680a39d0a697ff3603e8c100300fda363658e10b23James Zern    SRA_2V(hbuf_r, hbuf_l, 4);
4690a39d0a697ff3603e8c100300fda363658e10b23James Zern    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
4700a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (64 == height) {
4710a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 4; i--;) {
4720a39d0a697ff3603e8c100300fda363658e10b23James Zern      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
4730a39d0a697ff3603e8c100300fda363658e10b23James Zern      ref += 8 * ref_stride;
4740a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
4750a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
4760a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
4770a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
4780a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
4790a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
4800a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
4810a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
4820a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
4830a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4840a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
4850a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4860a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
4870a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4880a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
4890a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
4900a39d0a697ff3603e8c100300fda363658e10b23James Zern      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
4910a39d0a697ff3603e8c100300fda363658e10b23James Zern      ref += 8 * ref_stride;
4920a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
4930a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
4940a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
4950a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
4960a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
4970a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
4980a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
4990a39d0a697ff3603e8c100300fda363658e10b23James Zern      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
5000a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
5010a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
5020a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
5030a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
5040a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
5050a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
5060a39d0a697ff3603e8c100300fda363658e10b23James Zern      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
5070a39d0a697ff3603e8c100300fda363658e10b23James Zern           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
5080a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
5090a39d0a697ff3603e8c100300fda363658e10b23James Zern
5100a39d0a697ff3603e8c100300fda363658e10b23James Zern    SRA_2V(hbuf_r, hbuf_l, 5);
5110a39d0a697ff3603e8c100300fda363658e10b23James Zern    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
5120a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
5130a39d0a697ff3603e8c100300fda363658e10b23James Zern    const int norm_factor = height >> 1;
5140a39d0a697ff3603e8c100300fda363658e10b23James Zern    int cnt;
5150a39d0a697ff3603e8c100300fda363658e10b23James Zern
5160a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (cnt = 0; cnt < 16; cnt++) {
5170a39d0a697ff3603e8c100300fda363658e10b23James Zern      hbuf[cnt] = 0;
5180a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
5190a39d0a697ff3603e8c100300fda363658e10b23James Zern
5200a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < height; ++i) {
5210a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (cnt = 0; cnt < 16; cnt++) {
5220a39d0a697ff3603e8c100300fda363658e10b23James Zern        hbuf[cnt] += ref[cnt];
5230a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
5240a39d0a697ff3603e8c100300fda363658e10b23James Zern
5250a39d0a697ff3603e8c100300fda363658e10b23James Zern      ref += ref_stride;
5260a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
5270a39d0a697ff3603e8c100300fda363658e10b23James Zern
5280a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (cnt = 0; cnt < 16; cnt++) {
5290a39d0a697ff3603e8c100300fda363658e10b23James Zern      hbuf[cnt] /= norm_factor;
5300a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
5310a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
5320a39d0a697ff3603e8c100300fda363658e10b23James Zern}
5330a39d0a697ff3603e8c100300fda363658e10b23James Zern
5340a39d0a697ff3603e8c100300fda363658e10b23James Zernint16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) {
5350a39d0a697ff3603e8c100300fda363658e10b23James Zern  int16_t sum;
5360a39d0a697ff3603e8c100300fda363658e10b23James Zern  v16u8 ref0, ref1, ref2, ref3;
5370a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8u16 ref0_h;
5380a39d0a697ff3603e8c100300fda363658e10b23James Zern
5390a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (16 == width) {
5400a39d0a697ff3603e8c100300fda363658e10b23James Zern    ref0 = LD_UB(ref);
5410a39d0a697ff3603e8c100300fda363658e10b23James Zern    ref0_h = __msa_hadd_u_h(ref0, ref0);
5420a39d0a697ff3603e8c100300fda363658e10b23James Zern    sum = HADD_UH_U32(ref0_h);
5430a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (32 == width) {
5440a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_UB2(ref, 16, ref0, ref1);
5450a39d0a697ff3603e8c100300fda363658e10b23James Zern    ref0_h = __msa_hadd_u_h(ref0, ref0);
5460a39d0a697ff3603e8c100300fda363658e10b23James Zern    ref0_h += __msa_hadd_u_h(ref1, ref1);
5470a39d0a697ff3603e8c100300fda363658e10b23James Zern    sum = HADD_UH_U32(ref0_h);
5480a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (64 == width) {
5490a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
5500a39d0a697ff3603e8c100300fda363658e10b23James Zern    ref0_h = __msa_hadd_u_h(ref0, ref0);
5510a39d0a697ff3603e8c100300fda363658e10b23James Zern    ref0_h += __msa_hadd_u_h(ref1, ref1);
5520a39d0a697ff3603e8c100300fda363658e10b23James Zern    ref0_h += __msa_hadd_u_h(ref2, ref2);
5530a39d0a697ff3603e8c100300fda363658e10b23James Zern    ref0_h += __msa_hadd_u_h(ref3, ref3);
5540a39d0a697ff3603e8c100300fda363658e10b23James Zern    sum = HADD_UH_U32(ref0_h);
5550a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
5560a39d0a697ff3603e8c100300fda363658e10b23James Zern    int idx;
5570a39d0a697ff3603e8c100300fda363658e10b23James Zern
5580a39d0a697ff3603e8c100300fda363658e10b23James Zern    sum = 0;
5590a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (idx = 0; idx < width; ++idx) {
5600a39d0a697ff3603e8c100300fda363658e10b23James Zern      sum += ref[idx];
5610a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
5620a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
5630a39d0a697ff3603e8c100300fda363658e10b23James Zern
5640a39d0a697ff3603e8c100300fda363658e10b23James Zern  return sum;
5650a39d0a697ff3603e8c100300fda363658e10b23James Zern}
5660a39d0a697ff3603e8c100300fda363658e10b23James Zern
5670a39d0a697ff3603e8c100300fda363658e10b23James Zernint vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) {
5680a39d0a697ff3603e8c100300fda363658e10b23James Zern  int sse, mean, var;
5690a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2;
5700a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m;
5710a39d0a697ff3603e8c100300fda363658e10b23James Zern  v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m;
5720a39d0a697ff3603e8c100300fda363658e10b23James Zern  v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m;
5730a39d0a697ff3603e8c100300fda363658e10b23James Zern  v4i32 res_l7_m, mean_v;
5740a39d0a697ff3603e8c100300fda363658e10b23James Zern  v2i64 sse_v;
5750a39d0a697ff3603e8c100300fda363658e10b23James Zern
5760a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (2 == bwl) {
5770a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH2(src, 8, src0, src1);
5780a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH2(ref, 8, ref0, ref1);
5790a39d0a697ff3603e8c100300fda363658e10b23James Zern
5800a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
5810a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
5820a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
5830a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
5840a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
5850a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
5860a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
5870a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v = res_l0_m + res_l1_m;
5880a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l2_m + res_l3_m;
5890a39d0a697ff3603e8c100300fda363658e10b23James Zern
5900a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse_v += __msa_splati_d(sse_v, 1);
5910a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse = __msa_copy_s_w((v4i32)sse_v, 0);
5920a39d0a697ff3603e8c100300fda363658e10b23James Zern
5930a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean = HADD_SW_S32(mean_v);
5940a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (3 == bwl) {
5950a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH4(src, 8, src0, src1, src2, src3);
5960a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH4(ref, 8, ref0, ref1, ref2, ref3);
5970a39d0a697ff3603e8c100300fda363658e10b23James Zern
5980a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
5990a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
6000a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
6010a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
6020a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
6030a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
6040a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
6050a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
6060a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
6070a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
6080a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
6090a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
6100a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
6110a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v = res_l0_m + res_l1_m;
6120a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l2_m + res_l3_m;
6130a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l4_m + res_l5_m;
6140a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l6_m + res_l7_m;
6150a39d0a697ff3603e8c100300fda363658e10b23James Zern
6160a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse_v += __msa_splati_d(sse_v, 1);
6170a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse = __msa_copy_s_w((v4i32)sse_v, 0);
6180a39d0a697ff3603e8c100300fda363658e10b23James Zern
6190a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean = HADD_SW_S32(mean_v);
6200a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (4 == bwl) {
6210a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
6220a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
6230a39d0a697ff3603e8c100300fda363658e10b23James Zern
6240a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
6250a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
6260a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
6270a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
6280a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
6290a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
6300a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
6310a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
6320a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
6330a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
6340a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
6350a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
6360a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
6370a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v = res_l0_m + res_l1_m;
6380a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l2_m + res_l3_m;
6390a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l4_m + res_l5_m;
6400a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l6_m + res_l7_m;
6410a39d0a697ff3603e8c100300fda363658e10b23James Zern
6420a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m);
6430a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m);
6440a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m);
6450a39d0a697ff3603e8c100300fda363658e10b23James Zern    ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m);
6460a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
6470a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
6480a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
6490a39d0a697ff3603e8c100300fda363658e10b23James Zern    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
6500a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v);
6510a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
6520a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
6530a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
6540a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l0_m + res_l1_m;
6550a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l2_m + res_l3_m;
6560a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l4_m + res_l5_m;
6570a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean_v += res_l6_m + res_l7_m;
6580a39d0a697ff3603e8c100300fda363658e10b23James Zern
6590a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse_v += __msa_splati_d(sse_v, 1);
6600a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse = __msa_copy_s_w((v4i32)sse_v, 0);
6610a39d0a697ff3603e8c100300fda363658e10b23James Zern
6620a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean = HADD_SW_S32(mean_v);
6630a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
6640a39d0a697ff3603e8c100300fda363658e10b23James Zern    int i;
6650a39d0a697ff3603e8c100300fda363658e10b23James Zern    const int width = 4 << bwl;
6660a39d0a697ff3603e8c100300fda363658e10b23James Zern
6670a39d0a697ff3603e8c100300fda363658e10b23James Zern    sse = 0;
6680a39d0a697ff3603e8c100300fda363658e10b23James Zern    mean = 0;
6690a39d0a697ff3603e8c100300fda363658e10b23James Zern
6700a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < width; ++i) {
6710a39d0a697ff3603e8c100300fda363658e10b23James Zern      const int diff = ref[i] - src[i];
6720a39d0a697ff3603e8c100300fda363658e10b23James Zern
6730a39d0a697ff3603e8c100300fda363658e10b23James Zern      mean += diff;
6740a39d0a697ff3603e8c100300fda363658e10b23James Zern      sse += diff * diff;
6750a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
6760a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
6770a39d0a697ff3603e8c100300fda363658e10b23James Zern
6780a39d0a697ff3603e8c100300fda363658e10b23James Zern  var = sse - ((mean * mean) >> (bwl + 2));
6790a39d0a697ff3603e8c100300fda363658e10b23James Zern
6800a39d0a697ff3603e8c100300fda363658e10b23James Zern  return var;
6810a39d0a697ff3603e8c100300fda363658e10b23James Zern}
6820a39d0a697ff3603e8c100300fda363658e10b23James Zern
/*
 * Finds the smallest and largest absolute byte difference between two
 * blocks and stores them through min and max.
 *
 *   s, p   - first block and the stride handed to LD_UB8 for it
 *   d, dp  - second block and the stride handed to LD_UB8 for it
 *   min    - receives lane 0 of the reduced minimum vector
 *   max    - receives lane 0 of the reduced maximum vector
 *
 * NOTE(review): LD_UB8 and PCKEV_D4_UB are macros from macros_msa.h and are
 * not visible here. Presumably LD_UB8 loads eight rows as 16-byte vectors
 * and PCKEV_D4_UB keeps the low 8 bytes of each row, packing two rows per
 * vector - confirm. If so, each row load touches 8 bytes more than the 8x8
 * block needs; confirm callers tolerate that read.
 */
6830a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp,
6840a39d0a697ff3603e8c100300fda363658e10b23James Zern                        int *min, int *max) {
6850a39d0a697ff3603e8c100300fda363658e10b23James Zern  v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7;
6860a39d0a697ff3603e8c100300fda363658e10b23James Zern  v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1;
6870a39d0a697ff3603e8c100300fda363658e10b23James Zern
  /* Load eight vectors from each block, then pack pairs of them so that
   * s0..s3 and d0..d3 carry the data (presumably two 8-byte rows each). */
6880a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
6890a39d0a697ff3603e8c100300fda363658e10b23James Zern  LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7);
6900a39d0a697ff3603e8c100300fda363658e10b23James Zern  PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3);
6910a39d0a697ff3603e8c100300fda363658e10b23James Zern  PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3);
6920a39d0a697ff3603e8c100300fda363658e10b23James Zern
  /* Per-byte unsigned absolute differences for the four packed vectors. */
6930a39d0a697ff3603e8c100300fda363658e10b23James Zern  diff0 = __msa_asub_u_b(s0, d0);
6940a39d0a697ff3603e8c100300fda363658e10b23James Zern  diff1 = __msa_asub_u_b(s1, d1);
6950a39d0a697ff3603e8c100300fda363658e10b23James Zern  diff2 = __msa_asub_u_b(s2, d2);
6960a39d0a697ff3603e8c100300fda363658e10b23James Zern  diff3 = __msa_asub_u_b(s3, d3);
6970a39d0a697ff3603e8c100300fda363658e10b23James Zern
  /* Lane-wise minimum across the four difference vectors. */
6980a39d0a697ff3603e8c100300fda363658e10b23James Zern  min0 = __msa_min_u_b(diff0, diff1);
6990a39d0a697ff3603e8c100300fda363658e10b23James Zern  min1 = __msa_min_u_b(diff2, diff3);
7000a39d0a697ff3603e8c100300fda363658e10b23James Zern  min0 = __msa_min_u_b(min0, min1);
7010a39d0a697ff3603e8c100300fda363658e10b23James Zern
  /* Lane-wise maximum across the four difference vectors. */
7020a39d0a697ff3603e8c100300fda363658e10b23James Zern  max0 = __msa_max_u_b(diff0, diff1);
7030a39d0a697ff3603e8c100300fda363658e10b23James Zern  max1 = __msa_max_u_b(diff2, diff3);
7040a39d0a697ff3603e8c100300fda363658e10b23James Zern  max0 = __msa_max_u_b(max0, max1);
7050a39d0a697ff3603e8c100300fda363658e10b23James Zern
  /* Horizontal reduction: slide the running vector by 8, 4, 2 and then 1
   * byte(s) and fold it back with min/max each time. Only lane 0 is read at
   * the end, so the statement order and the reuse of min1/max1 as the other
   * slide operand must stay exactly as written. */
7060a39d0a697ff3603e8c100300fda363658e10b23James Zern  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8);
7070a39d0a697ff3603e8c100300fda363658e10b23James Zern  min0 = __msa_min_u_b(min0, min1);
7080a39d0a697ff3603e8c100300fda363658e10b23James Zern  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8);
7090a39d0a697ff3603e8c100300fda363658e10b23James Zern  max0 = __msa_max_u_b(max0, max1);
7100a39d0a697ff3603e8c100300fda363658e10b23James Zern
7110a39d0a697ff3603e8c100300fda363658e10b23James Zern  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4);
7120a39d0a697ff3603e8c100300fda363658e10b23James Zern  min0 = __msa_min_u_b(min0, min1);
7130a39d0a697ff3603e8c100300fda363658e10b23James Zern  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4);
7140a39d0a697ff3603e8c100300fda363658e10b23James Zern  max0 = __msa_max_u_b(max0, max1);
7150a39d0a697ff3603e8c100300fda363658e10b23James Zern
7160a39d0a697ff3603e8c100300fda363658e10b23James Zern  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2);
7170a39d0a697ff3603e8c100300fda363658e10b23James Zern  min0 = __msa_min_u_b(min0, min1);
7180a39d0a697ff3603e8c100300fda363658e10b23James Zern  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2);
7190a39d0a697ff3603e8c100300fda363658e10b23James Zern  max0 = __msa_max_u_b(max0, max1);
7200a39d0a697ff3603e8c100300fda363658e10b23James Zern
7210a39d0a697ff3603e8c100300fda363658e10b23James Zern  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1);
7220a39d0a697ff3603e8c100300fda363658e10b23James Zern  min0 = __msa_min_u_b(min0, min1);
7230a39d0a697ff3603e8c100300fda363658e10b23James Zern  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1);
7240a39d0a697ff3603e8c100300fda363658e10b23James Zern  max0 = __msa_max_u_b(max0, max1);
7250a39d0a697ff3603e8c100300fda363658e10b23James Zern
  /* Lane 0 of each reduced vector is the result handed back to the caller. */
7260a39d0a697ff3603e8c100300fda363658e10b23James Zern  *min = min0[0];
7270a39d0a697ff3603e8c100300fda363658e10b23James Zern  *max = max0[0];
7280a39d0a697ff3603e8c100300fda363658e10b23James Zern}
729