17ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian/* 27ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 37ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * 47ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 57ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 67ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 77ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 87ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 97ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian */ 100a39d0a697ff3603e8c100300fda363658e10b23James Zern#include <stdlib.h> 117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1268e1c830ade592be74773e249bf94e2bbfb50de7Johann#include "./vpx_dsp_rtcd.h" 137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_dsp/mips/macros_msa.h" 147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1568e1c830ade592be74773e249bf94e2bbfb50de7Johannuint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { 167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint32_t sum_out; 177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; 197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian v4u32 sum = { 0 }; 207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); 237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); 247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); 257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ADD2(sum0, sum2, sum4, sum6, sum0, sum4); 267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum0 += sum4; 277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum = __msa_hadd_u_w(sum0, sum0); 297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); 307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum = __msa_hadd_u_w(sum0, sum0); 317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum = (v4u32)__msa_srari_w((v4i32)sum, 6); 327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum_out = __msa_copy_u_w((v4i32)sum, 0); 337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian return sum_out; 357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 3768e1c830ade592be74773e249bf94e2bbfb50de7Johannuint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { 387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint32_t sum_out; 397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint32_t src0, src1, src2, src3; 407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian v16u8 vec = { 0 }; 417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian v8u16 sum0; 427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian v4u32 sum1; 437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian v2u64 sum2; 447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian LW4(src, src_stride, src0, src1, src2, src3); 467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian INSERT_W4_UB(src0, src1, src2, src3, vec); 477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum0 = __msa_hadd_u_h(vec, vec); 497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum1 = __msa_hadd_u_w(sum0, sum0); 507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); 517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum1 = __msa_hadd_u_w(sum0, sum0); 527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum2 = __msa_hadd_u_d(sum1, sum1); 537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); 547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sum_out = __msa_copy_u_w((v4i32)sum1, 0); 557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian return sum_out; 577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 580a39d0a697ff3603e8c100300fda363658e10b23James Zern 59df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride, 60df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int16_t *dst) { 610a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 src0, src1, src2, src3, src4, src5, src6, src7; 620a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 630a39d0a697ff3603e8c100300fda363658e10b23James Zern 640a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); 650a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 660a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 670a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 680a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 690a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, 700a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4, tmp5, tmp1, tmp6, tmp2); 710a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, 720a39d0a697ff3603e8c100300fda363658e10b23James Zern src2, src3, src4, src5, src6, src7); 730a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 740a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 750a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 760a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 770a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, 780a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4, tmp5, tmp1, tmp6, tmp2); 790a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, 800a39d0a697ff3603e8c100300fda363658e10b23James Zern src2, src3, src4, src5, src6, src7); 810a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8); 820a39d0a697ff3603e8c100300fda363658e10b23James Zern} 830a39d0a697ff3603e8c100300fda363658e10b23James Zern 84df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride, 85df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int16_t *dst) { 860a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 870a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 880a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; 890a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 900a39d0a697ff3603e8c100300fda363658e10b23James Zern 910a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src0, src8); 920a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 930a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src1, src9); 940a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 950a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src2, src10); 960a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 970a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src3, src11); 980a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 990a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src4, src12); 1000a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1010a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src5, src13); 1020a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1030a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src6, src14); 1040a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1050a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src7, src15); 1060a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1070a39d0a697ff3603e8c100300fda363658e10b23James Zern 1080a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 1090a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 1100a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, 1110a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); 1120a39d0a697ff3603e8c100300fda363658e10b23James Zern 1130a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 1140a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 1150a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, 1160a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4, tmp5, tmp1, tmp6, tmp2); 1170a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, 1180a39d0a697ff3603e8c100300fda363658e10b23James Zern src2, src3, src4, src5, src6, src7); 1190a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 1200a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 1210a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 1220a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 1230a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, 1240a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4, tmp5, tmp1, tmp6, tmp2); 1250a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, 1260a39d0a697ff3603e8c100300fda363658e10b23James Zern src2, src11, src4, src5, src6, src7); 1270a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8); 1280a39d0a697ff3603e8c100300fda363658e10b23James Zern 1290a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, 1300a39d0a697ff3603e8c100300fda363658e10b23James Zern src12, src13, src15, src14, src11, src10); 1310a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, 1320a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); 1330a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, 1340a39d0a697ff3603e8c100300fda363658e10b23James Zern src9, src10, src11, src12, src13, src14, src15); 1350a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, 1360a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); 1370a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, 1380a39d0a697ff3603e8c100300fda363658e10b23James Zern src12, src13, src15, src14, src11, src10); 1390a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, 1400a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); 1410a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, 1420a39d0a697ff3603e8c100300fda363658e10b23James Zern res1, res2, res3, res4, res5, res6, res7); 1430a39d0a697ff3603e8c100300fda363658e10b23James Zern 1440a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src0, src8); 1450a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1460a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src1, src9); 1470a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1480a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src2, src10); 1490a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1500a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src3, src11); 1510a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1520a39d0a697ff3603e8c100300fda363658e10b23James Zern 1530a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8); 1540a39d0a697ff3603e8c100300fda363658e10b23James Zern 1550a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src4, src12); 1560a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1570a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src5, src13); 1580a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1590a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src6, src14); 1600a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1610a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src7, src15); 1620a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1630a39d0a697ff3603e8c100300fda363658e10b23James Zern 1640a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 1650a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 1660a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, 1670a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); 1680a39d0a697ff3603e8c100300fda363658e10b23James Zern 1690a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 1700a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 1710a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, 1720a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4, tmp5, tmp1, tmp6, tmp2); 1730a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, 1740a39d0a697ff3603e8c100300fda363658e10b23James Zern src2, src3, src4, src5, src6, src7); 1750a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 1760a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 1770a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 1780a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 1790a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, 1800a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4, tmp5, tmp1, tmp6, tmp2); 1810a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, 1820a39d0a697ff3603e8c100300fda363658e10b23James Zern src2, src3, src4, src5, src6, src7); 1830a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8); 1840a39d0a697ff3603e8c100300fda363658e10b23James Zern 1850a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, 1860a39d0a697ff3603e8c100300fda363658e10b23James Zern src12, src13, src15, src14, src11, src10); 1870a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, 1880a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); 1890a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, 1900a39d0a697ff3603e8c100300fda363658e10b23James Zern src9, src10, src11, src12, src13, src14, src15); 1910a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, 1920a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); 1930a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, 1940a39d0a697ff3603e8c100300fda363658e10b23James Zern src12, src13, src15, src14, src11, src10); 1950a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, 1960a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); 1970a39d0a697ff3603e8c100300fda363658e10b23James Zern TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, 1980a39d0a697ff3603e8c100300fda363658e10b23James Zern res1, res2, res3, res4, res5, res6, res7); 1990a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8); 2000a39d0a697ff3603e8c100300fda363658e10b23James Zern 2010a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(dst, 64, src0, src1, src2, src3); 2020a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(dst + 8, 64, src4, src5, src6, src7); 2030a39d0a697ff3603e8c100300fda363658e10b23James Zern 2040a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 2050a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 2060a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); 2070a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); 2080a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 2090a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 2100a39d0a697ff3603e8c100300fda363658e10b23James Zern 2110a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH4(src0, src1, src2, src3, dst, 64); 2120a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH4(src4, src5, src6, src7, dst + 8, 64); 2130a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += 16; 2140a39d0a697ff3603e8c100300fda363658e10b23James Zern 2150a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(dst, 64, src0, src1, src2, src3); 2160a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(dst + 8, 64, src4, src5, src6, src7); 2170a39d0a697ff3603e8c100300fda363658e10b23James Zern 2180a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 2190a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 2200a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); 2210a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); 2220a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 2230a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 2240a39d0a697ff3603e8c100300fda363658e10b23James Zern 2250a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH4(src0, src1, src2, src3, dst, 64); 2260a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH4(src4, src5, src6, src7, dst + 8, 64); 2270a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += 16; 2280a39d0a697ff3603e8c100300fda363658e10b23James Zern 2290a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(dst, 64, src0, src1, src2, src3); 2300a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(dst + 8, 64, src4, src5, src6, src7); 2310a39d0a697ff3603e8c100300fda363658e10b23James Zern 2320a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 2330a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 2340a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); 2350a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); 2360a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 2370a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 2380a39d0a697ff3603e8c100300fda363658e10b23James Zern 2390a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH4(src0, src1, src2, src3, dst, 64); 2400a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH4(src4, src5, src6, src7, dst + 8, 64); 2410a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += 16; 2420a39d0a697ff3603e8c100300fda363658e10b23James Zern 2430a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(dst, 64, src0, src1, src2, src3); 2440a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(dst + 8, 64, src4, src5, src6, src7); 2450a39d0a697ff3603e8c100300fda363658e10b23James Zern 2460a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, 2470a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6, tmp7, tmp5, tmp3, tmp1); 2480a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); 2490a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); 2500a39d0a697ff3603e8c100300fda363658e10b23James Zern BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, 2510a39d0a697ff3603e8c100300fda363658e10b23James Zern src5, src7, src6, src3, src2); 2520a39d0a697ff3603e8c100300fda363658e10b23James Zern 2530a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH4(src0, src1, src2, src3, dst, 64); 2540a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH4(src4, src5, src6, src7, dst + 8, 64); 2550a39d0a697ff3603e8c100300fda363658e10b23James Zern} 2560a39d0a697ff3603e8c100300fda363658e10b23James Zern 2570a39d0a697ff3603e8c100300fda363658e10b23James Zernint vpx_satd_msa(const int16_t *data, int length) { 2580a39d0a697ff3603e8c100300fda363658e10b23James Zern int i, satd; 2590a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 src0, src1, src2, src3, src4, src5, src6, src7; 2600a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 src8, src9, src10, src11, src12, src13, src14, src15; 2610a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 zero = { 0 }; 2620a39d0a697ff3603e8c100300fda363658e10b23James Zern v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h; 2630a39d0a697ff3603e8c100300fda363658e10b23James Zern v4u32 tmp0_w = { 0 }; 2640a39d0a697ff3603e8c100300fda363658e10b23James Zern 2650a39d0a697ff3603e8c100300fda363658e10b23James Zern if (16 == length) { 2660a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(data, 8, src0, src1); 2670a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); 2680a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); 2690a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); 2700a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); 2710a39d0a697ff3603e8c100300fda363658e10b23James Zern satd = HADD_UW_U32(tmp0_w); 2720a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (64 == length) { 2730a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); 2740a39d0a697ff3603e8c100300fda363658e10b23James Zern 2750a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); 2760a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); 2770a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); 2780a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); 2790a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); 2800a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); 2810a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); 2820a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); 2830a39d0a697ff3603e8c100300fda363658e10b23James Zern 2840a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); 2850a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); 2860a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); 2870a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); 2880a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); 2890a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); 2900a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); 2910a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); 2920a39d0a697ff3603e8c100300fda363658e10b23James Zern 2930a39d0a697ff3603e8c100300fda363658e10b23James Zern satd = HADD_UW_U32(tmp0_w); 2940a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (256 == length) { 2950a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 2; ++i) { 2960a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); 2970a39d0a697ff3603e8c100300fda363658e10b23James Zern data += 8 * 8; 2980a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); 2990a39d0a697ff3603e8c100300fda363658e10b23James Zern data += 8 * 8; 3000a39d0a697ff3603e8c100300fda363658e10b23James Zern 3010a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); 3020a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); 3030a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); 3040a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); 3050a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); 3060a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); 3070a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); 3080a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); 3090a39d0a697ff3603e8c100300fda363658e10b23James Zern 3100a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); 3110a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); 3120a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); 3130a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); 3140a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); 3150a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); 3160a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); 3170a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); 3180a39d0a697ff3603e8c100300fda363658e10b23James Zern 3190a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); 3200a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); 3210a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); 3220a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); 3230a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); 3240a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); 3250a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); 3260a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); 3270a39d0a697ff3603e8c100300fda363658e10b23James Zern 3280a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); 3290a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); 3300a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); 3310a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); 3320a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); 3330a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); 3340a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); 3350a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); 3360a39d0a697ff3603e8c100300fda363658e10b23James Zern } 3370a39d0a697ff3603e8c100300fda363658e10b23James Zern 3380a39d0a697ff3603e8c100300fda363658e10b23James Zern satd = HADD_UW_U32(tmp0_w); 3390a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (1024 == length) { 3400a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 8; ++i) { 3410a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); 3420a39d0a697ff3603e8c100300fda363658e10b23James Zern data += 8 * 8; 3430a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); 3440a39d0a697ff3603e8c100300fda363658e10b23James Zern data += 8 * 8; 3450a39d0a697ff3603e8c100300fda363658e10b23James Zern 3460a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); 3470a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); 3480a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); 3490a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); 3500a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); 3510a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); 3520a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); 3530a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); 3540a39d0a697ff3603e8c100300fda363658e10b23James Zern 3550a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); 3560a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); 3570a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); 3580a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); 3590a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); 3600a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); 3610a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); 3620a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); 3630a39d0a697ff3603e8c100300fda363658e10b23James Zern 3640a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); 3650a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); 3660a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); 3670a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); 3680a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); 3690a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); 3700a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); 3710a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); 3720a39d0a697ff3603e8c100300fda363658e10b23James Zern 3730a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); 3740a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); 3750a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); 3760a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); 3770a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); 3780a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); 3790a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); 3800a39d0a697ff3603e8c100300fda363658e10b23James Zern tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); 3810a39d0a697ff3603e8c100300fda363658e10b23James Zern } 3820a39d0a697ff3603e8c100300fda363658e10b23James Zern 3830a39d0a697ff3603e8c100300fda363658e10b23James Zern satd = HADD_UW_U32(tmp0_w); 3840a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 3850a39d0a697ff3603e8c100300fda363658e10b23James Zern satd = 0; 3860a39d0a697ff3603e8c100300fda363658e10b23James Zern 3870a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < length; ++i) { 3880a39d0a697ff3603e8c100300fda363658e10b23James Zern satd += abs(data[i]); 3890a39d0a697ff3603e8c100300fda363658e10b23James Zern } 3900a39d0a697ff3603e8c100300fda363658e10b23James Zern } 3910a39d0a697ff3603e8c100300fda363658e10b23James Zern 3920a39d0a697ff3603e8c100300fda363658e10b23James Zern return satd; 3930a39d0a697ff3603e8c100300fda363658e10b23James Zern} 3940a39d0a697ff3603e8c100300fda363658e10b23James Zern 3950a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref, 3960a39d0a697ff3603e8c100300fda363658e10b23James Zern const int ref_stride, const int height) { 3970a39d0a697ff3603e8c100300fda363658e10b23James Zern int i; 3980a39d0a697ff3603e8c100300fda363658e10b23James Zern v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; 3990a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 hbuf_r = { 0 }; 4000a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 hbuf_l = { 0 }; 4010a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l; 4020a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l; 4030a39d0a697ff3603e8c100300fda363658e10b23James Zern 4040a39d0a697ff3603e8c100300fda363658e10b23James Zern if (16 == height) { 4050a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 2; i--;) { 4060a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 4070a39d0a697ff3603e8c100300fda363658e10b23James Zern ref += 8 * ref_stride; 4080a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref0, ref0_r, ref0_l); 4090a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref1, ref1_r, ref1_l); 4100a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref2, ref2_r, ref2_l); 4110a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref3, ref3_r, ref3_l); 4120a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref4, ref4_r, ref4_l); 4130a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref5, ref5_r, ref5_l); 4140a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref6, ref6_r, ref6_l); 4150a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref7, ref7_r, ref7_l); 4160a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, 4170a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4180a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, 4190a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4200a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, 4210a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4220a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, 4230a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4240a39d0a697ff3603e8c100300fda363658e10b23James Zern } 4250a39d0a697ff3603e8c100300fda363658e10b23James Zern 4260a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_2V(hbuf_r, hbuf_l, 3); 4270a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH2(hbuf_r, hbuf_l, hbuf, 8); 4280a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (32 == height) { 4290a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 2; i--;) { 4300a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 4310a39d0a697ff3603e8c100300fda363658e10b23James Zern ref += 8 * ref_stride; 4320a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref0, ref0_r, ref0_l); 4330a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref1, ref1_r, ref1_l); 4340a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref2, ref2_r, ref2_l); 4350a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref3, ref3_r, ref3_l); 4360a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref4, ref4_r, ref4_l); 4370a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref5, ref5_r, ref5_l); 4380a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref6, ref6_r, ref6_l); 4390a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref7, ref7_r, ref7_l); 4400a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, 4410a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4420a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, 4430a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4440a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, 4450a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4460a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, 4470a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4480a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 4490a39d0a697ff3603e8c100300fda363658e10b23James Zern ref += 8 * ref_stride; 4500a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref0, ref0_r, ref0_l); 4510a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref1, ref1_r, ref1_l); 4520a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref2, ref2_r, ref2_l); 4530a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref3, ref3_r, ref3_l); 4540a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref4, ref4_r, ref4_l); 4550a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref5, ref5_r, ref5_l); 4560a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref6, ref6_r, ref6_l); 4570a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref7, ref7_r, ref7_l); 4580a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, 4590a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4600a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, 4610a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4620a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, 4630a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4640a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, 4650a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4660a39d0a697ff3603e8c100300fda363658e10b23James Zern } 4670a39d0a697ff3603e8c100300fda363658e10b23James Zern 4680a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_2V(hbuf_r, hbuf_l, 4); 4690a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH2(hbuf_r, hbuf_l, hbuf, 8); 4700a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (64 == height) { 4710a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 4; i--;) { 4720a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 4730a39d0a697ff3603e8c100300fda363658e10b23James Zern ref += 8 * ref_stride; 4740a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref0, ref0_r, ref0_l); 4750a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref1, ref1_r, ref1_l); 4760a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref2, ref2_r, ref2_l); 4770a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref3, ref3_r, ref3_l); 4780a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref4, ref4_r, ref4_l); 4790a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref5, ref5_r, ref5_l); 4800a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref6, ref6_r, ref6_l); 4810a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref7, ref7_r, ref7_l); 4820a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, 4830a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4840a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, 4850a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4860a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, 4870a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4880a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, 4890a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 4900a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 4910a39d0a697ff3603e8c100300fda363658e10b23James Zern ref += 8 * ref_stride; 4920a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref0, ref0_r, ref0_l); 4930a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref1, ref1_r, ref1_l); 4940a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref2, ref2_r, ref2_l); 4950a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref3, ref3_r, ref3_l); 4960a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref4, ref4_r, ref4_l); 4970a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref5, ref5_r, ref5_l); 4980a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref6, ref6_r, ref6_l); 4990a39d0a697ff3603e8c100300fda363658e10b23James Zern UNPCK_UB_SH(ref7, ref7_r, ref7_l); 5000a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, 5010a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 5020a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, 5030a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 5040a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, 5050a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 5060a39d0a697ff3603e8c100300fda363658e10b23James Zern ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, 5070a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf_r, hbuf_l, hbuf_r, hbuf_l); 5080a39d0a697ff3603e8c100300fda363658e10b23James Zern } 5090a39d0a697ff3603e8c100300fda363658e10b23James Zern 5100a39d0a697ff3603e8c100300fda363658e10b23James Zern SRA_2V(hbuf_r, hbuf_l, 5); 5110a39d0a697ff3603e8c100300fda363658e10b23James Zern ST_SH2(hbuf_r, hbuf_l, hbuf, 8); 5120a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 5130a39d0a697ff3603e8c100300fda363658e10b23James Zern const int norm_factor = height >> 1; 5140a39d0a697ff3603e8c100300fda363658e10b23James Zern int cnt; 5150a39d0a697ff3603e8c100300fda363658e10b23James Zern 5160a39d0a697ff3603e8c100300fda363658e10b23James Zern for (cnt = 0; cnt < 16; cnt++) { 5170a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf[cnt] = 0; 5180a39d0a697ff3603e8c100300fda363658e10b23James Zern } 5190a39d0a697ff3603e8c100300fda363658e10b23James Zern 5200a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < height; ++i) { 5210a39d0a697ff3603e8c100300fda363658e10b23James Zern for (cnt = 0; cnt < 16; cnt++) { 5220a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf[cnt] += ref[cnt]; 5230a39d0a697ff3603e8c100300fda363658e10b23James Zern } 5240a39d0a697ff3603e8c100300fda363658e10b23James Zern 5250a39d0a697ff3603e8c100300fda363658e10b23James Zern ref += ref_stride; 5260a39d0a697ff3603e8c100300fda363658e10b23James Zern } 5270a39d0a697ff3603e8c100300fda363658e10b23James Zern 5280a39d0a697ff3603e8c100300fda363658e10b23James Zern for (cnt = 0; cnt < 16; cnt++) { 5290a39d0a697ff3603e8c100300fda363658e10b23James Zern hbuf[cnt] /= norm_factor; 5300a39d0a697ff3603e8c100300fda363658e10b23James Zern } 5310a39d0a697ff3603e8c100300fda363658e10b23James Zern } 5320a39d0a697ff3603e8c100300fda363658e10b23James Zern} 5330a39d0a697ff3603e8c100300fda363658e10b23James Zern 5340a39d0a697ff3603e8c100300fda363658e10b23James Zernint16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) { 5350a39d0a697ff3603e8c100300fda363658e10b23James Zern int16_t sum; 5360a39d0a697ff3603e8c100300fda363658e10b23James Zern v16u8 ref0, ref1, ref2, ref3; 5370a39d0a697ff3603e8c100300fda363658e10b23James Zern v8u16 ref0_h; 5380a39d0a697ff3603e8c100300fda363658e10b23James Zern 5390a39d0a697ff3603e8c100300fda363658e10b23James Zern if (16 == width) { 5400a39d0a697ff3603e8c100300fda363658e10b23James Zern ref0 = LD_UB(ref); 5410a39d0a697ff3603e8c100300fda363658e10b23James Zern ref0_h = __msa_hadd_u_h(ref0, ref0); 5420a39d0a697ff3603e8c100300fda363658e10b23James Zern sum = HADD_UH_U32(ref0_h); 5430a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (32 == width) { 5440a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_UB2(ref, 16, ref0, ref1); 5450a39d0a697ff3603e8c100300fda363658e10b23James Zern ref0_h = __msa_hadd_u_h(ref0, ref0); 5460a39d0a697ff3603e8c100300fda363658e10b23James Zern ref0_h += __msa_hadd_u_h(ref1, ref1); 5470a39d0a697ff3603e8c100300fda363658e10b23James Zern sum = HADD_UH_U32(ref0_h); 5480a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (64 == width) { 5490a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_UB4(ref, 16, ref0, ref1, ref2, ref3); 5500a39d0a697ff3603e8c100300fda363658e10b23James Zern ref0_h = __msa_hadd_u_h(ref0, ref0); 5510a39d0a697ff3603e8c100300fda363658e10b23James Zern ref0_h += __msa_hadd_u_h(ref1, ref1); 5520a39d0a697ff3603e8c100300fda363658e10b23James Zern ref0_h += __msa_hadd_u_h(ref2, ref2); 5530a39d0a697ff3603e8c100300fda363658e10b23James Zern ref0_h += __msa_hadd_u_h(ref3, ref3); 5540a39d0a697ff3603e8c100300fda363658e10b23James Zern sum = HADD_UH_U32(ref0_h); 5550a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 5560a39d0a697ff3603e8c100300fda363658e10b23James Zern int idx; 5570a39d0a697ff3603e8c100300fda363658e10b23James Zern 5580a39d0a697ff3603e8c100300fda363658e10b23James Zern sum = 0; 5590a39d0a697ff3603e8c100300fda363658e10b23James Zern for (idx = 0; idx < width; ++idx) { 5600a39d0a697ff3603e8c100300fda363658e10b23James Zern sum += ref[idx]; 5610a39d0a697ff3603e8c100300fda363658e10b23James Zern } 5620a39d0a697ff3603e8c100300fda363658e10b23James Zern } 5630a39d0a697ff3603e8c100300fda363658e10b23James Zern 5640a39d0a697ff3603e8c100300fda363658e10b23James Zern return sum; 5650a39d0a697ff3603e8c100300fda363658e10b23James Zern} 5660a39d0a697ff3603e8c100300fda363658e10b23James Zern 5670a39d0a697ff3603e8c100300fda363658e10b23James Zernint vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) { 5680a39d0a697ff3603e8c100300fda363658e10b23James Zern int sse, mean, var; 5690a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2; 5700a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m; 5710a39d0a697ff3603e8c100300fda363658e10b23James Zern v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m; 5720a39d0a697ff3603e8c100300fda363658e10b23James Zern v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m; 5730a39d0a697ff3603e8c100300fda363658e10b23James Zern v4i32 res_l7_m, mean_v; 5740a39d0a697ff3603e8c100300fda363658e10b23James Zern v2i64 sse_v; 5750a39d0a697ff3603e8c100300fda363658e10b23James Zern 5760a39d0a697ff3603e8c100300fda363658e10b23James Zern if (2 == bwl) { 5770a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(src, 8, src0, src1); 5780a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH2(ref, 8, ref0, ref1); 5790a39d0a697ff3603e8c100300fda363658e10b23James Zern 5800a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); 5810a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); 5820a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); 5830a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); 5840a39d0a697ff3603e8c100300fda363658e10b23James Zern sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); 5850a39d0a697ff3603e8c100300fda363658e10b23James Zern sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); 5860a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); 5870a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v = res_l0_m + res_l1_m; 5880a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l2_m + res_l3_m; 5890a39d0a697ff3603e8c100300fda363658e10b23James Zern 5900a39d0a697ff3603e8c100300fda363658e10b23James Zern sse_v += __msa_splati_d(sse_v, 1); 5910a39d0a697ff3603e8c100300fda363658e10b23James Zern sse = __msa_copy_s_w((v4i32)sse_v, 0); 5920a39d0a697ff3603e8c100300fda363658e10b23James Zern 5930a39d0a697ff3603e8c100300fda363658e10b23James Zern mean = HADD_SW_S32(mean_v); 5940a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (3 == bwl) { 5950a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(src, 8, src0, src1, src2, src3); 5960a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH4(ref, 8, ref0, ref1, ref2, ref3); 5970a39d0a697ff3603e8c100300fda363658e10b23James Zern 5980a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); 5990a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); 6000a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); 6010a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); 6020a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); 6030a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); 6040a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); 6050a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); 6060a39d0a697ff3603e8c100300fda363658e10b23James Zern sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); 6070a39d0a697ff3603e8c100300fda363658e10b23James Zern sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); 6080a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); 6090a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); 6100a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); 6110a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v = res_l0_m + res_l1_m; 6120a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l2_m + res_l3_m; 6130a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l4_m + res_l5_m; 6140a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l6_m + res_l7_m; 6150a39d0a697ff3603e8c100300fda363658e10b23James Zern 6160a39d0a697ff3603e8c100300fda363658e10b23James Zern sse_v += __msa_splati_d(sse_v, 1); 6170a39d0a697ff3603e8c100300fda363658e10b23James Zern sse = __msa_copy_s_w((v4i32)sse_v, 0); 6180a39d0a697ff3603e8c100300fda363658e10b23James Zern 6190a39d0a697ff3603e8c100300fda363658e10b23James Zern mean = HADD_SW_S32(mean_v); 6200a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (4 == bwl) { 6210a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7); 6220a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 6230a39d0a697ff3603e8c100300fda363658e10b23James Zern 6240a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); 6250a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); 6260a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); 6270a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); 6280a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); 6290a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); 6300a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); 6310a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); 6320a39d0a697ff3603e8c100300fda363658e10b23James Zern sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); 6330a39d0a697ff3603e8c100300fda363658e10b23James Zern sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); 6340a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); 6350a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); 6360a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); 6370a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v = res_l0_m + res_l1_m; 6380a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l2_m + res_l3_m; 6390a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l4_m + res_l5_m; 6400a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l6_m + res_l7_m; 6410a39d0a697ff3603e8c100300fda363658e10b23James Zern 6420a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m); 6430a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m); 6440a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m); 6450a39d0a697ff3603e8c100300fda363658e10b23James Zern ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m); 6460a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); 6470a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); 6480a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); 6490a39d0a697ff3603e8c100300fda363658e10b23James Zern HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); 6500a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v); 6510a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); 6520a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); 6530a39d0a697ff3603e8c100300fda363658e10b23James Zern DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); 6540a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l0_m + res_l1_m; 6550a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l2_m + res_l3_m; 6560a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l4_m + res_l5_m; 6570a39d0a697ff3603e8c100300fda363658e10b23James Zern mean_v += res_l6_m + res_l7_m; 6580a39d0a697ff3603e8c100300fda363658e10b23James Zern 6590a39d0a697ff3603e8c100300fda363658e10b23James Zern sse_v += __msa_splati_d(sse_v, 1); 6600a39d0a697ff3603e8c100300fda363658e10b23James Zern sse = __msa_copy_s_w((v4i32)sse_v, 0); 6610a39d0a697ff3603e8c100300fda363658e10b23James Zern 6620a39d0a697ff3603e8c100300fda363658e10b23James Zern mean = HADD_SW_S32(mean_v); 6630a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 6640a39d0a697ff3603e8c100300fda363658e10b23James Zern int i; 6650a39d0a697ff3603e8c100300fda363658e10b23James Zern const int width = 4 << bwl; 6660a39d0a697ff3603e8c100300fda363658e10b23James Zern 6670a39d0a697ff3603e8c100300fda363658e10b23James Zern sse = 0; 6680a39d0a697ff3603e8c100300fda363658e10b23James Zern mean = 0; 6690a39d0a697ff3603e8c100300fda363658e10b23James Zern 6700a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < width; ++i) { 6710a39d0a697ff3603e8c100300fda363658e10b23James Zern const int diff = ref[i] - src[i]; 6720a39d0a697ff3603e8c100300fda363658e10b23James Zern 6730a39d0a697ff3603e8c100300fda363658e10b23James Zern mean += diff; 6740a39d0a697ff3603e8c100300fda363658e10b23James Zern sse += diff * diff; 6750a39d0a697ff3603e8c100300fda363658e10b23James Zern } 6760a39d0a697ff3603e8c100300fda363658e10b23James Zern } 6770a39d0a697ff3603e8c100300fda363658e10b23James Zern 6780a39d0a697ff3603e8c100300fda363658e10b23James Zern var = sse - ((mean * mean) >> (bwl + 2)); 6790a39d0a697ff3603e8c100300fda363658e10b23James Zern 6800a39d0a697ff3603e8c100300fda363658e10b23James Zern return var; 6810a39d0a697ff3603e8c100300fda363658e10b23James Zern} 6820a39d0a697ff3603e8c100300fda363658e10b23James Zern 6830a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp, 6840a39d0a697ff3603e8c100300fda363658e10b23James Zern int *min, int *max) { 6850a39d0a697ff3603e8c100300fda363658e10b23James Zern v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7; 6860a39d0a697ff3603e8c100300fda363658e10b23James Zern v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1; 6870a39d0a697ff3603e8c100300fda363658e10b23James Zern 6880a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); 6890a39d0a697ff3603e8c100300fda363658e10b23James Zern LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7); 6900a39d0a697ff3603e8c100300fda363658e10b23James Zern PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3); 6910a39d0a697ff3603e8c100300fda363658e10b23James Zern PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3); 6920a39d0a697ff3603e8c100300fda363658e10b23James Zern 6930a39d0a697ff3603e8c100300fda363658e10b23James Zern diff0 = __msa_asub_u_b(s0, d0); 6940a39d0a697ff3603e8c100300fda363658e10b23James Zern diff1 = __msa_asub_u_b(s1, d1); 6950a39d0a697ff3603e8c100300fda363658e10b23James Zern diff2 = __msa_asub_u_b(s2, d2); 6960a39d0a697ff3603e8c100300fda363658e10b23James Zern diff3 = __msa_asub_u_b(s3, d3); 6970a39d0a697ff3603e8c100300fda363658e10b23James Zern 6980a39d0a697ff3603e8c100300fda363658e10b23James Zern min0 = __msa_min_u_b(diff0, diff1); 6990a39d0a697ff3603e8c100300fda363658e10b23James Zern min1 = __msa_min_u_b(diff2, diff3); 7000a39d0a697ff3603e8c100300fda363658e10b23James Zern min0 = __msa_min_u_b(min0, min1); 7010a39d0a697ff3603e8c100300fda363658e10b23James Zern 7020a39d0a697ff3603e8c100300fda363658e10b23James Zern max0 = __msa_max_u_b(diff0, diff1); 7030a39d0a697ff3603e8c100300fda363658e10b23James Zern max1 = __msa_max_u_b(diff2, diff3); 7040a39d0a697ff3603e8c100300fda363658e10b23James Zern max0 = __msa_max_u_b(max0, max1); 7050a39d0a697ff3603e8c100300fda363658e10b23James Zern 7060a39d0a697ff3603e8c100300fda363658e10b23James Zern min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8); 7070a39d0a697ff3603e8c100300fda363658e10b23James Zern min0 = __msa_min_u_b(min0, min1); 7080a39d0a697ff3603e8c100300fda363658e10b23James Zern max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8); 7090a39d0a697ff3603e8c100300fda363658e10b23James Zern max0 = __msa_max_u_b(max0, max1); 7100a39d0a697ff3603e8c100300fda363658e10b23James Zern 7110a39d0a697ff3603e8c100300fda363658e10b23James Zern min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4); 7120a39d0a697ff3603e8c100300fda363658e10b23James Zern min0 = __msa_min_u_b(min0, min1); 7130a39d0a697ff3603e8c100300fda363658e10b23James Zern max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4); 7140a39d0a697ff3603e8c100300fda363658e10b23James Zern max0 = __msa_max_u_b(max0, max1); 7150a39d0a697ff3603e8c100300fda363658e10b23James Zern 7160a39d0a697ff3603e8c100300fda363658e10b23James Zern min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2); 7170a39d0a697ff3603e8c100300fda363658e10b23James Zern min0 = __msa_min_u_b(min0, min1); 7180a39d0a697ff3603e8c100300fda363658e10b23James Zern max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2); 7190a39d0a697ff3603e8c100300fda363658e10b23James Zern max0 = __msa_max_u_b(max0, max1); 7200a39d0a697ff3603e8c100300fda363658e10b23James Zern 7210a39d0a697ff3603e8c100300fda363658e10b23James Zern min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1); 7220a39d0a697ff3603e8c100300fda363658e10b23James Zern min0 = __msa_min_u_b(min0, min1); 7230a39d0a697ff3603e8c100300fda363658e10b23James Zern max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1); 7240a39d0a697ff3603e8c100300fda363658e10b23James Zern max0 = __msa_max_u_b(max0, max1); 7250a39d0a697ff3603e8c100300fda363658e10b23James Zern 7260a39d0a697ff3603e8c100300fda363658e10b23James Zern *min = min0[0]; 7270a39d0a697ff3603e8c100300fda363658e10b23James Zern *max = max0[0]; 7280a39d0a697ff3603e8c100300fda363658e10b23James Zern} 729