/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "./macros_msa.h"

140a39d0a697ff3603e8c100300fda363658e10b23James Zernuint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride,
150a39d0a697ff3603e8c100300fda363658e10b23James Zern                                    int size) {
160a39d0a697ff3603e8c100300fda363658e10b23James Zern  int row, col;
170a39d0a697ff3603e8c100300fda363658e10b23James Zern  uint64_t ss_res = 0;
180a39d0a697ff3603e8c100300fda363658e10b23James Zern  v4i32 mul0, mul1;
190a39d0a697ff3603e8c100300fda363658e10b23James Zern  v2i64 res0 = { 0 };
200a39d0a697ff3603e8c100300fda363658e10b23James Zern
210a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (4 == size) {
220a39d0a697ff3603e8c100300fda363658e10b23James Zern    uint64_t src0, src1, src2, src3;
230a39d0a697ff3603e8c100300fda363658e10b23James Zern    v8i16 diff0 = { 0 };
240a39d0a697ff3603e8c100300fda363658e10b23James Zern    v8i16 diff1 = { 0 };
250a39d0a697ff3603e8c100300fda363658e10b23James Zern
260a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD4(src, src_stride, src0, src1, src2, src3);
270a39d0a697ff3603e8c100300fda363658e10b23James Zern    INSERT_D2_SH(src0, src1, diff0);
280a39d0a697ff3603e8c100300fda363658e10b23James Zern    INSERT_D2_SH(src2, src3, diff1);
290a39d0a697ff3603e8c100300fda363658e10b23James Zern    DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1);
300a39d0a697ff3603e8c100300fda363658e10b23James Zern    mul0 += mul1;
310a39d0a697ff3603e8c100300fda363658e10b23James Zern    res0 = __msa_hadd_s_d(mul0, mul0);
320a39d0a697ff3603e8c100300fda363658e10b23James Zern    res0 += __msa_splati_d(res0, 1);
330a39d0a697ff3603e8c100300fda363658e10b23James Zern    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
340a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (8 == size) {
350a39d0a697ff3603e8c100300fda363658e10b23James Zern    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
360a39d0a697ff3603e8c100300fda363658e10b23James Zern
370a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
380a39d0a697ff3603e8c100300fda363658e10b23James Zern    DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
390a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
400a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
410a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
420a39d0a697ff3603e8c100300fda363658e10b23James Zern    mul0 += mul1;
430a39d0a697ff3603e8c100300fda363658e10b23James Zern    res0 = __msa_hadd_s_d(mul0, mul0);
440a39d0a697ff3603e8c100300fda363658e10b23James Zern    res0 += __msa_splati_d(res0, 1);
450a39d0a697ff3603e8c100300fda363658e10b23James Zern    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
460a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (16 == size) {
470a39d0a697ff3603e8c100300fda363658e10b23James Zern    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
480a39d0a697ff3603e8c100300fda363658e10b23James Zern
490a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
500a39d0a697ff3603e8c100300fda363658e10b23James Zern    DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
510a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
520a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
530a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
540a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
550a39d0a697ff3603e8c100300fda363658e10b23James Zern    src += 8 * src_stride;
560a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
570a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
580a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
590a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
600a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
610a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
620a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
630a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
640a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
650a39d0a697ff3603e8c100300fda363658e10b23James Zern    LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
660a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
670a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
680a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
690a39d0a697ff3603e8c100300fda363658e10b23James Zern    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
700a39d0a697ff3603e8c100300fda363658e10b23James Zern    mul0 += mul1;
710a39d0a697ff3603e8c100300fda363658e10b23James Zern    res0 += __msa_hadd_s_d(mul0, mul0);
720a39d0a697ff3603e8c100300fda363658e10b23James Zern
730a39d0a697ff3603e8c100300fda363658e10b23James Zern    res0 += __msa_splati_d(res0, 1);
740a39d0a697ff3603e8c100300fda363658e10b23James Zern    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
750a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else if (0 == (size % 16)) {
760a39d0a697ff3603e8c100300fda363658e10b23James Zern    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
770a39d0a697ff3603e8c100300fda363658e10b23James Zern
780a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (row = 0; row < (size >> 4); row++) {
790a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (col = 0; col < size; col += 16) {
800a39d0a697ff3603e8c100300fda363658e10b23James Zern        const int16_t *src_ptr = src + col;
810a39d0a697ff3603e8c100300fda363658e10b23James Zern        LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
820a39d0a697ff3603e8c100300fda363658e10b23James Zern               src7);
830a39d0a697ff3603e8c100300fda363658e10b23James Zern        DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
840a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
850a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
860a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
870a39d0a697ff3603e8c100300fda363658e10b23James Zern        LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
880a39d0a697ff3603e8c100300fda363658e10b23James Zern               src6, src7);
890a39d0a697ff3603e8c100300fda363658e10b23James Zern        src_ptr += 8 * src_stride;
900a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
910a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
920a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
930a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
940a39d0a697ff3603e8c100300fda363658e10b23James Zern        LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
950a39d0a697ff3603e8c100300fda363658e10b23James Zern               src7);
960a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
970a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
980a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
990a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
1000a39d0a697ff3603e8c100300fda363658e10b23James Zern        LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
1010a39d0a697ff3603e8c100300fda363658e10b23James Zern               src6, src7);
1020a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
1030a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
1040a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
1050a39d0a697ff3603e8c100300fda363658e10b23James Zern        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
1060a39d0a697ff3603e8c100300fda363658e10b23James Zern        mul0 += mul1;
1070a39d0a697ff3603e8c100300fda363658e10b23James Zern        res0 += __msa_hadd_s_d(mul0, mul0);
1080a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
1090a39d0a697ff3603e8c100300fda363658e10b23James Zern
1100a39d0a697ff3603e8c100300fda363658e10b23James Zern      src += 16 * src_stride;
1110a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
1120a39d0a697ff3603e8c100300fda363658e10b23James Zern
1130a39d0a697ff3603e8c100300fda363658e10b23James Zern    res0 += __msa_splati_d(res0, 1);
1140a39d0a697ff3603e8c100300fda363658e10b23James Zern    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
1150a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
1160a39d0a697ff3603e8c100300fda363658e10b23James Zern    int16_t val;
1170a39d0a697ff3603e8c100300fda363658e10b23James Zern
1180a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (row = 0; row < size; row++) {
1190a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (col = 0; col < size; col++) {
1200a39d0a697ff3603e8c100300fda363658e10b23James Zern        val = src[col];
1210a39d0a697ff3603e8c100300fda363658e10b23James Zern        ss_res += val * val;
1220a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
1230a39d0a697ff3603e8c100300fda363658e10b23James Zern
1240a39d0a697ff3603e8c100300fda363658e10b23James Zern      src += src_stride;
1250a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
1260a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
1270a39d0a697ff3603e8c100300fda363658e10b23James Zern
1280a39d0a697ff3603e8c100300fda363658e10b23James Zern  return ss_res;
1290a39d0a697ff3603e8c100300fda363658e10b23James Zern}