/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "./macros_msa.h"

uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride,
                                    int size) {
  int row, col;
  uint64_t ss_res = 0;
  v4i32 mul0, mul1;
  v2i64 res0 = { 0 };

  if (4 == size) {
    uint64_t src0, src1, src2, src3;
    v8i16 diff0 = { 0 };
    v8i16 diff1 = { 0 };

    /* 4x4: pack the four 4-sample rows into two vectors, square the
       elements by dotting each vector with itself, then reduce: horizontal
       add to doubleword lanes, fold the two lanes, extract the scalar. */
    LD4(src, src_stride, src0, src1, src2, src3);
    INSERT_D2_SH(src0, src1, diff0);
    INSERT_D2_SH(src2, src3, diff1);
    DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1);
    mul0 += mul1;
    res0 = __msa_hadd_s_d(mul0, mul0);
    res0 += __msa_splati_d(res0, 1);
    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
  } else if (8 == size) {
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;

    /* 8x8: one vector per row; dot each row with itself and accumulate. */
    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    mul0 += mul1;
    res0 = __msa_hadd_s_d(mul0, mul0);
    res0 += __msa_splati_d(res0, 1);
    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
  } else if (16 == size) {
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;

    /* 16x16: process the block as four 8x8 quadrants. */
    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6,
           src7);
    src += 8 * src_stride;
    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6,
           src7);
    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
    mul0 += mul1;
    res0 += __msa_hadd_s_d(mul0, mul0);
    res0 += __msa_splati_d(res0, 1);
    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
  } else if (0 == (size % 16)) {
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;

    /* Larger multiples of 16: walk the block in 16x16 tiles, reusing the
       16x16 pattern above, and reduce the accumulator once at the end. */
    for (row = 0; row < (size >> 4); row++) {
      for (col = 0; col < size; col += 16) {
        const int16_t *src_ptr = src + col;
        LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
               src7);
        DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
        LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
               src6, src7);
        src_ptr += 8 * src_stride;
        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
        LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
               src7);
        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
        LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
               src6, src7);
        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
        mul0 += mul1;
        res0 += __msa_hadd_s_d(mul0, mul0);
      }

      src += 16 * src_stride;
    }

    res0 += __msa_splati_d(res0, 1);
    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
  } else {
    int16_t val;

    /* Remaining sizes: plain scalar sum of squares. */
    for (row = 0; row < size; row++) {
      for (col = 0; col < size; col++) {
        val = src[col];
        ss_res += val * val;
      }

      src += src_stride;
    }
  }

  return ss_res;
}