1da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian/*
2da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *
4da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  Use of this source code is governed by a BSD-style license
5da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  that can be found in the LICENSE file in the root of the source
6da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  tree. An additional intellectual property rights grant can be found
7da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  in the file PATENTS.  All contributing project authors may
8da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian *  be found in the AUTHORS file in the root of the source tree.
9da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian */
10da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
11da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h"
12da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/macros_msa.h"
13da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            const uint8_t *pred_ptr, int32_t pred_stride,
16da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            int16_t *diff_ptr, int32_t diff_stride) {
17da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t src0, src1, src2, src3;
18da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t pred0, pred1, pred2, pred3;
19da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src = { 0 };
20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 pred = { 0 };
21da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src_l0, src_l1;
22da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 diff0, diff1;
23da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
24da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LW4(src_ptr, src_stride, src0, src1, src2, src3);
25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  INSERT_W4_SB(src0, src1, src2, src3, src);
27da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
28da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ILVRL_B2_UB(src, pred, src_l0, src_l1);
29da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
30da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
31da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
32da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
33da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
34da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            const uint8_t *pred_ptr, int32_t pred_stride,
35da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                            int16_t *diff_ptr, int32_t diff_stride) {
36da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
37da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint64_t src0, src1, pred0, pred1;
38da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src = { 0 };
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 pred = { 0 };
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src_l0, src_l1;
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 diff0, diff1;
42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = 4; loop_cnt--;) {
44da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD2(src_ptr, src_stride, src0, src1);
45da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src_ptr += (2 * src_stride);
46da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD2(pred_ptr, pred_stride, pred0, pred1);
47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred_ptr += (2 * pred_stride);
48da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
49da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_D2_SB(src0, src1, src);
50da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    INSERT_D2_SB(pred0, pred1, pred);
51da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src, pred, src_l0, src_l1);
52da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
54da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff_ptr += (2 * diff_stride);
55da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
56da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
57da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
58da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
59da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                              const uint8_t *pred, int32_t pred_stride,
60da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                              int16_t *diff, int32_t diff_stride) {
61da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  int8_t count;
62da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
63da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
64da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src_l0, src_l1;
65da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 diff0, diff1;
66da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
67da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (count = 2; count--;) {
68da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
69da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += (8 * src_stride);
70da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
717bc9febe8749e98a3812a0dc4380ceae75c29450Johann    LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
727bc9febe8749e98a3812a0dc4380ceae75c29450Johann           pred7);
73da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred += (8 * pred_stride);
74da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
75da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
77da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
78da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
79da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
80da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
81da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
82da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
83da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
84da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
85da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
86da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
87da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
88da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
89da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
90da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
91da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
92da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
93da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
94da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
95da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
96da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
97da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
98da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
100da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
101da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
102da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
103da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
104da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
105da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
106da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
107da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
108da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
109da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
110da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
111da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
112da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
113da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
114da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
115da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
116da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
118da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                              const uint8_t *pred, int32_t pred_stride,
119da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                              int16_t *diff, int32_t diff_stride) {
120da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
121da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
122da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
123da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src_l0, src_l1;
124da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 diff0, diff1;
125da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
126da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = 8; loop_cnt--;) {
127da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(src, 16, src0, src1);
128da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
129da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(src, 16, src2, src3);
130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(src, 16, src4, src5);
132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
133da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(src, 16, src6, src7);
134da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
135da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
136da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(pred, 16, pred0, pred1);
137da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred += pred_stride;
138da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(pred, 16, pred2, pred3);
139da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred += pred_stride;
140da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(pred, 16, pred4, pred5);
141da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred += pred_stride;
142da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB2(pred, 16, pred6, pred7);
143da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred += pred_stride;
144da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
145da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
146da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
150da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 16, 8);
151da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
152da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
153da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
154da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
155da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
156da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
157da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
158da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 16, 8);
159da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
160da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
161da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
162da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
163da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
164da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
165da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
166da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 16, 8);
167da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
169da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
170da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
171da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
172da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
173da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
174da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 16, 8);
175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
176da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
177da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
178da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
179da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianstatic void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
180da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                              const uint8_t *pred, int32_t pred_stride,
181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                              int16_t *diff, int32_t diff_stride) {
182da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t loop_cnt;
183da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
184da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
185da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v16u8 src_l0, src_l1;
186da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  v8i16 diff0, diff1;
187da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
188da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  for (loop_cnt = 32; loop_cnt--;) {
189da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, 16, src0, src1, src2, src3);
190da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
191da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(src, 16, src4, src5, src6, src7);
192da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src += src_stride;
193da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
194da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
195da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred += pred_stride;
196da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
197da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    pred += pred_stride;
198da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
200da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
201da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
202da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 16, 8);
205da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
206da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
207da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 32, 8);
208da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
209da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
210da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 48, 8);
211da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
212da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
213da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
214da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
215da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff, 8);
216da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
217da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
218da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 16, 8);
219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
220da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
221da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 32, 8);
222da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
223da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
224da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    ST_SH2(diff0, diff1, diff + 48, 8);
225da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    diff += diff_stride;
226da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
227da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
228da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
2297bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
2307bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
2317bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
2327bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            ptrdiff_t pred_stride) {
233da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (rows == cols) {
234da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    switch (rows) {
235da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 4:
2367bc9febe8749e98a3812a0dc4380ceae75c29450Johann        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
2377bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        diff_stride);
238da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
239da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 8:
2407bc9febe8749e98a3812a0dc4380ceae75c29450Johann        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
2417bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        diff_stride);
242da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
243da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 16:
2447bc9febe8749e98a3812a0dc4380ceae75c29450Johann        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
2457bc9febe8749e98a3812a0dc4380ceae75c29450Johann                          diff_stride);
246da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
247da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 32:
2487bc9febe8749e98a3812a0dc4380ceae75c29450Johann        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
2497bc9febe8749e98a3812a0dc4380ceae75c29450Johann                          diff_stride);
250da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
251da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 64:
2527bc9febe8749e98a3812a0dc4380ceae75c29450Johann        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
2537bc9febe8749e98a3812a0dc4380ceae75c29450Johann                          diff_stride);
254da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
255da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      default:
256da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
257da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                             src_stride, pred_ptr, pred_stride);
258da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
259da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    }
260da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  } else {
261da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
262da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian                         pred_ptr, pred_stride);
263da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  }
264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian}
265