/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/asmdefs_mmi.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

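/* This file implements SAD (sum of absolute differences) kernels using the
 * Loongson MMI SIMD extension.  In the SAD_SRC_REF_ABS_SUB_W macros below,
 * each gsldlc1/gsldrc1 pair performs an unaligned 64-bit load into a
 * floating-point register, pasubub computes the per-byte absolute
 * difference, biadd sums the eight byte differences into a single value,
 * and paddw accumulates the row total into ftmp5 (ftmp3 for the 8- and
 * 4-pixel variants). */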
#define SAD_SRC_REF_ABS_SUB_64                                      \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
  "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x27(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x20(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x2f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x28(%[src])                        \n\t" \
  "gsldlc1    %[ftmp3],   0x27(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x20(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x2f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x28(%[ref])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x37(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x30(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x3f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x38(%[src])                        \n\t" \
  "gsldlc1    %[ftmp3],   0x37(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x30(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x3f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x38(%[ref])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

#define SAD_SRC_REF_ABS_SUB_32                                      \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
  "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

#define SAD_SRC_REF_ABS_SUB_16                                      \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

#define SAD_SRC_REF_ABS_SUB_8                                       \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x07(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"

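/* Under the O32 ABI the 4 source/reference bytes are loaded through a
 * general-purpose register with ulw and moved to the FP register with
 * mtc1; the N32/N64 builds use the gslwlc1/gslwrc1 unaligned-load pair
 * instead.  Both variants then clear the upper half of the register with
 * "mthc1 $0" so that biadd only sums the four valid bytes. */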
#if _MIPS_SIM == _ABIO32
#define SAD_SRC_REF_ABS_SUB_4                                       \
  "ulw        %[tmp0],    0x00(%[src])                        \n\t" \
  "mtc1       %[tmp0],    %[ftmp1]                            \n\t" \
  "ulw        %[tmp0],    0x00(%[ref])                        \n\t" \
  "mtc1       %[tmp0],    %[ftmp2]                            \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
  "mthc1      $0,         %[ftmp1]                            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
#define SAD_SRC_REF_ABS_SUB_4                                       \
  "gslwlc1    %[ftmp1],   0x03(%[src])                        \n\t" \
  "gslwrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "gslwlc1    %[ftmp2],   0x03(%[ref])                        \n\t" \
  "gslwrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
  "mthc1      $0,         %[ftmp1]                            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
#endif /* _MIPS_SIM == _ABIO32 */

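/* The SAD_SRC_AVGREF_ABS_SUB_W macros implement the compound-prediction
 * variants: pavgb first forms the rounding byte-wise average of ref and
 * second_pred, i.e. (ref + second_pred + 1) >> 1, and the absolute
 * difference is then taken against that average rather than against ref
 * directly. */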
#define SAD_SRC_AVGREF_ABS_SUB_64                                   \
  "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x17(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp1],   0x10(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp2],   0x1f(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp2],   0x18(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
  "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x27(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp1],   0x20(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp2],   0x2f(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp2],   0x28(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp3],   0x27(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x20(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x2f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x28(%[ref])                        \n\t" \
  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
  "gsldlc1    %[ftmp1],   0x27(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x20(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x2f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x28(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x37(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp1],   0x30(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp2],   0x3f(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp2],   0x38(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp3],   0x37(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x30(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x3f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x38(%[ref])                        \n\t" \
  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
  "gsldlc1    %[ftmp1],   0x37(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x30(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x3f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x38(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

#define SAD_SRC_AVGREF_ABS_SUB_32                                   \
  "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x17(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp1],   0x10(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp2],   0x1f(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp2],   0x18(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
  "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

#define SAD_SRC_AVGREF_ABS_SUB_16                                   \
  "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"

#define SAD_SRC_AVGREF_ABS_SUB_8                                    \
  "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
  "gsldlc1    %[ftmp2],   0x07(%[ref])                        \n\t" \
  "gsldrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
  "pavgb      %[ftmp2],   %[ftmp1],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"

#if _MIPS_SIM == _ABIO32
#define SAD_SRC_AVGREF_ABS_SUB_4                                    \
  "ulw        %[tmp0],    0x00(%[second_pred])                \n\t" \
  "mtc1       %[tmp0],    %[ftmp1]                            \n\t" \
  "ulw        %[tmp0],    0x00(%[ref])                        \n\t" \
  "mtc1       %[tmp0],    %[ftmp2]                            \n\t" \
  "pavgb      %[ftmp2],   %[ftmp1],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
  "mthc1      $0,         %[ftmp1]                            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
#define SAD_SRC_AVGREF_ABS_SUB_4                                    \
  "gslwlc1    %[ftmp1],   0x03(%[second_pred])                \n\t" \
  "gslwrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
  "gslwlc1    %[ftmp2],   0x03(%[ref])                        \n\t" \
  "gslwrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
  "pavgb      %[ftmp2],   %[ftmp1],       %[ftmp2]            \n\t" \
  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
  "mthc1      $0,         %[ftmp1]                            \n\t" \
  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
#endif /* _MIPS_SIM == _ABIO32 */

// Depending on the call sites, this could pass **ref_array to avoid the &
// in the subsequent call and be de-duplicated with the 4D version below.
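// Each of the k SADs is computed against a reference window shifted by one
// byte: iteration i compares src against &ref_array[i].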
#define sadMxNxK_mmi(m, n, k)                                                 \
  void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride,       \
                                    const uint8_t *ref_array, int ref_stride, \
                                    uint32_t *sad_array) {                    \
    int i;                                                                    \
    for (i = 0; i < k; ++i)                                                   \
      sad_array[i] =                                                          \
          vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \
  }

// This appears to be equivalent to sadMxNxK_mmi above when k == 4 and
// ref_array is const.
#define sadMxNx4D_mmi(m, n)                                                  \
  void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride,         \
                                 const uint8_t *const ref_array[],           \
                                 int ref_stride, uint32_t *sad_array) {      \
    int i;                                                                   \
    for (i = 0; i < 4; ++i)                                                  \
      sad_array[i] =                                                         \
          vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \
  }

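/* Each helper below walks its block row by row; counter is the block height
 * and must be even because the loop body handles two rows per iteration. */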
static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_REF_ABS_SUB_64
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_REF_ABS_SUB_64
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
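    // mfc1 copies the low 32 bits of the ftmp5 accumulator into the
    // integer result register once the loop exits.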
    "mfc1       %[sad],     %[ftmp5]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
      [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad64xN(H)                                                   \
  unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride,   \
                                   const uint8_t *ref, int ref_stride) { \
    return vpx_sad64x(src, src_stride, ref, ref_stride, H);              \
  }

vpx_sad64xN(64);
vpx_sad64xN(32);
sadMxNx4D_mmi(64, 64);
sadMxNx4D_mmi(64, 32);
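
/* A minimal usage sketch (src_buf, ref_buf and the strides are
 * hypothetical; in libvpx these entry points are normally reached through
 * the vpx_dsp_rtcd dispatch tables rather than called directly):
 *
 *   uint32_t sad = vpx_sad64x64_mmi(src_buf, src_stride,
 *                                   ref_buf, ref_stride);
 */

/* In the _avg kernels below, second_pred is assumed to be packed with a
 * stride equal to the block width, which is why it advances by a fixed
 * 0x40 (64) bytes per row in vpx_sad_avg64x and by 0x20/0x10/0x08/0x04 in
 * the narrower variants. */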

static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          const uint8_t *second_pred,
                                          int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_AVGREF_ABS_SUB_64
    MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_AVGREF_ABS_SUB_64
    MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
    "mfc1       %[sad],     %[ftmp5]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
      [src]"+&r"(src), [ref]"+&r"(ref),
      [second_pred]"+&r"((mips_reg)second_pred),
      [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad_avg64xN(H)                                                   \
  unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
                                       const uint8_t *ref, int ref_stride,   \
                                       const uint8_t *second_pred) {         \
    return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \
  }

vpx_sad_avg64xN(64);
vpx_sad_avg64xN(32);

static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_REF_ABS_SUB_32
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_REF_ABS_SUB_32
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
    "mfc1       %[sad],     %[ftmp5]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
      [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad32xN(H)                                                   \
  unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride,   \
                                   const uint8_t *ref, int ref_stride) { \
    return vpx_sad32x(src, src_stride, ref, ref_stride, H);              \
  }

vpx_sad32xN(64);
vpx_sad32xN(32);
vpx_sad32xN(16);
sadMxNx4D_mmi(32, 64);
sadMxNx4D_mmi(32, 32);
sadMxNx4D_mmi(32, 16);

static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          const uint8_t *second_pred,
                                          int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_AVGREF_ABS_SUB_32
    MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_AVGREF_ABS_SUB_32
    MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
    "mfc1       %[sad],     %[ftmp5]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
      [src]"+&r"(src), [ref]"+&r"(ref),
      [second_pred]"+&r"((mips_reg)second_pred),
      [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad_avg32xN(H)                                                   \
  unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
                                       const uint8_t *ref, int ref_stride,   \
                                       const uint8_t *second_pred) {         \
    return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \
  }

vpx_sad_avg32xN(64);
vpx_sad_avg32xN(32);
vpx_sad_avg32xN(16);

static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_REF_ABS_SUB_16
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_REF_ABS_SUB_16
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
    "mfc1       %[sad],     %[ftmp5]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
      [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad16xN(H)                                                   \
  unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride,   \
                                   const uint8_t *ref, int ref_stride) { \
    return vpx_sad16x(src, src_stride, ref, ref_stride, H);              \
  }

vpx_sad16xN(32);
vpx_sad16xN(16);
vpx_sad16xN(8);
sadMxNxK_mmi(16, 16, 3);
sadMxNxK_mmi(16, 16, 8);
sadMxNxK_mmi(16, 8, 3);
sadMxNxK_mmi(16, 8, 8);
sadMxNx4D_mmi(16, 32);
sadMxNx4D_mmi(16, 16);
sadMxNx4D_mmi(16, 8);

static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          const uint8_t *second_pred,
                                          int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_AVGREF_ABS_SUB_16
    MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_AVGREF_ABS_SUB_16
    MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
    "mfc1       %[sad],     %[ftmp5]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
      [src]"+&r"(src), [ref]"+&r"(ref),
      [second_pred]"+&r"((mips_reg)second_pred),
      [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad_avg16xN(H)                                                   \
  unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
                                       const uint8_t *ref, int ref_stride,   \
                                       const uint8_t *second_pred) {         \
    return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \
  }

vpx_sad_avg16xN(32);
vpx_sad_avg16xN(16);
vpx_sad_avg16xN(8);

static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_REF_ABS_SUB_8
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_REF_ABS_SUB_8
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
    "mfc1       %[sad],     %[ftmp3]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
      [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad8xN(H)                                                   \
  unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride,   \
                                  const uint8_t *ref, int ref_stride) { \
    return vpx_sad8x(src, src_stride, ref, ref_stride, H);              \
  }

vpx_sad8xN(16);
vpx_sad8xN(8);
vpx_sad8xN(4);
sadMxNxK_mmi(8, 16, 3);
sadMxNxK_mmi(8, 16, 8);
sadMxNxK_mmi(8, 8, 3);
sadMxNxK_mmi(8, 8, 8);
sadMxNx4D_mmi(8, 16);
sadMxNx4D_mmi(8, 8);
sadMxNx4D_mmi(8, 4);

static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         const uint8_t *second_pred,
                                         int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_AVGREF_ABS_SUB_8
    MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_AVGREF_ABS_SUB_8
    MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
    "mfc1       %[sad],     %[ftmp3]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
      [second_pred]"+&r"((mips_reg)second_pred),
      [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad_avg8xN(H)                                                   \
  unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
                                      const uint8_t *ref, int ref_stride,   \
                                      const uint8_t *second_pred) {         \
    return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \
  }

vpx_sad_avg8xN(16);
vpx_sad_avg8xN(8);
vpx_sad_avg8xN(4);

static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_REF_ABS_SUB_4
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_REF_ABS_SUB_4
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
    "mfc1       %[sad],     %[ftmp3]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
      [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad4xN(H)                                                   \
  unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride,   \
                                  const uint8_t *ref, int ref_stride) { \
    return vpx_sad4x(src, src_stride, ref, ref_stride, H);              \
  }

vpx_sad4xN(8);
vpx_sad4xN(4);
sadMxNxK_mmi(4, 4, 3);
sadMxNxK_mmi(4, 4, 8);
sadMxNx4D_mmi(4, 8);
sadMxNx4D_mmi(4, 4);

static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         const uint8_t *second_pred,
                                         int counter) {
  unsigned int sad;
  double ftmp1, ftmp2, ftmp3;
  mips_reg l_counter = counter;

  __asm__ volatile (
    "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
    "1:                                                         \n\t"
    // Process two rows per iteration to reduce loop overhead.
    SAD_SRC_AVGREF_ABS_SUB_4
    MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    SAD_SRC_AVGREF_ABS_SUB_4
    MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
    MMI_ADDU(%[src],     %[src],         %[src_stride])
    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
    MMI_ADDIU(%[counter], %[counter], -0x02)
    "bnez       %[counter], 1b                                  \n\t"
    "mfc1       %[sad],     %[ftmp3]                            \n\t"
    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
      [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
      [second_pred]"+&r"((mips_reg)second_pred),
      [sad]"=&r"(sad)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride)
  );

  return sad;
}

#define vpx_sad_avg4xN(H)                                                   \
  unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
                                      const uint8_t *ref, int ref_stride,   \
                                      const uint8_t *second_pred) {         \
    return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \
  }

vpx_sad_avg4xN(8);
vpx_sad_avg4xN(4);
