/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include "vp8/common/filter.h"
12#include "vpx_ports/asmdefs_mmi.h"
13
14DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = {
15  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
16    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
17    0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080,
18    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
19    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
20    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
21  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
22    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
23    0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
24    0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
25    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
26    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
27  { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002,
28    0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
29    0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
30    0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
31    0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
32    0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 },
33  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
34    0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
35    0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
36    0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
37    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
38    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
39  { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003,
40    0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
41    0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
42    0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
43    0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
44    0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 },
45  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
46    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
47    0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
48    0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
49    0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
50    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
51  { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
52    0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
53    0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
54    0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
55    0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
56    0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 },
57  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
58    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
59    0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
60    0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
61    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
62    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }
63};
64
65/* Horizontal filter:  pixel_step is 1, output_height and output_width are
66   the size of horizontal filtering output, output_height is always H + 5 */
67static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
68                                             uint16_t *output_ptr,
69                                             unsigned int src_pixels_per_line,
70                                             unsigned int output_height,
71                                             unsigned int output_width,
72                                             const int16_t *vp8_filter) {
73  uint32_t tmp[1];
74  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
75
76#if _MIPS_SIM == _ABIO32
77  register double fzero asm("$f0");
78  register double ftmp0 asm("$f2");
79  register double ftmp1 asm("$f4");
80  register double ftmp2 asm("$f6");
81  register double ftmp3 asm("$f8");
82  register double ftmp4 asm("$f10");
83  register double ftmp5 asm("$f12");
84  register double ftmp6 asm("$f14");
85  register double ftmp7 asm("$f16");
86  register double ftmp8 asm("$f18");
87  register double ftmp9 asm("$f20");
88  register double ftmp10 asm("$f22");
89  register double ftmp11 asm("$f24");
90#else
91  register double fzero asm("$f0");
92  register double ftmp0 asm("$f1");
93  register double ftmp1 asm("$f2");
94  register double ftmp2 asm("$f3");
95  register double ftmp3 asm("$f4");
96  register double ftmp4 asm("$f5");
97  register double ftmp5 asm("$f6");
98  register double ftmp6 asm("$f7");
99  register double ftmp7 asm("$f8");
100  register double ftmp8 asm("$f9");
101  register double ftmp9 asm("$f10");
102  register double ftmp10 asm("$f11");
103  register double ftmp11 asm("$f12");
104#endif  // _MIPS_SIM == _ABIO32
105
106  __asm__ volatile (
107    "ldc1       %[ftmp0],       0x00(%[vp8_filter])                   \n\t"
108    "ldc1       %[ftmp1],       0x10(%[vp8_filter])                   \n\t"
109    "ldc1       %[ftmp2],       0x20(%[vp8_filter])                   \n\t"
110    "ldc1       %[ftmp3],       0x30(%[vp8_filter])                   \n\t"
111    "ldc1       %[ftmp4],       0x40(%[vp8_filter])                   \n\t"
112    "ldc1       %[ftmp5],       0x50(%[vp8_filter])                   \n\t"
113    "xor        %[fzero],       %[fzero],           %[fzero]          \n\t"
114    "li         %[tmp0],        0x07                                  \n\t"
115    "mtc1       %[tmp0],        %[ftmp7]                              \n\t"
116    "li         %[tmp0],        0x08                                  \n\t"
117    "mtc1       %[tmp0],        %[ftmp11]                             \n\t"
118
119    "1:                                                               \n\t"
120    "gsldlc1    %[ftmp9],       0x05(%[src_ptr])                      \n\t"
121    "gsldrc1    %[ftmp9],       -0x02(%[src_ptr])                     \n\t"
122    "gsldlc1    %[ftmp10],      0x06(%[src_ptr])                      \n\t"
123    "gsldrc1    %[ftmp10],      -0x01(%[src_ptr])                     \n\t"
124
125    "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
126    "pmullh     %[ftmp8],       %[ftmp6],          %[ftmp0]           \n\t"
127
128    "punpckhbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
129    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp4]           \n\t"
130    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
131
132    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
133    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp1]           \n\t"
134    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
135
136    "punpckhbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
137    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp5]           \n\t"
138    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
139
140    "dsrl       %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
141    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
142    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp2]           \n\t"
143    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
144
145    "dsrl       %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
146    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
147    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp3]           \n\t"
148    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
149
150    "paddsh     %[ftmp8],       %[ftmp8],          %[ff_ph_40]        \n\t"
151    "psrah      %[ftmp8],       %[ftmp8],          %[ftmp7]           \n\t"
152    "packushb   %[ftmp8],       %[ftmp8],          %[fzero]           \n\t"
153    "punpcklbh  %[ftmp8],       %[ftmp8],          %[fzero]           \n\t"
154    "gssdlc1    %[ftmp8],       0x07(%[output_ptr])                   \n\t"
155    "gssdrc1    %[ftmp8],       0x00(%[output_ptr])                   \n\t"
156
157    "addiu      %[output_height], %[output_height], -0x01             \n\t"
158    MMI_ADDU(%[output_ptr],  %[output_ptr],    %[output_width])
159    MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line])
160    "bnez       %[output_height],               1b                    \n\t"
161    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
162      [ftmp1]"=&f"(ftmp1),              [ftmp2]"=&f"(ftmp2),
163      [ftmp3]"=&f"(ftmp3),              [ftmp4]"=&f"(ftmp4),
164      [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6),
165      [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8),
166      [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
167      [ftmp11]"=&f"(ftmp11),            [tmp0]"=&r"(tmp[0]),
168      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
169      [src_ptr]"+&r"(src_ptr)
170    : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
171      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width),
172      [ff_ph_40]"f"(ff_ph_40)
173    : "memory"
174    );
175}
176
177/* Horizontal filter:  pixel_step is always W */
178static INLINE void vp8_filter_block1dc_v6_mmi(
179    uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
180    int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
181  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
182  uint32_t tmp[1];
183  mips_reg addr[1];
184#if _MIPS_SIM == _ABIO32
185  register double fzero asm("$f0");
186  register double ftmp0 asm("$f2");
187  register double ftmp1 asm("$f4");
188  register double ftmp2 asm("$f6");
189  register double ftmp3 asm("$f8");
190  register double ftmp4 asm("$f10");
191  register double ftmp5 asm("$f12");
192  register double ftmp6 asm("$f14");
193  register double ftmp7 asm("$f16");
194  register double ftmp8 asm("$f18");
195  register double ftmp9 asm("$f20");
196  register double ftmp10 asm("$f22");
197  register double ftmp11 asm("$f24");
198  register double ftmp12 asm("$f26");
199  register double ftmp13 asm("$f28");
200#else
201  register double fzero asm("$f0");
202  register double ftmp0 asm("$f1");
203  register double ftmp1 asm("$f2");
204  register double ftmp2 asm("$f3");
205  register double ftmp3 asm("$f4");
206  register double ftmp4 asm("$f5");
207  register double ftmp5 asm("$f6");
208  register double ftmp6 asm("$f7");
209  register double ftmp7 asm("$f8");
210  register double ftmp8 asm("$f9");
211  register double ftmp9 asm("$f10");
212  register double ftmp10 asm("$f11");
213  register double ftmp11 asm("$f12");
214  register double ftmp12 asm("$f13");
215  register double ftmp13 asm("$f14");
216#endif  // _MIPS_SIM == _ABIO32
217
218  __asm__ volatile (
219    "ldc1       %[ftmp0],     0x00(%[vp8_filter])                     \n\t"
220    "ldc1       %[ftmp1],     0x10(%[vp8_filter])                     \n\t"
221    "ldc1       %[ftmp2],     0x20(%[vp8_filter])                     \n\t"
222    "ldc1       %[ftmp3],     0x30(%[vp8_filter])                     \n\t"
223    "ldc1       %[ftmp4],     0x40(%[vp8_filter])                     \n\t"
224    "ldc1       %[ftmp5],     0x50(%[vp8_filter])                     \n\t"
225    "xor        %[fzero],     %[fzero],        %[fzero]               \n\t"
226    "li         %[tmp0],      0x07                                    \n\t"
227    "mtc1       %[tmp0],      %[ftmp13]                               \n\t"
228
229    /* In order to make full use of memory load delay slot,
230     * Operation of memory loading and calculating has been rearranged.
231     */
232    "1:                                                               \n\t"
233    "gsldlc1    %[ftmp6],     0x07(%[src_ptr])                        \n\t"
234    "gsldrc1    %[ftmp6],     0x00(%[src_ptr])                        \n\t"
235    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line])
236    "gsldlc1    %[ftmp7],     0x07(%[addr0])                          \n\t"
237    "gsldrc1    %[ftmp7],     0x00(%[addr0])                          \n\t"
238    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2])
239    "gsldlc1    %[ftmp8],     0x07(%[addr0])                          \n\t"
240    "gsldrc1    %[ftmp8],     0x00(%[addr0])                          \n\t"
241
242    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4])
243    "gsldlc1    %[ftmp9],     0x07(%[addr0])                          \n\t"
244    "gsldrc1    %[ftmp9],     0x00(%[addr0])                          \n\t"
245    MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line])
246    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2])
247    "gsldlc1    %[ftmp10],    0x07(%[addr0])                          \n\t"
248    "gsldrc1    %[ftmp10],    0x00(%[addr0])                          \n\t"
249    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4])
250    "gsldlc1    %[ftmp11],    0x07(%[addr0])                          \n\t"
251    "gsldrc1    %[ftmp11],    0x00(%[addr0])                          \n\t"
252
253    "pmullh     %[ftmp12],    %[ftmp6],        %[ftmp0]               \n\t"
254
255    "pmullh     %[ftmp7],     %[ftmp7],        %[ftmp1]               \n\t"
256    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp7]               \n\t"
257
258    "pmullh     %[ftmp8],     %[ftmp8],        %[ftmp2]               \n\t"
259    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp8]               \n\t"
260
261    "pmullh     %[ftmp9],     %[ftmp9],        %[ftmp4]               \n\t"
262    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp9]               \n\t"
263
264    "pmullh     %[ftmp10],    %[ftmp10],       %[ftmp3]               \n\t"
265    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp10]              \n\t"
266
267    "pmullh     %[ftmp11],    %[ftmp11],       %[ftmp5]               \n\t"
268    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp11]              \n\t"
269
270    "paddsh     %[ftmp12],    %[ftmp12],       %[ff_ph_40]            \n\t"
271    "psrah      %[ftmp12],    %[ftmp12],       %[ftmp13]              \n\t"
272    "packushb   %[ftmp12],    %[ftmp12],       %[fzero]               \n\t"
273    "gsswlc1    %[ftmp12],    0x03(%[output_ptr])                     \n\t"
274    "gsswrc1    %[ftmp12],    0x00(%[output_ptr])                     \n\t"
275
276    MMI_ADDIU(%[output_height], %[output_height], -0x01)
277    MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
278    "bnez       %[output_height], 1b                                  \n\t"
279    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
280      [ftmp1]"=&f"(ftmp1),              [ftmp2]"=&f"(ftmp2),
281      [ftmp3]"=&f"(ftmp3),              [ftmp4]"=&f"(ftmp4),
282      [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6),
283      [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8),
284      [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
285      [ftmp11]"=&f"(ftmp11),            [ftmp12]"=&f"(ftmp12),
286      [ftmp13]"=&f"(ftmp13),            [tmp0]"=&r"(tmp[0]),
287      [addr0]"=&r"(addr[0]),            [src_ptr]"+&r"(src_ptr),
288      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
289    : [pixels_per_line]"r"((mips_reg)pixels_per_line),
290      [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
291      [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
292      [vp8_filter]"r"(vp8_filter),
293      [output_pitch]"r"((mips_reg)output_pitch),
294      [ff_ph_40]"f"(ff_ph_40)
295    : "memory"
296    );
297}
298
299/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0},
300   function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can
301   be simplified */
302static INLINE void vp8_filter_block1d_h6_filter0_mmi(
303    unsigned char *src_ptr, uint16_t *output_ptr,
304    unsigned int src_pixels_per_line, unsigned int output_height,
305    unsigned int output_width) {
306#if _MIPS_SIM == _ABIO32
307  register double fzero asm("$f0");
308  register double ftmp0 asm("$f2");
309  register double ftmp1 asm("$f4");
310#else
311  register double fzero asm("$f0");
312  register double ftmp0 asm("$f1");
313  register double ftmp1 asm("$f2");
314#endif  // _MIPS_SIM == _ABIO32
315
316  __asm__ volatile (
317    "xor        %[fzero],       %[fzero],           %[fzero]          \n\t"
318
319    "1:                                                               \n\t"
320    "gsldlc1    %[ftmp0],       0x07(%[src_ptr])                      \n\t"
321    "gsldrc1    %[ftmp0],       0x00(%[src_ptr])                      \n\t"
322    MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line])
323
324    "punpcklbh  %[ftmp1],       %[ftmp0],          %[fzero]           \n\t"
325    "gssdlc1    %[ftmp1],       0x07(%[output_ptr])                   \n\t"
326    "gssdrc1    %[ftmp1],       0x00(%[output_ptr])                   \n\t"
327
328    "addiu      %[output_height], %[output_height], -0x01             \n\t"
329    MMI_ADDU(%[output_ptr],  %[output_ptr],    %[output_width])
330    "bnez       %[output_height],               1b                    \n\t"
331    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
332      [ftmp1]"=&f"(ftmp1),              [src_ptr]"+&r"(src_ptr),
333      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
334    : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
335      [output_width]"r"(output_width)
336    : "memory"
337    );
338}
339
340static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
341    uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
342    int output_pitch, unsigned int pixels_per_line) {
343#if _MIPS_SIM == _ABIO32
344  register double fzero asm("$f0");
345  register double ftmp0 asm("$f2");
346  register double ftmp1 asm("$f4");
347#else
348  register double fzero asm("$f0");
349  register double ftmp0 asm("$f1");
350  register double ftmp1 asm("$f2");
351#endif  // _MIPS_SIM == _ABIO32
352
353  __asm__ volatile (
354    "xor        %[fzero],     %[fzero],        %[fzero]               \n\t"
355
356    "1:                                                               \n\t"
357    "gsldlc1    %[ftmp0],     0x07(%[src_ptr])                        \n\t"
358    "gsldrc1    %[ftmp0],     0x00(%[src_ptr])                        \n\t"
359    MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line])
360    MMI_ADDIU(%[output_height], %[output_height], -0x01)
361    "packushb   %[ftmp1],     %[ftmp0],        %[fzero]               \n\t"
362    "gsswlc1    %[ftmp1],     0x03(%[output_ptr])                     \n\t"
363    "gsswrc1    %[ftmp1],     0x00(%[output_ptr])                     \n\t"
364
365    MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
366    "bnez       %[output_height], 1b                                  \n\t"
367    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
368      [ftmp1]"=&f"(ftmp1),              [src_ptr]"+&r"(src_ptr),
369      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
370    : [pixels_per_line]"r"((mips_reg)pixels_per_line),
371      [output_pitch]"r"((mips_reg)output_pitch)
372    : "memory"
373    );
374}
375
/* Defines vp8_sixtap_predict<n>x<m>_mmi(), the six-tap sub-pixel predictor
 * for an n-wide, m-high block.
 *
 * The prediction is separable and is computed in 4-pixel-wide columns
 * (n / 4 of them):
 *   1. Horizontal pass: m + 5 rows, starting two rows above src_ptr, are
 *      filtered with the xoffset filter into the 16-bit buffer FData2, whose
 *      row stride is n * 2 bytes (n uint16_t values).
 *   2. Vertical pass: m rows are filtered with the yoffset filter from
 *      FData2 into dst_ptr.
 * When an offset is 0 the corresponding pass uses the copy-only *_filter0_*
 * kernel; for the vertical pass this starts two rows into FData2
 * (FData2 + n * 2), i.e. at the row that lines up with src_ptr.
 *
 * xoffset and yoffset index vp8_six_tap_mmi directly and must be in 0..7.
 */
#define sixtapNxM(n, m)                                                        \
  void vp8_sixtap_predict##n##x##m##_mmi(                                      \
      unsigned char *src_ptr, int src_pixels_per_line, int xoffset,            \
      int yoffset, unsigned char *dst_ptr, int dst_pitch) {                    \
    DECLARE_ALIGNED(16, uint16_t,                                              \
                    FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]);     \
    const int16_t *HFilter, *VFilter;                                          \
    int i, loop = n / 4;                                                       \
    HFilter = vp8_six_tap_mmi[xoffset];                                        \
    VFilter = vp8_six_tap_mmi[yoffset];                                        \
                                                                               \
    if (xoffset == 0) {                                                        \
      for (i = 0; i < loop; ++i) {                                             \
        vp8_filter_block1d_h6_filter0_mmi(                                     \
            src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4,       \
            src_pixels_per_line, m + 5, n * 2);                                \
      }                                                                        \
    } else {                                                                   \
      for (i = 0; i < loop; ++i) {                                             \
        vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \
                                  FData2 + i * 4, src_pixels_per_line, m + 5,  \
                                  n * 2, HFilter);                             \
      }                                                                        \
    }                                                                          \
    if (yoffset == 0) {                                                        \
      for (i = 0; i < loop; ++i) {                                             \
        vp8_filter_block1dc_v6_filter0_mmi(                                    \
            FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2);     \
      }                                                                        \
    } else {                                                                   \
      for (i = 0; i < loop; ++i) {                                             \
        vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m,         \
                                   dst_pitch, n * 2, VFilter);                 \
      }                                                                        \
    }                                                                          \
  }
412
413sixtapNxM(4, 4);
414sixtapNxM(8, 8);
415sixtapNxM(8, 4);
416sixtapNxM(16, 16);
417