1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
12#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
13
14#include <stdlib.h>
15
16#include "./vpx_dsp_rtcd.h"
17#include "vpx/vpx_integer.h"
18#include "vpx_mem/vpx_mem.h"
19#include "vpx_ports/mem.h"
20
21#ifdef __cplusplus
22extern "C" {
23#endif
24
25#if HAVE_DSPR2
/* inputs & outputs are quad-byte vectors (four 8-bit pixels per uint32_t).
 *
 * In-place 4-tap loop filter across one edge for four pixels at once.
 * mask and hev are per-byte 0x00/0xFF selector vectors computed by the
 * caller; ps1/ps0 (above/left of the edge) and qs0/qs1 (below/right)
 * are updated in place.  Mirrors the scalar filter4: with
 *   f  = clamp(clamp(ps1 - qs1) & hev + 3 * (qs0 - ps0)) & mask
 * it applies qs0 -= clamp(f + 4) >> 3, ps0 += clamp(f + 3) >> 3, and a
 * half-strength tap to ps1/qs1 on non-hev pixels. */
static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
                                uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
  int32_t vpx_filter_l, vpx_filter_r;
  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
  int32_t subr_r, subr_l;
  uint32_t t1, t2, HWM, t3;
  uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
  int32_t vps1, vps0, vqs0, vqs1;
  int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
  uint32_t N128;

  N128 = 0x80808080; /* XOR constant: maps pixels to/from signed range */
  t1 = 0x03000300;   /* +3 in the high byte of each halfword */
  t2 = 0x04000400;   /* +4 in the high byte of each halfword */
  t3 = 0x01000100;   /* +1 in the high byte of each halfword */
  HWM = 0xFF00FF00;  /* keeps only the high byte of each halfword */

  /* convert unsigned pixels to signed values (pixel - 128) */
  vps0 = (*ps0) ^ N128;
  vps1 = (*ps1) ^ N128;
  vqs0 = (*qs0) ^ N128;
  vqs1 = (*qs1) ^ N128;

  /* use halfword pairs instead quad-bytes because of accuracy:
     each quad-byte vector is split into two halfword-pair vectors with
     the pixel value in the HIGH byte of each halfword ("_l" keeps two of
     the bytes in place, "_r" shifts the other two up by 8), so the
     saturating .ph arithmetic below has 8 bits of headroom per lane. */
  vps0_l = vps0 & HWM;
  vps0_r = vps0 << 8;
  vps0_r = vps0_r & HWM;

  vps1_l = vps1 & HWM;
  vps1_r = vps1 << 8;
  vps1_r = vps1_r & HWM;

  vqs0_l = vqs0 & HWM;
  vqs0_r = vqs0 << 8;
  vqs0_r = vqs0_r & HWM;

  vqs1_l = vqs1 & HWM;
  vqs1_r = vqs1 << 8;
  vqs1_r = vqs1_r & HWM;

  /* split the mask and hev selector vectors the same way */
  mask_l = mask & HWM;
  mask_r = mask << 8;
  mask_r = mask_r & HWM;

  hev_l = hev & HWM;
  hev_r = hev << 8;
  hev_r = hev_r & HWM;

  __asm__ __volatile__(
      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"

      /* qs0 - ps0 */
      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"

      /* vpx_filter &= hev; */
      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"

      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0));
         (qs0 - ps0) is added three times with saturation; the xor
         instructions computing invhev = ~hev (within HWM lanes) are
         interleaved to fill otherwise-idle issue slots. */
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"

      /* vpx_filter &= mask; */
      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"

      : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
        [HWM] "r"(HWM));

  /* save bottom 3 bits so that we round one side +4 and the other +3 */
  __asm__ __volatile__(
      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3;
         (t2 == 0x04000400, i.e. +4 per lane) */
      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"

      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3;
         (t1 == 0x03000300, i.e. +3 per lane) */
      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"

      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"

      /* drop the bits the arithmetic shift smeared into the low byte
         of each halfword; Filter1 is reused by the next asm block */
      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"

      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"

      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
        [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
        [vqs0_r] "+r"(vqs0_r)
      : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
        [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));

  __asm__ __volatile__(
      /* (vpx_filter += 1) >>= 1
         addqh.ph computes (a + b) >> 1 per halfword, so this is a
         rounding halve of Filter1 using t3 == +1 per lane. */
      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"

      /* vpx_filter &= ~hev; -- outer taps only touch non-hev pixels */
      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"

      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"

      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
        [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
        [vqs1_r] "+r"(vqs1_r)
      : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));

  /* Create quad-bytes from halfword pairs: keep the "_l" bytes in the
     high byte of each halfword and shift the "_r" bytes back down. */
  vqs0_l = vqs0_l & HWM;
  vqs1_l = vqs1_l & HWM;
  vps0_l = vps0_l & HWM;
  vps1_l = vps1_l & HWM;

  __asm__ __volatile__(
      /* logical (zero-fill) halfword shift, so lanes don't bleed */
      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"

      : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
        [vqs0_r] "+r"(vqs0_r)
      :);

  vqs0 = vqs0_l | vqs0_r;
  vqs1 = vqs1_l | vqs1_r;
  vps0 = vps0_l | vps0_r;
  vps1 = vps1_l | vps1_r;

  /* convert back to unsigned pixel values and store */
  *ps0 = vps0 ^ N128;
  *ps1 = vps1 ^ N128;
  *qs0 = vqs0 ^ N128;
  *qs1 = vqs1 ^ N128;
}
191
/* Same 4-tap loop filter as filter_dspr2, but takes the packed pixel
 * vectors by value and writes the filtered results to the separate
 * p1_f0/p0_f0/q0_f0/q1_f0 outputs (used when a caller must keep the
 * original pixels for a second, flat-path filter). */
static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
                                 uint32_t ps0, uint32_t qs0, uint32_t qs1,
                                 uint32_t *p1_f0, uint32_t *p0_f0,
                                 uint32_t *q0_f0, uint32_t *q1_f0) {
  int32_t vpx_filter_l, vpx_filter_r;
  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
  int32_t subr_r, subr_l;
  uint32_t t1, t2, HWM, t3;
  uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
  int32_t vps1, vps0, vqs0, vqs1;
  int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
  uint32_t N128;

  N128 = 0x80808080; /* XOR constant: maps pixels to/from signed range */
  t1 = 0x03000300;   /* +3 in the high byte of each halfword */
  t2 = 0x04000400;   /* +4 in the high byte of each halfword */
  t3 = 0x01000100;   /* +1 in the high byte of each halfword */
  HWM = 0xFF00FF00;  /* keeps only the high byte of each halfword */

  /* convert unsigned pixels to signed values (pixel - 128) */
  vps0 = (ps0) ^ N128;
  vps1 = (ps1) ^ N128;
  vqs0 = (qs0) ^ N128;
  vqs1 = (qs1) ^ N128;

  /* use halfword pairs instead quad-bytes because of accuracy:
     split each quad-byte vector into two halfword-pair vectors with the
     pixel value in the HIGH byte of each halfword, giving the saturating
     .ph arithmetic 8 bits of headroom per lane. */
  vps0_l = vps0 & HWM;
  vps0_r = vps0 << 8;
  vps0_r = vps0_r & HWM;

  vps1_l = vps1 & HWM;
  vps1_r = vps1 << 8;
  vps1_r = vps1_r & HWM;

  vqs0_l = vqs0 & HWM;
  vqs0_r = vqs0 << 8;
  vqs0_r = vqs0_r & HWM;

  vqs1_l = vqs1 & HWM;
  vqs1_r = vqs1 << 8;
  vqs1_r = vqs1_r & HWM;

  /* split the mask and hev selector vectors the same way */
  mask_l = mask & HWM;
  mask_r = mask << 8;
  mask_r = mask_r & HWM;

  hev_l = hev & HWM;
  hev_r = hev << 8;
  hev_r = hev_r & HWM;

  __asm__ __volatile__(
      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"

      /* qs0 - ps0 */
      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"

      /* vpx_filter &= hev; */
      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"

      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0));
         (qs0 - ps0) is added three times with saturation; the xor
         instructions computing invhev = ~hev (within HWM lanes) are
         interleaved to fill otherwise-idle issue slots. */
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"

      /* vpx_filter &= mask; */
      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"

      : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r),
        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
        [HWM] "r"(HWM));

  /* save bottom 3 bits so that we round one side +4 and the other +3 */
  __asm__ __volatile__(
      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3;
         (t2 == 0x04000400, i.e. +4 per lane) */
      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"

      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3;
         (t1 == 0x03000300, i.e. +3 per lane) */
      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"

      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"

      /* drop the bits the arithmetic shift smeared into the low byte
         of each halfword; Filter1 is reused by the next asm block */
      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"

      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"

      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
        [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
        [vqs0_r] "+r"(vqs0_r)
      : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
        [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r));

  __asm__ __volatile__(
      /* (vpx_filter += 1) >>= 1
         addqh.ph computes (a + b) >> 1 per halfword, so this is a
         rounding halve of Filter1 using t3 == +1 per lane. */
      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"

      /* vpx_filter &= ~hev; -- outer taps only touch non-hev pixels */
      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"

      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"

      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
        [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
        [vqs1_r] "+r"(vqs1_r)
      : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));

  /* Create quad-bytes from halfword pairs: keep the "_l" bytes in the
     high byte of each halfword and shift the "_r" bytes back down. */
  vqs0_l = vqs0_l & HWM;
  vqs1_l = vqs1_l & HWM;
  vps0_l = vps0_l & HWM;
  vps1_l = vps1_l & HWM;

  __asm__ __volatile__(
      /* logical (zero-fill) halfword shift, so lanes don't bleed */
      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"

      : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
        [vqs0_r] "+r"(vqs0_r)
      :);

  vqs0 = vqs0_l | vqs0_r;
  vqs1 = vqs1_l | vqs1_r;
  vps0 = vps0_l | vps0_r;
  vps1 = vps1_l | vps1_r;

  /* convert back to unsigned pixel values and store */
  *p0_f0 = vps0 ^ N128;
  *p1_f0 = vps1 ^ N128;
  *q0_f0 = vqs0 ^ N128;
  *q1_f0 = vqs1 ^ N128;
}
358
/* In-place 7-tap [1, 1, 1, 2, 1, 1, 1] filter for flat segments (filter8).
 *
 * Each output is ROUND_POWER_OF_TWO of an 8-term sum (see the formulas
 * below).  The asm first builds the shared sum
 *   add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4
 * (the +4 is the rounding bias for the >> 3), then derives each output
 * by adding its extra taps and subtracting the taps it does not use.
 * The unsaturated .ph adds require per-halfword headroom, so each
 * 32-bit word is assumed to hold two pixel values, one per halfword --
 * NOTE(review): confirm the packing against the callers. */
static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
                                  uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
                                  uint32_t *oq2, uint32_t *oq3) {
  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
  uint32_t res_op2, res_op1, res_op0;
  uint32_t res_oq0, res_oq1, res_oq2;
  uint32_t tmp;
  uint32_t add_p210_q012;
  uint32_t u32Four = 0x00040004; /* +4 rounding bias per halfword */

  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */

  __asm__ __volatile__(
      /* add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4 (shared sum) */
      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"

      /* res_op2 = (3*p3 + p2 + shared - q1 - q2) >> 3   (formula 1)
         res_op1 = (2*p3 + p1 + shared - q2) >> 3        (formula 2)
         the two streams are interleaved for instruction-level overlap */
      "shll.ph    %[tmp],            %[p3],             1                \n\t"
      "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
      "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
      "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
      "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
      "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
      /* res_op0 = (p3 + p0 + shared) >> 3               (formula 3)
         res_oq0 = (q0 + q3 + shared) >> 3               (formula 4)
         res_oq1 = (2*q3 + q1 + shared - p2) >> 3        (formula 5)
         res_oq2 = (3*q3 + q2 + shared - p2 - p1) >> 3   (formula 6) */
      "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
      "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
      "shll.ph    %[tmp],            %[q3],             1                \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
      "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
      "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
      "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
      "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
      "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"

      : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
        [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
        [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
        [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));

  /* p3 and q3 are read-only taps and are not rewritten */
  *op2 = res_op2;
  *op1 = res_op1;
  *op0 = res_op0;
  *oq0 = res_oq0;
  *oq1 = res_oq1;
  *oq2 = res_oq2;
}
431
/* Same 7-tap [1, 1, 1, 2, 1, 1, 1] flat-segment filter as
 * mbfilter_dspr2, but takes the pixel words by value and writes the six
 * filtered results to the separate *_f1 outputs (used when the caller
 * must keep the original pixels for per-pixel selection). */
static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
                                   uint32_t p0, uint32_t q0, uint32_t q1,
                                   uint32_t q2, uint32_t q3, uint32_t *op2_f1,
                                   uint32_t *op1_f1, uint32_t *op0_f1,
                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
                                   uint32_t *oq2_f1) {
  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
  uint32_t res_op2, res_op1, res_op0;
  uint32_t res_oq0, res_oq1, res_oq2;
  uint32_t tmp;
  uint32_t add_p210_q012;
  uint32_t u32Four = 0x00040004; /* +4 rounding bias per halfword */

  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)   1 */
  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)   2 */
  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)   3 */
  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)   4 */
  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)   5 */
  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)   6 */

  __asm__ __volatile__(
      /* add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4 (shared sum;
         the +4 is the rounding bias for the >> 3 below) */
      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]        \n\t"

      /* each result adds its extra taps to the shared sum and subtracts
         the taps its formula does not use; the six result streams are
         interleaved for instruction-level overlap */
      "shll.ph    %[tmp],            %[p3],             1                 \n\t"
      "addu.ph    %[res_op2],        %[tmp],            %[p3]             \n\t"
      "addu.ph    %[res_op1],        %[p3],             %[p3]             \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[p2]             \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[p1]             \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012]  \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012]  \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q1]             \n\t"
      "subu.ph    %[res_op1],        %[res_op1],        %[q2]             \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q2]             \n\t"
      "shrl.ph    %[res_op1],        %[res_op1],        3                 \n\t"
      "shrl.ph    %[res_op2],        %[res_op2],        3                 \n\t"
      "addu.ph    %[res_op0],        %[p3],             %[p0]             \n\t"
      "addu.ph    %[res_oq0],        %[q0],             %[q3]             \n\t"
      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012]  \n\t"
      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012]  \n\t"
      "addu.ph    %[res_oq1],        %[q3],             %[q3]             \n\t"
      "shll.ph    %[tmp],            %[q3],             1                 \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]             \n\t"
      "addu.ph    %[res_oq2],        %[tmp],            %[q3]             \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012]  \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012]  \n\t"
      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]             \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]             \n\t"
      "shrl.ph    %[res_oq1],        %[res_oq1],        3                 \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]             \n\t"
      "shrl.ph    %[res_oq0],        %[res_oq0],        3                 \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]             \n\t"
      "shrl.ph    %[res_op0],        %[res_op0],        3                 \n\t"
      "shrl.ph    %[res_oq2],        %[res_oq2],        3                 \n\t"

      : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
        [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
        [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
        [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));

  *op2_f1 = res_op2;
  *op1_f1 = res_op1;
  *op0_f1 = res_op0;
  *oq0_f1 = res_oq0;
  *oq1_f1 = res_oq1;
  *oq2_f1 = res_oq2;
}
505
506static INLINE void wide_mbfilter_dspr2(
507    uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
508    uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
509    uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
510    uint32_t *oq7) {
511  const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
512  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
513  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
514  const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
515  uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
516  uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
517  uint32_t tmp;
518  uint32_t add_p6toq6;
519  uint32_t u32Eight = 0x00080008;
520
521  __asm__ __volatile__(
522      /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
523         which is used most of the time */
524      "addu.ph      %[add_p6toq6],     %[p6],              %[p5]         \n\t"
525      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p4]         \n\t"
526      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p3]         \n\t"
527      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p2]         \n\t"
528      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p1]         \n\t"
529      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p0]         \n\t"
530      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q0]         \n\t"
531      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q1]         \n\t"
532      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q2]         \n\t"
533      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q3]         \n\t"
534      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q4]         \n\t"
535      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q5]         \n\t"
536      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q6]         \n\t"
537      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[u32Eight]   \n\t"
538
539      : [add_p6toq6] "=&r"(add_p6toq6)
540      : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
541        [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
542        [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
543        [u32Eight] "r"(u32Eight));
544
545  __asm__ __volatile__(
546      /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
547                                   p3 + p2 + p1 + p0 + q0, 4) */
548      "shll.ph       %[tmp],            %[p7],            3               \n\t"
549      "subu.ph       %[res_op6],        %[tmp],           %[p7]           \n\t"
550      "addu.ph       %[res_op6],        %[res_op6],       %[p6]           \n\t"
551      "addu.ph       %[res_op6],        %[res_op6],       %[add_p6toq6]   \n\t"
552      "subu.ph       %[res_op6],        %[res_op6],       %[q1]           \n\t"
553      "subu.ph       %[res_op6],        %[res_op6],       %[q2]           \n\t"
554      "subu.ph       %[res_op6],        %[res_op6],       %[q3]           \n\t"
555      "subu.ph       %[res_op6],        %[res_op6],       %[q4]           \n\t"
556      "subu.ph       %[res_op6],        %[res_op6],       %[q5]           \n\t"
557      "subu.ph       %[res_op6],        %[res_op6],       %[q6]           \n\t"
558      "shrl.ph       %[res_op6],        %[res_op6],       4               \n\t"
559
560      /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
561                                   p2 + p1 + p0 + q0 + q1, 4) */
562      "shll.ph       %[tmp],            %[p7],            2               \n\t"
563      "addu.ph       %[res_op5],        %[tmp],           %[p7]           \n\t"
564      "addu.ph       %[res_op5],        %[res_op5],       %[p7]           \n\t"
565      "addu.ph       %[res_op5],        %[res_op5],       %[p5]           \n\t"
566      "addu.ph       %[res_op5],        %[res_op5],       %[add_p6toq6]   \n\t"
567      "subu.ph       %[res_op5],        %[res_op5],       %[q2]           \n\t"
568      "subu.ph       %[res_op5],        %[res_op5],       %[q3]           \n\t"
569      "subu.ph       %[res_op5],        %[res_op5],       %[q4]           \n\t"
570      "subu.ph       %[res_op5],        %[res_op5],       %[q5]           \n\t"
571      "subu.ph       %[res_op5],        %[res_op5],       %[q6]           \n\t"
572      "shrl.ph       %[res_op5],        %[res_op5],       4               \n\t"
573
574      /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
575                                   p1 + p0 + q0 + q1 + q2, 4) */
576      "shll.ph       %[tmp],            %[p7],            2               \n\t"
577      "addu.ph       %[res_op4],        %[tmp],           %[p7]           \n\t"
578      "addu.ph       %[res_op4],        %[res_op4],       %[p4]           \n\t"
579      "addu.ph       %[res_op4],        %[res_op4],       %[add_p6toq6]   \n\t"
580      "subu.ph       %[res_op4],        %[res_op4],       %[q3]           \n\t"
581      "subu.ph       %[res_op4],        %[res_op4],       %[q4]           \n\t"
582      "subu.ph       %[res_op4],        %[res_op4],       %[q5]           \n\t"
583      "subu.ph       %[res_op4],        %[res_op4],       %[q6]           \n\t"
584      "shrl.ph       %[res_op4],        %[res_op4],       4               \n\t"
585
586      /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
587                                   p1 + p0 + q0 + q1 + q2 + q3, 4) */
588      "shll.ph       %[tmp],            %[p7],            2               \n\t"
589      "addu.ph       %[res_op3],        %[tmp],           %[p3]           \n\t"
590      "addu.ph       %[res_op3],        %[res_op3],       %[add_p6toq6]   \n\t"
591      "subu.ph       %[res_op3],        %[res_op3],       %[q4]           \n\t"
592      "subu.ph       %[res_op3],        %[res_op3],       %[q5]           \n\t"
593      "subu.ph       %[res_op3],        %[res_op3],       %[q6]           \n\t"
594      "shrl.ph       %[res_op3],        %[res_op3],       4               \n\t"
595
596      /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
597                                   p0 + q0 + q1 + q2 + q3 + q4, 4) */
598      "shll.ph       %[tmp],            %[p7],            1               \n\t"
599      "addu.ph       %[res_op2],        %[tmp],           %[p7]           \n\t"
600      "addu.ph       %[res_op2],        %[res_op2],       %[p2]           \n\t"
601      "addu.ph       %[res_op2],        %[res_op2],       %[add_p6toq6]   \n\t"
602      "subu.ph       %[res_op2],        %[res_op2],       %[q5]           \n\t"
603      "subu.ph       %[res_op2],        %[res_op2],       %[q6]           \n\t"
604      "shrl.ph       %[res_op2],        %[res_op2],       4               \n\t"
605
606      /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
607                                   p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
608      "shll.ph       %[tmp],            %[p7],            1               \n\t"
609      "addu.ph       %[res_op1],        %[tmp],           %[p1]           \n\t"
610      "addu.ph       %[res_op1],        %[res_op1],       %[add_p6toq6]   \n\t"
611      "subu.ph       %[res_op1],        %[res_op1],       %[q6]           \n\t"
612      "shrl.ph       %[res_op1],        %[res_op1],       4               \n\t"
613
614      /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
615                                  q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
616      "addu.ph       %[res_op0],        %[p7],            %[p0]           \n\t"
617      "addu.ph       %[res_op0],        %[res_op0],       %[add_p6toq6]   \n\t"
618      "shrl.ph       %[res_op0],        %[res_op0],       4               \n\t"
619
620      : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
621        [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
622        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
623        [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
624      : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
625        [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
626        [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
627        [add_p6toq6] "r"(add_p6toq6));
628
629  *op6 = res_op6;
630  *op5 = res_op5;
631  *op4 = res_op4;
632  *op3 = res_op3;
633  *op2 = res_op2;
634  *op1 = res_op1;
635  *op0 = res_op0;
636
637  __asm__ __volatile__(
638      /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
639                                   q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
640      "addu.ph       %[res_oq0],        %[q7],            %[q0]           \n\t"
641      "addu.ph       %[res_oq0],        %[res_oq0],       %[add_p6toq6]   \n\t"
642      "shrl.ph       %[res_oq0],        %[res_oq0],       4               \n\t"
643
644      /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
645                                   q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
646      "shll.ph       %[tmp],            %[q7],            1               \n\t"
647      "addu.ph       %[res_oq1],        %[tmp],           %[q1]           \n\t"
648      "addu.ph       %[res_oq1],        %[res_oq1],       %[add_p6toq6]   \n\t"
649      "subu.ph       %[res_oq1],        %[res_oq1],       %[p6]           \n\t"
650      "shrl.ph       %[res_oq1],        %[res_oq1],       4               \n\t"
651
652      /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
653                                   q3 + q4 + q5 + q6 + q7 * 3, 4) */
654      "shll.ph       %[tmp],            %[q7],            1               \n\t"
655      "addu.ph       %[res_oq2],        %[tmp],           %[q7]           \n\t"
656      "addu.ph       %[res_oq2],        %[res_oq2],       %[q2]           \n\t"
657      "addu.ph       %[res_oq2],        %[res_oq2],       %[add_p6toq6]   \n\t"
658      "subu.ph       %[res_oq2],        %[res_oq2],       %[p5]           \n\t"
659      "subu.ph       %[res_oq2],        %[res_oq2],       %[p6]           \n\t"
660      "shrl.ph       %[res_oq2],        %[res_oq2],       4               \n\t"
661
662      /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
663                                   q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
664      "shll.ph       %[tmp],            %[q7],            2               \n\t"
665      "addu.ph       %[res_oq3],        %[tmp],           %[q3]           \n\t"
666      "addu.ph       %[res_oq3],        %[res_oq3],       %[add_p6toq6]   \n\t"
667      "subu.ph       %[res_oq3],        %[res_oq3],       %[p4]           \n\t"
668      "subu.ph       %[res_oq3],        %[res_oq3],       %[p5]           \n\t"
669      "subu.ph       %[res_oq3],        %[res_oq3],       %[p6]           \n\t"
670      "shrl.ph       %[res_oq3],        %[res_oq3],       4               \n\t"
671
672      /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
673                                   q4 * 2 + q5 + q6 + q7 * 5, 4) */
674      "shll.ph       %[tmp],            %[q7],            2               \n\t"
675      "addu.ph       %[res_oq4],        %[tmp],           %[q7]           \n\t"
676      "addu.ph       %[res_oq4],        %[res_oq4],       %[q4]           \n\t"
677      "addu.ph       %[res_oq4],        %[res_oq4],       %[add_p6toq6]   \n\t"
678      "subu.ph       %[res_oq4],        %[res_oq4],       %[p3]           \n\t"
679      "subu.ph       %[res_oq4],        %[res_oq4],       %[p4]           \n\t"
680      "subu.ph       %[res_oq4],        %[res_oq4],       %[p5]           \n\t"
681      "subu.ph       %[res_oq4],        %[res_oq4],       %[p6]           \n\t"
682      "shrl.ph       %[res_oq4],        %[res_oq4],       4               \n\t"
683
684      /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
685                                   q5 * 2 + q6 + q7 * 6, 4) */
686      "shll.ph       %[tmp],            %[q7],            2               \n\t"
687      "addu.ph       %[res_oq5],        %[tmp],           %[q7]           \n\t"
688      "addu.ph       %[res_oq5],        %[res_oq5],       %[q7]           \n\t"
689      "addu.ph       %[res_oq5],        %[res_oq5],       %[q5]           \n\t"
690      "addu.ph       %[res_oq5],        %[res_oq5],       %[add_p6toq6]   \n\t"
691      "subu.ph       %[res_oq5],        %[res_oq5],       %[p2]           \n\t"
692      "subu.ph       %[res_oq5],        %[res_oq5],       %[p3]           \n\t"
693      "subu.ph       %[res_oq5],        %[res_oq5],       %[p4]           \n\t"
694      "subu.ph       %[res_oq5],        %[res_oq5],       %[p5]           \n\t"
695      "subu.ph       %[res_oq5],        %[res_oq5],       %[p6]           \n\t"
696      "shrl.ph       %[res_oq5],        %[res_oq5],       4               \n\t"
697
698      /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
699                                   q4 + q5 + q6 * 2 + q7 * 7, 4) */
700      "shll.ph       %[tmp],            %[q7],            3               \n\t"
701      "subu.ph       %[res_oq6],        %[tmp],           %[q7]           \n\t"
702      "addu.ph       %[res_oq6],        %[res_oq6],       %[q6]           \n\t"
703      "addu.ph       %[res_oq6],        %[res_oq6],       %[add_p6toq6]   \n\t"
704      "subu.ph       %[res_oq6],        %[res_oq6],       %[p1]           \n\t"
705      "subu.ph       %[res_oq6],        %[res_oq6],       %[p2]           \n\t"
706      "subu.ph       %[res_oq6],        %[res_oq6],       %[p3]           \n\t"
707      "subu.ph       %[res_oq6],        %[res_oq6],       %[p4]           \n\t"
708      "subu.ph       %[res_oq6],        %[res_oq6],       %[p5]           \n\t"
709      "subu.ph       %[res_oq6],        %[res_oq6],       %[p6]           \n\t"
710      "shrl.ph       %[res_oq6],        %[res_oq6],       4               \n\t"
711
712      : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
713        [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
714        [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
715        [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
716      : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
717        [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
718        [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
719        [add_p6toq6] "r"(add_p6toq6));
720
721  *oq0 = res_oq0;
722  *oq1 = res_oq1;
723  *oq2 = res_oq2;
724  *oq3 = res_oq3;
725  *oq4 = res_oq4;
726  *oq5 = res_oq5;
727  *oq6 = res_oq6;
728}
729#endif  // #if HAVE_DSPR2
730#ifdef __cplusplus
731}  // extern "C"
732#endif
733
734#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
735