1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
12#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
13
14#include <stdlib.h>
15
16#include "./vp9_rtcd.h"
17#include "vp9/common/vp9_common.h"
18#include "vp9/common/vp9_onyxc_int.h"
19
20#ifdef __cplusplus
21extern "C" {
22#endif
23
24#if HAVE_DSPR2
25/* inputs & outputs are quad-byte vectors */
26static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
27                                    uint32_t *ps1, uint32_t *ps0,
28                                    uint32_t *qs0, uint32_t *qs1) {
29  int32_t   vp9_filter_l, vp9_filter_r;
30  int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
31  int32_t   subr_r, subr_l;
32  uint32_t  t1, t2, HWM, t3;
33  uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
34  int32_t   vps1, vps0, vqs0, vqs1;
35  int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
36  uint32_t  N128;
37
38  N128 = 0x80808080;
39  t1  = 0x03000300;
40  t2  = 0x04000400;
41  t3  = 0x01000100;
42  HWM = 0xFF00FF00;
43
44  vps0 = (*ps0) ^ N128;
45  vps1 = (*ps1) ^ N128;
46  vqs0 = (*qs0) ^ N128;
47  vqs1 = (*qs1) ^ N128;
48
49  /* use halfword pairs instead quad-bytes because of accuracy */
50  vps0_l = vps0 & HWM;
51  vps0_r = vps0 << 8;
52  vps0_r = vps0_r & HWM;
53
54  vps1_l = vps1 & HWM;
55  vps1_r = vps1 << 8;
56  vps1_r = vps1_r & HWM;
57
58  vqs0_l = vqs0 & HWM;
59  vqs0_r = vqs0 << 8;
60  vqs0_r = vqs0_r & HWM;
61
62  vqs1_l = vqs1 & HWM;
63  vqs1_r = vqs1 << 8;
64  vqs1_r = vqs1_r & HWM;
65
66  mask_l = mask & HWM;
67  mask_r = mask << 8;
68  mask_r = mask_r & HWM;
69
70  hev_l = hev & HWM;
71  hev_r = hev << 8;
72  hev_r = hev_r & HWM;
73
74  __asm__ __volatile__ (
75      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
76      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
77      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
78
79      /* qs0 - ps0 */
80      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
81      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
82
83      /* vp9_filter &= hev; */
84      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
85      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
86
87      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
88      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
89      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
90      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
91      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
92      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
93      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
94      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
95      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
96
97      /* vp9_filter &= mask; */
98      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
99      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
100
101      : [vp9_filter_l] "=&r" (vp9_filter_l),
102        [vp9_filter_r] "=&r" (vp9_filter_r),
103        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
104        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
105      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
106        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
107        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
108        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
109        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
110        [HWM] "r" (HWM)
111  );
112
113  /* save bottom 3 bits so that we round one side +4 and the other +3 */
114  __asm__ __volatile__ (
115      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
116      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
117      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
118
119      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
120      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
121      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
122      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
123      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
124
125      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
126      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
127
128      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
129      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
130
131      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
132      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
133      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
134
135      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
136      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
137      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
138
139      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
140        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
141        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
142        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
143      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
144        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
145  );
146
147  __asm__ __volatile__ (
148      /* (vp9_filter += 1) >>= 1 */
149      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
150      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
151
152      /* vp9_filter &= ~hev; */
153      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
154      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
155
156      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
157      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
158      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
159
160      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
161      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
162      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
163
164      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
165        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
166        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
167      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
168  );
169
170  /* Create quad-bytes from halfword pairs */
171  vqs0_l = vqs0_l & HWM;
172  vqs1_l = vqs1_l & HWM;
173  vps0_l = vps0_l & HWM;
174  vps1_l = vps1_l & HWM;
175
176  __asm__ __volatile__ (
177      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
178      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
179      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
180      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
181
182      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
183        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
184      :
185  );
186
187  vqs0 = vqs0_l | vqs0_r;
188  vqs1 = vqs1_l | vqs1_r;
189  vps0 = vps0_l | vps0_r;
190  vps1 = vps1_l | vps1_r;
191
192  *ps0 = vps0 ^ N128;
193  *ps1 = vps1 ^ N128;
194  *qs0 = vqs0 ^ N128;
195  *qs1 = vqs1 ^ N128;
196}
197
198static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev,
199                                     uint32_t ps1, uint32_t ps0,
200                                     uint32_t qs0, uint32_t qs1,
201                                     uint32_t *p1_f0, uint32_t *p0_f0,
202                                     uint32_t *q0_f0, uint32_t *q1_f0) {
203  int32_t   vp9_filter_l, vp9_filter_r;
204  int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
205  int32_t   subr_r, subr_l;
206  uint32_t  t1, t2, HWM, t3;
207  uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
208  int32_t   vps1, vps0, vqs0, vqs1;
209  int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
210  uint32_t  N128;
211
212  N128 = 0x80808080;
213  t1  = 0x03000300;
214  t2  = 0x04000400;
215  t3  = 0x01000100;
216  HWM = 0xFF00FF00;
217
218  vps0 = (ps0) ^ N128;
219  vps1 = (ps1) ^ N128;
220  vqs0 = (qs0) ^ N128;
221  vqs1 = (qs1) ^ N128;
222
223  /* use halfword pairs instead quad-bytes because of accuracy */
224  vps0_l = vps0 & HWM;
225  vps0_r = vps0 << 8;
226  vps0_r = vps0_r & HWM;
227
228  vps1_l = vps1 & HWM;
229  vps1_r = vps1 << 8;
230  vps1_r = vps1_r & HWM;
231
232  vqs0_l = vqs0 & HWM;
233  vqs0_r = vqs0 << 8;
234  vqs0_r = vqs0_r & HWM;
235
236  vqs1_l = vqs1 & HWM;
237  vqs1_r = vqs1 << 8;
238  vqs1_r = vqs1_r & HWM;
239
240  mask_l = mask & HWM;
241  mask_r = mask << 8;
242  mask_r = mask_r & HWM;
243
244  hev_l = hev & HWM;
245  hev_r = hev << 8;
246  hev_r = hev_r & HWM;
247
248  __asm__ __volatile__ (
249      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
250      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
251      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
252
253      /* qs0 - ps0 */
254      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
255      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
256
257      /* vp9_filter &= hev; */
258      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
259      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
260
261      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
262      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
263      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
264      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
265      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
266      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
267      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
268      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
269      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
270
271      /* vp9_filter &= mask; */
272      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
273      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
274
275      : [vp9_filter_l] "=&r" (vp9_filter_l),
276        [vp9_filter_r] "=&r" (vp9_filter_r),
277        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
278        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
279      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
280        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
281        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
282        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
283        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
284  );
285
286  /* save bottom 3 bits so that we round one side +4 and the other +3 */
287  __asm__ __volatile__ (
288      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
289      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
290      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
291
292      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
293      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
294      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
295      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
296      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
297
298      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
299      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
300
301      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
302      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
303
304      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
305      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
306      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
307
308      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
309      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
310      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
311
312      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
313        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
314        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
315        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
316      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
317        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
318  );
319
320  __asm__ __volatile__ (
321      /* (vp9_filter += 1) >>= 1 */
322      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
323      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
324
325      /* vp9_filter &= ~hev; */
326      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
327      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
328
329      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
330      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
331      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
332
333      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
334      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
335      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
336
337      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
338        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
339        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
340      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
341  );
342
343  /* Create quad-bytes from halfword pairs */
344  vqs0_l = vqs0_l & HWM;
345  vqs1_l = vqs1_l & HWM;
346  vps0_l = vps0_l & HWM;
347  vps1_l = vps1_l & HWM;
348
349  __asm__ __volatile__ (
350      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
351      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
352      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
353      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
354
355      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
356        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
357      :
358  );
359
360  vqs0 = vqs0_l | vqs0_r;
361  vqs1 = vqs1_l | vqs1_r;
362  vps0 = vps0_l | vps0_r;
363  vps1 = vps1_l | vps1_r;
364
365  *p0_f0 = vps0 ^ N128;
366  *p1_f0 = vps1 ^ N128;
367  *q0_f0 = vqs0 ^ N128;
368  *q1_f0 = vqs1 ^ N128;
369}
370
371static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
372                                      uint32_t *op1, uint32_t *op0,
373                                      uint32_t *oq0, uint32_t *oq1,
374                                      uint32_t *oq2, uint32_t *oq3) {
375  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
376  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
377  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
378  uint32_t       res_op2, res_op1, res_op0;
379  uint32_t       res_oq0, res_oq1, res_oq2;
380  uint32_t       tmp;
381  uint32_t       add_p210_q012;
382  uint32_t       u32Four = 0x00040004;
383
384  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
385  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
386  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
387  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
388  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
389  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */
390
391  __asm__ __volatile__ (
392      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
393      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
394      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
395      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
396      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
397      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"
398
399      "shll.ph    %[tmp],            %[p3],             1                \n\t"
400      "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
401      "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
402      "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
403      "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
404      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
405      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
406      "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
407      "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
408      "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
409      "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
410      "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
411      "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
412      "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
413      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
414      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
415      "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
416      "shll.ph    %[tmp],            %[q3],             1                \n\t"
417      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
418      "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
419      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
420      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
421      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
422      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
423      "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
424      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
425      "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
426      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
427      "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
428      "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"
429
430      : [add_p210_q012] "=&r" (add_p210_q012),
431        [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
432        [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
433        [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
434        [res_oq2] "=&r" (res_oq2)
435      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
436        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
437        [u32Four] "r" (u32Four)
438  );
439
440  *op2 = res_op2;
441  *op1 = res_op1;
442  *op0 = res_op0;
443  *oq0 = res_oq0;
444  *oq1 = res_oq1;
445  *oq2 = res_oq2;
446}
447
448static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2,
449                                       uint32_t p1, uint32_t p0,
450                                       uint32_t q0, uint32_t q1,
451                                       uint32_t q2, uint32_t q3,
452                                       uint32_t *op2_f1,
453                                       uint32_t *op1_f1, uint32_t *op0_f1,
454                                       uint32_t *oq0_f1, uint32_t *oq1_f1,
455                                       uint32_t *oq2_f1) {
456  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
457  uint32_t  res_op2, res_op1, res_op0;
458  uint32_t  res_oq0, res_oq1, res_oq2;
459  uint32_t  tmp;
460  uint32_t  add_p210_q012;
461  uint32_t  u32Four = 0x00040004;
462
463  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)   1 */
464  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)   2 */
465  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)   3 */
466  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)   4 */
467  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)   5 */
468  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)   6 */
469
470  __asm__ __volatile__ (
471      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]             \n\t"
472      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]             \n\t"
473      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]             \n\t"
474      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]             \n\t"
475      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]             \n\t"
476      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]        \n\t"
477
478      "shll.ph    %[tmp],            %[p3],             1                 \n\t"
479      "addu.ph    %[res_op2],        %[tmp],            %[p3]             \n\t"
480      "addu.ph    %[res_op1],        %[p3],             %[p3]             \n\t"
481      "addu.ph    %[res_op2],        %[res_op2],        %[p2]             \n\t"
482      "addu.ph    %[res_op1],        %[res_op1],        %[p1]             \n\t"
483      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012]  \n\t"
484      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012]  \n\t"
485      "subu.ph    %[res_op2],        %[res_op2],        %[q1]             \n\t"
486      "subu.ph    %[res_op1],        %[res_op1],        %[q2]             \n\t"
487      "subu.ph    %[res_op2],        %[res_op2],        %[q2]             \n\t"
488      "shrl.ph    %[res_op1],        %[res_op1],        3                 \n\t"
489      "shrl.ph    %[res_op2],        %[res_op2],        3                 \n\t"
490      "addu.ph    %[res_op0],        %[p3],             %[p0]             \n\t"
491      "addu.ph    %[res_oq0],        %[q0],             %[q3]             \n\t"
492      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012]  \n\t"
493      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012]  \n\t"
494      "addu.ph    %[res_oq1],        %[q3],             %[q3]             \n\t"
495      "shll.ph    %[tmp],            %[q3],             1                 \n\t"
496      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]             \n\t"
497      "addu.ph    %[res_oq2],        %[tmp],            %[q3]             \n\t"
498      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012]  \n\t"
499      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012]  \n\t"
500      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]             \n\t"
501      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]             \n\t"
502      "shrl.ph    %[res_oq1],        %[res_oq1],        3                 \n\t"
503      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]             \n\t"
504      "shrl.ph    %[res_oq0],        %[res_oq0],        3                 \n\t"
505      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]             \n\t"
506      "shrl.ph    %[res_op0],        %[res_op0],        3                 \n\t"
507      "shrl.ph    %[res_oq2],        %[res_oq2],        3                 \n\t"
508
509      : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp),
510        [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
511        [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0),
512        [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2)
513      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
514        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
515        [u32Four] "r" (u32Four)
516  );
517
518  *op2_f1 = res_op2;
519  *op1_f1 = res_op1;
520  *op0_f1 = res_op0;
521  *oq0_f1 = res_oq0;
522  *oq1_f1 = res_oq1;
523  *oq2_f1 = res_oq2;
524}
525
526static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
527                                           uint32_t *op5, uint32_t *op4,
528                                           uint32_t *op3, uint32_t *op2,
529                                           uint32_t *op1, uint32_t *op0,
530                                           uint32_t *oq0, uint32_t *oq1,
531                                           uint32_t *oq2, uint32_t *oq3,
532                                           uint32_t *oq4, uint32_t *oq5,
533                                           uint32_t *oq6, uint32_t *oq7) {
534  const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
535  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
536  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
537  const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
538  uint32_t       res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
539  uint32_t       res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
540  uint32_t       tmp;
541  uint32_t       add_p6toq6;
542  uint32_t       u32Eight = 0x00080008;
543
544  __asm__ __volatile__ (
545      /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
546         which is used most of the time */
547      "addu.ph      %[add_p6toq6],     %[p6],              %[p5]         \n\t"
548      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p4]         \n\t"
549      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p3]         \n\t"
550      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p2]         \n\t"
551      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p1]         \n\t"
552      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p0]         \n\t"
553      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q0]         \n\t"
554      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q1]         \n\t"
555      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q2]         \n\t"
556      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q3]         \n\t"
557      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q4]         \n\t"
558      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q5]         \n\t"
559      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q6]         \n\t"
560      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[u32Eight]   \n\t"
561
562      : [add_p6toq6] "=&r" (add_p6toq6)
563      : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
564        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
565        [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
566        [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
567        [u32Eight] "r" (u32Eight)
568  );
569
570  __asm__ __volatile__ (
571      /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
572                                   p3 + p2 + p1 + p0 + q0, 4) */
573      "shll.ph       %[tmp],            %[p7],            3               \n\t"
574      "subu.ph       %[res_op6],        %[tmp],           %[p7]           \n\t"
575      "addu.ph       %[res_op6],        %[res_op6],       %[p6]           \n\t"
576      "addu.ph       %[res_op6],        %[res_op6],       %[add_p6toq6]   \n\t"
577      "subu.ph       %[res_op6],        %[res_op6],       %[q1]           \n\t"
578      "subu.ph       %[res_op6],        %[res_op6],       %[q2]           \n\t"
579      "subu.ph       %[res_op6],        %[res_op6],       %[q3]           \n\t"
580      "subu.ph       %[res_op6],        %[res_op6],       %[q4]           \n\t"
581      "subu.ph       %[res_op6],        %[res_op6],       %[q5]           \n\t"
582      "subu.ph       %[res_op6],        %[res_op6],       %[q6]           \n\t"
583      "shrl.ph       %[res_op6],        %[res_op6],       4               \n\t"
584
585      /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
586                                   p2 + p1 + p0 + q0 + q1, 4) */
587      "shll.ph       %[tmp],            %[p7],            2               \n\t"
588      "addu.ph       %[res_op5],        %[tmp],           %[p7]           \n\t"
589      "addu.ph       %[res_op5],        %[res_op5],       %[p7]           \n\t"
590      "addu.ph       %[res_op5],        %[res_op5],       %[p5]           \n\t"
591      "addu.ph       %[res_op5],        %[res_op5],       %[add_p6toq6]   \n\t"
592      "subu.ph       %[res_op5],        %[res_op5],       %[q2]           \n\t"
593      "subu.ph       %[res_op5],        %[res_op5],       %[q3]           \n\t"
594      "subu.ph       %[res_op5],        %[res_op5],       %[q4]           \n\t"
595      "subu.ph       %[res_op5],        %[res_op5],       %[q5]           \n\t"
596      "subu.ph       %[res_op5],        %[res_op5],       %[q6]           \n\t"
597      "shrl.ph       %[res_op5],        %[res_op5],       4               \n\t"
598
599      /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
600                                   p1 + p0 + q0 + q1 + q2, 4) */
601      "shll.ph       %[tmp],            %[p7],            2               \n\t"
602      "addu.ph       %[res_op4],        %[tmp],           %[p7]           \n\t"
603      "addu.ph       %[res_op4],        %[res_op4],       %[p4]           \n\t"
604      "addu.ph       %[res_op4],        %[res_op4],       %[add_p6toq6]   \n\t"
605      "subu.ph       %[res_op4],        %[res_op4],       %[q3]           \n\t"
606      "subu.ph       %[res_op4],        %[res_op4],       %[q4]           \n\t"
607      "subu.ph       %[res_op4],        %[res_op4],       %[q5]           \n\t"
608      "subu.ph       %[res_op4],        %[res_op4],       %[q6]           \n\t"
609      "shrl.ph       %[res_op4],        %[res_op4],       4               \n\t"
610
611      /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
612                                   p1 + p0 + q0 + q1 + q2 + q3, 4) */
613      "shll.ph       %[tmp],            %[p7],            2               \n\t"
614      "addu.ph       %[res_op3],        %[tmp],           %[p3]           \n\t"
615      "addu.ph       %[res_op3],        %[res_op3],       %[add_p6toq6]   \n\t"
616      "subu.ph       %[res_op3],        %[res_op3],       %[q4]           \n\t"
617      "subu.ph       %[res_op3],        %[res_op3],       %[q5]           \n\t"
618      "subu.ph       %[res_op3],        %[res_op3],       %[q6]           \n\t"
619      "shrl.ph       %[res_op3],        %[res_op3],       4               \n\t"
620
621      /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
622                                   p0 + q0 + q1 + q2 + q3 + q4, 4) */
623      "shll.ph       %[tmp],            %[p7],            1               \n\t"
624      "addu.ph       %[res_op2],        %[tmp],           %[p7]           \n\t"
625      "addu.ph       %[res_op2],        %[res_op2],       %[p2]           \n\t"
626      "addu.ph       %[res_op2],        %[res_op2],       %[add_p6toq6]   \n\t"
627      "subu.ph       %[res_op2],        %[res_op2],       %[q5]           \n\t"
628      "subu.ph       %[res_op2],        %[res_op2],       %[q6]           \n\t"
629      "shrl.ph       %[res_op2],        %[res_op2],       4               \n\t"
630
631      /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
632                                   p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
633      "shll.ph       %[tmp],            %[p7],            1               \n\t"
634      "addu.ph       %[res_op1],        %[tmp],           %[p1]           \n\t"
635      "addu.ph       %[res_op1],        %[res_op1],       %[add_p6toq6]   \n\t"
636      "subu.ph       %[res_op1],        %[res_op1],       %[q6]           \n\t"
637      "shrl.ph       %[res_op1],        %[res_op1],       4               \n\t"
638
639      /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
640                                  q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
641      "addu.ph       %[res_op0],        %[p7],            %[p0]           \n\t"
642      "addu.ph       %[res_op0],        %[res_op0],       %[add_p6toq6]   \n\t"
643      "shrl.ph       %[res_op0],        %[res_op0],       4               \n\t"
644
645      : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5),
646        [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3),
647        [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
648        [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp)
649      : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
650        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
651        [q2] "r" (q2), [q1] "r" (q1),
652        [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
653        [add_p6toq6] "r" (add_p6toq6)
654  );
655
656  *op6 = res_op6;
657  *op5 = res_op5;
658  *op4 = res_op4;
659  *op3 = res_op3;
660  *op2 = res_op2;
661  *op1 = res_op1;
662  *op0 = res_op0;
663
664  __asm__ __volatile__ (
665      /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
666                                   q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
667      "addu.ph       %[res_oq0],        %[q7],            %[q0]           \n\t"
668      "addu.ph       %[res_oq0],        %[res_oq0],       %[add_p6toq6]   \n\t"
669      "shrl.ph       %[res_oq0],        %[res_oq0],       4               \n\t"
670
671      /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
672                                   q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
673      "shll.ph       %[tmp],            %[q7],            1               \n\t"
674      "addu.ph       %[res_oq1],        %[tmp],           %[q1]           \n\t"
675      "addu.ph       %[res_oq1],        %[res_oq1],       %[add_p6toq6]   \n\t"
676      "subu.ph       %[res_oq1],        %[res_oq1],       %[p6]           \n\t"
677      "shrl.ph       %[res_oq1],        %[res_oq1],       4               \n\t"
678
679      /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
680                                   q3 + q4 + q5 + q6 + q7 * 3, 4) */
681      "shll.ph       %[tmp],            %[q7],            1               \n\t"
682      "addu.ph       %[res_oq2],        %[tmp],           %[q7]           \n\t"
683      "addu.ph       %[res_oq2],        %[res_oq2],       %[q2]           \n\t"
684      "addu.ph       %[res_oq2],        %[res_oq2],       %[add_p6toq6]   \n\t"
685      "subu.ph       %[res_oq2],        %[res_oq2],       %[p5]           \n\t"
686      "subu.ph       %[res_oq2],        %[res_oq2],       %[p6]           \n\t"
687      "shrl.ph       %[res_oq2],        %[res_oq2],       4               \n\t"
688
689      /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
690                                   q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
691      "shll.ph       %[tmp],            %[q7],            2               \n\t"
692      "addu.ph       %[res_oq3],        %[tmp],           %[q3]           \n\t"
693      "addu.ph       %[res_oq3],        %[res_oq3],       %[add_p6toq6]   \n\t"
694      "subu.ph       %[res_oq3],        %[res_oq3],       %[p4]           \n\t"
695      "subu.ph       %[res_oq3],        %[res_oq3],       %[p5]           \n\t"
696      "subu.ph       %[res_oq3],        %[res_oq3],       %[p6]           \n\t"
697      "shrl.ph       %[res_oq3],        %[res_oq3],       4               \n\t"
698
699      /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
700                                   q4 * 2 + q5 + q6 + q7 * 5, 4) */
701      "shll.ph       %[tmp],            %[q7],            2               \n\t"
702      "addu.ph       %[res_oq4],        %[tmp],           %[q7]           \n\t"
703      "addu.ph       %[res_oq4],        %[res_oq4],       %[q4]           \n\t"
704      "addu.ph       %[res_oq4],        %[res_oq4],       %[add_p6toq6]   \n\t"
705      "subu.ph       %[res_oq4],        %[res_oq4],       %[p3]           \n\t"
706      "subu.ph       %[res_oq4],        %[res_oq4],       %[p4]           \n\t"
707      "subu.ph       %[res_oq4],        %[res_oq4],       %[p5]           \n\t"
708      "subu.ph       %[res_oq4],        %[res_oq4],       %[p6]           \n\t"
709      "shrl.ph       %[res_oq4],        %[res_oq4],       4               \n\t"
710
711      /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
712                                   q5 * 2 + q6 + q7 * 6, 4) */
713      "shll.ph       %[tmp],            %[q7],            2               \n\t"
714      "addu.ph       %[res_oq5],        %[tmp],           %[q7]           \n\t"
715      "addu.ph       %[res_oq5],        %[res_oq5],       %[q7]           \n\t"
716      "addu.ph       %[res_oq5],        %[res_oq5],       %[q5]           \n\t"
717      "addu.ph       %[res_oq5],        %[res_oq5],       %[add_p6toq6]   \n\t"
718      "subu.ph       %[res_oq5],        %[res_oq5],       %[p2]           \n\t"
719      "subu.ph       %[res_oq5],        %[res_oq5],       %[p3]           \n\t"
720      "subu.ph       %[res_oq5],        %[res_oq5],       %[p4]           \n\t"
721      "subu.ph       %[res_oq5],        %[res_oq5],       %[p5]           \n\t"
722      "subu.ph       %[res_oq5],        %[res_oq5],       %[p6]           \n\t"
723      "shrl.ph       %[res_oq5],        %[res_oq5],       4               \n\t"
724
725      /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
726                                   q4 + q5 + q6 * 2 + q7 * 7, 4) */
727      "shll.ph       %[tmp],            %[q7],            3               \n\t"
728      "subu.ph       %[res_oq6],        %[tmp],           %[q7]           \n\t"
729      "addu.ph       %[res_oq6],        %[res_oq6],       %[q6]           \n\t"
730      "addu.ph       %[res_oq6],        %[res_oq6],       %[add_p6toq6]   \n\t"
731      "subu.ph       %[res_oq6],        %[res_oq6],       %[p1]           \n\t"
732      "subu.ph       %[res_oq6],        %[res_oq6],       %[p2]           \n\t"
733      "subu.ph       %[res_oq6],        %[res_oq6],       %[p3]           \n\t"
734      "subu.ph       %[res_oq6],        %[res_oq6],       %[p4]           \n\t"
735      "subu.ph       %[res_oq6],        %[res_oq6],       %[p5]           \n\t"
736      "subu.ph       %[res_oq6],        %[res_oq6],       %[p6]           \n\t"
737      "shrl.ph       %[res_oq6],        %[res_oq6],       4               \n\t"
738
739      : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5),
740        [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3),
741        [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1),
742        [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp)
743      : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
744        [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
745        [p1] "r" (p1), [p2] "r" (p2),
746        [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6),
747        [add_p6toq6] "r" (add_p6toq6)
748  );
749
750  *oq0 = res_oq0;
751  *oq1 = res_oq1;
752  *oq2 = res_oq2;
753  *oq3 = res_oq3;
754  *oq4 = res_oq4;
755  *oq5 = res_oq5;
756  *oq6 = res_oq6;
757}
758#endif  // #if HAVE_DSPR2
759#ifdef __cplusplus
760}  // extern "C"
761#endif
762
763#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
764