/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
12#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
13
14#include <stdlib.h>
15
16#include "./vp9_rtcd.h"
17#include "vp9/common/vp9_common.h"
18#include "vp9/common/vp9_onyxc_int.h"
19
20#ifdef __cplusplus
21extern "C" {
22#endif
23
24#if HAVE_DSPR2
25/* processing 4 pixels at the same time
26 * compute hev and mask in the same function */
27static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
28                                             uint32_t p1, uint32_t p0,
29                                             uint32_t p3, uint32_t p2,
30                                             uint32_t q0, uint32_t q1,
31                                             uint32_t q2, uint32_t q3,
32                                             uint32_t thresh, uint32_t *hev,
33                                             uint32_t *mask) {
34  uint32_t  c, r, r3, r_k;
35  uint32_t  s1, s2, s3;
36  uint32_t  ones = 0xFFFFFFFF;
37  uint32_t  hev1;
38
39  __asm__ __volatile__ (
40      /* mask |= (abs(p3 - p2) > limit) */
41      "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
42      "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
43      "or             %[r_k], %[r_k],    %[c]         \n\t"
44      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
45      "or             %[r],   $0,        %[c]         \n\t"
46
47      /* mask |= (abs(p2 - p1) > limit) */
48      "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
49      "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
50      "or             %[r_k], %[r_k],    %[c]         \n\t"
51      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
52      "or             %[r],   %[r],      %[c]         \n\t"
53
54      /* mask |= (abs(p1 - p0) > limit)
55       * hev  |= (abs(p1 - p0) > thresh)
56       */
57      "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
58      "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
59      "or             %[r_k], %[r_k],    %[c]         \n\t"
60      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
61      "or             %[r3],  $0,        %[c]         \n\t"
62      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
63      "or             %[r],   %[r],      %[c]         \n\t"
64
65      /* mask |= (abs(q1 - q0) > limit)
66       * hev  |= (abs(q1 - q0) > thresh)
67       */
68      "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
69      "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
70      "or             %[r_k], %[r_k],    %[c]         \n\t"
71      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
72      "or             %[r3],  %[r3],     %[c]         \n\t"
73      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
74      "or             %[r],   %[r],      %[c]         \n\t"
75
76      /* mask |= (abs(q2 - q1) > limit) */
77      "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
78      "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
79      "or             %[r_k], %[r_k],    %[c]         \n\t"
80      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
81      "or             %[r],   %[r],      %[c]         \n\t"
82      "sll            %[r3],    %[r3],    24          \n\t"
83
84      /* mask |= (abs(q3 - q2) > limit) */
85      "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
86      "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
87      "or             %[r_k], %[r_k],    %[c]         \n\t"
88      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
89      "or             %[r],   %[r],      %[c]         \n\t"
90
91      : [c] "=&r" (c), [r_k] "=&r" (r_k),
92        [r] "=&r" (r), [r3] "=&r" (r3)
93      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
94        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
95        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
96  );
97
98  __asm__ __volatile__ (
99      /* abs(p0 - q0) */
100      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
101      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
102      "wrdsp          %[r3]                           \n\t"
103      "or             %[s1],  %[r_k],    %[c]         \n\t"
104
105      /* abs(p1 - q1) */
106      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
107      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
108      "pick.qb        %[hev1], %[ones],  $0           \n\t"
109      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
110      "or             %[s2],   %[r_k],   %[c]         \n\t"
111
112      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
113      "shrl.qb        %[s2],   %[s2],     1           \n\t"
114      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
115      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
116      "or             %[r],    %[r],      %[c]        \n\t"
117      "sll            %[r],    %[r],      24          \n\t"
118
119      "wrdsp          %[r]                            \n\t"
120      "pick.qb        %[s2],  $0,         %[ones]     \n\t"
121
122      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
123        [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
124      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
125        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
126  );
127
128  *hev = hev1;
129  *mask = s2;
130}
131
132static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
133                                                       uint32_t flimit,
134                                                       uint32_t thresh,
135                                                       uint32_t p1, uint32_t p0,
136                                                       uint32_t p3, uint32_t p2,
137                                                       uint32_t q0, uint32_t q1,
138                                                       uint32_t q2, uint32_t q3,
139                                                       uint32_t *hev,
140                                                       uint32_t *mask,
141                                                       uint32_t *flat) {
142  uint32_t  c, r, r3, r_k, r_flat;
143  uint32_t  s1, s2, s3;
144  uint32_t  ones = 0xFFFFFFFF;
145  uint32_t  flat_thresh = 0x01010101;
146  uint32_t  hev1;
147  uint32_t  flat1;
148
149  __asm__ __volatile__ (
150      /* mask |= (abs(p3 - p2) > limit) */
151      "subu_s.qb      %[c],       %[p3],          %[p2]        \n\t"
152      "subu_s.qb      %[r_k],     %[p2],          %[p3]        \n\t"
153      "or             %[r_k],     %[r_k],         %[c]         \n\t"
154      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
155      "or             %[r],       $0,             %[c]         \n\t"
156
157      /* mask |= (abs(p2 - p1) > limit) */
158      "subu_s.qb      %[c],       %[p2],          %[p1]        \n\t"
159      "subu_s.qb      %[r_k],     %[p1],          %[p2]        \n\t"
160      "or             %[r_k],     %[r_k],         %[c]         \n\t"
161      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
162      "or             %[r],       %[r],           %[c]         \n\t"
163
164      /* mask |= (abs(p1 - p0) > limit)
165       * hev  |= (abs(p1 - p0) > thresh)
166       * flat |= (abs(p1 - p0) > thresh)
167       */
168      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
169      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
170      "or             %[r_k],     %[r_k],         %[c]         \n\t"
171      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
172      "or             %[r3],      $0,             %[c]         \n\t"
173      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
174      "or             %[r],       %[r],           %[c]         \n\t"
175      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
176      "or             %[r_flat],  $0,             %[c]         \n\t"
177
178      /* mask |= (abs(q1 - q0) > limit)
179       * hev  |= (abs(q1 - q0) > thresh)
180       * flat |= (abs(q1 - q0) > thresh)
181       */
182      "subu_s.qb      %[c],       %[q1],          %[q0]        \n\t"
183      "subu_s.qb      %[r_k],     %[q0],          %[q1]        \n\t"
184      "or             %[r_k],     %[r_k],         %[c]         \n\t"
185      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
186      "or             %[r3],      %[r3],          %[c]         \n\t"
187      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
188      "or             %[r],       %[r],           %[c]         \n\t"
189      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
190      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
191
192      /* flat |= (abs(p0 - p2) > thresh) */
193      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
194      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
195      "or             %[r_k],     %[r_k],         %[c]         \n\t"
196      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
197      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
198
199      /* flat |= (abs(q0 - q2) > thresh) */
200      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
201      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
202      "or             %[r_k],     %[r_k],         %[c]         \n\t"
203      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
204      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
205
206      /* flat |= (abs(p3 - p0) > thresh) */
207      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
208      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
209      "or             %[r_k],     %[r_k],         %[c]         \n\t"
210      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
211      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
212
213      /* flat |= (abs(q3 - q0) > thresh) */
214      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
215      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
216      "or             %[r_k],     %[r_k],         %[c]         \n\t"
217      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
218      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
219      "sll            %[r_flat],  %[r_flat],      24           \n\t"
220      /* look at stall here */
221      "wrdsp          %[r_flat]                                \n\t"
222      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
223
224      /* mask |= (abs(q2 - q1) > limit) */
225      "subu_s.qb      %[c],       %[q2],          %[q1]        \n\t"
226      "subu_s.qb      %[r_k],     %[q1],          %[q2]        \n\t"
227      "or             %[r_k],     %[r_k],         %[c]         \n\t"
228      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
229      "or             %[r],       %[r],           %[c]         \n\t"
230      "sll            %[r3],      %[r3],          24           \n\t"
231
232      /* mask |= (abs(q3 - q2) > limit) */
233      "subu_s.qb      %[c],       %[q3],          %[q2]        \n\t"
234      "subu_s.qb      %[r_k],     %[q2],          %[q3]        \n\t"
235      "or             %[r_k],     %[r_k],         %[c]         \n\t"
236      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
237      "or             %[r],       %[r],           %[c]         \n\t"
238
239      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3),
240        [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1)
241      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
242        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
243        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
244        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
245  );
246
247  __asm__ __volatile__ (
248      /* abs(p0 - q0) */
249      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
250      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
251      "wrdsp          %[r3]                           \n\t"
252      "or             %[s1],  %[r_k],    %[c]         \n\t"
253
254      /* abs(p1 - q1) */
255      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
256      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
257      "pick.qb        %[hev1], %[ones],  $0           \n\t"
258      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
259      "or             %[s2],   %[r_k],   %[c]         \n\t"
260
261      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
262      "shrl.qb        %[s2],   %[s2],     1           \n\t"
263      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
264      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
265      "or             %[r],    %[r],      %[c]        \n\t"
266      "sll            %[r],    %[r],      24          \n\t"
267
268      "wrdsp          %[r]                            \n\t"
269      "pick.qb        %[s2],   $0,        %[ones]     \n\t"
270
271      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
272        [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
273      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
274        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
275  );
276
277  *hev = hev1;
278  *mask = s2;
279  *flat = flat1;
280}
281
282static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
283                                 uint32_t p2, uint32_t p1,
284                                 uint32_t p0, uint32_t q0,
285                                 uint32_t q1, uint32_t q2,
286                                 uint32_t q3, uint32_t q4,
287                                 uint32_t *flat2) {
288  uint32_t  c, r, r_k, r_flat;
289  uint32_t  ones = 0xFFFFFFFF;
290  uint32_t  flat_thresh = 0x01010101;
291  uint32_t  flat1, flat3;
292
293  __asm__ __volatile__ (
294      /* flat |= (abs(p4 - p0) > thresh) */
295      "subu_s.qb      %[c],   %[p4],           %[p0]        \n\t"
296      "subu_s.qb      %[r_k], %[p0],           %[p4]        \n\t"
297      "or             %[r_k], %[r_k],          %[c]         \n\t"
298      "cmpgu.lt.qb    %[c],   %[flat_thresh],  %[r_k]       \n\t"
299      "or             %[r],   $0,              %[c]         \n\t"
300
301      /* flat |= (abs(q4 - q0) > thresh) */
302      "subu_s.qb      %[c],     %[q4],           %[q0]     \n\t"
303      "subu_s.qb      %[r_k],   %[q0],           %[q4]     \n\t"
304      "or             %[r_k],   %[r_k],          %[c]      \n\t"
305      "cmpgu.lt.qb    %[c],     %[flat_thresh],  %[r_k]    \n\t"
306      "or             %[r],     %[r],            %[c]      \n\t"
307      "sll            %[r],     %[r],            24        \n\t"
308      "wrdsp          %[r]                                 \n\t"
309      "pick.qb        %[flat3], $0,           %[ones]      \n\t"
310
311      /* flat |= (abs(p1 - p0) > thresh) */
312      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
313      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
314      "or             %[r_k],     %[r_k],         %[c]         \n\t"
315      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
316      "or             %[r_flat],  $0,             %[c]         \n\t"
317
318      /* flat |= (abs(q1 - q0) > thresh) */
319      "subu_s.qb      %[c],      %[q1],           %[q0]        \n\t"
320      "subu_s.qb      %[r_k],    %[q0],           %[q1]        \n\t"
321      "or             %[r_k],    %[r_k],          %[c]         \n\t"
322      "cmpgu.lt.qb    %[c],      %[flat_thresh],  %[r_k]       \n\t"
323      "or             %[r_flat], %[r_flat],       %[c]         \n\t"
324
325      /* flat |= (abs(p0 - p2) > thresh) */
326      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
327      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
328      "or             %[r_k],     %[r_k],         %[c]         \n\t"
329      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
330      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
331
332      /* flat |= (abs(q0 - q2) > thresh) */
333      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
334      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
335      "or             %[r_k],     %[r_k],         %[c]         \n\t"
336      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
337      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
338
339      /* flat |= (abs(p3 - p0) > thresh) */
340      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
341      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
342      "or             %[r_k],     %[r_k],         %[c]         \n\t"
343      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
344      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
345
346      /* flat |= (abs(q3 - q0) > thresh) */
347      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
348      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
349      "or             %[r_k],     %[r_k],         %[c]         \n\t"
350      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
351      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
352      "sll            %[r_flat],  %[r_flat],      24           \n\t"
353      "wrdsp          %[r_flat]                                \n\t"
354      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
355      /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
356      "and            %[flat1],  %[flat3],        %[flat1]     \n\t"
357
358      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r),
359        [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3)
360      : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2),
361        [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1),
362        [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4),
363        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
364  );
365
366  *flat2 = flat1;
367}
368#endif  // #if HAVE_DSPR2
369#ifdef __cplusplus
370}  // extern "C"
371#endif
372
373#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
374