1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
12#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
13
14#include <stdlib.h>
15
16#include "./vp9_rtcd.h"
17#include "vp9/common/vp9_common.h"
18#include "vp9/common/vp9_onyxc_int.h"
19
20#if HAVE_DSPR2
21/* processing 4 pixels at the same time
22 * compute hev and mask in the same function
 *
 * Every pixel argument (p3..p0, q0..q3) is handled as four packed bytes by
 * the .qb (quad-byte) DSP instructions used below.  limit, flimit and thresh
 * are compared byte-wise against the pixel differences, so callers presumably
 * pass each threshold replicated into all four bytes -- TODO confirm against
 * the callers.
 *
 * Note the argument order: p1, p0 come before p3, p2, and thresh follows q3.
 *
 * Outputs, per byte lane (this relies on pick.qb choosing between `ones` and
 * the zero register according to the condition bits loaded by wrdsp):
 *   *hev  - 0xFF where abs(p1 - p0) > thresh or abs(q1 - q0) > thresh,
 *           0x00 elsewhere.
 *   *mask - 0x00 where any of abs(p3 - p2), abs(p2 - p1), abs(p1 - p0),
 *           abs(q1 - q0), abs(q2 - q1), abs(q3 - q2) is greater than limit,
 *           or where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 (computed with
 *           saturating byte adds) is greater than flimit; 0xFF elsewhere.
 *
 * Side effect: both asm statements write the DSP control register (wrdsp).
 * NOTE(review): neither asm statement declares a clobber for that register.
 */
23static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
24                                             uint32_t p1, uint32_t p0,
25                                             uint32_t p3, uint32_t p2,
26                                             uint32_t q0, uint32_t q1,
27                                             uint32_t q2, uint32_t q3,
28                                             uint32_t thresh, uint32_t *hev,
29                                             uint32_t *mask) {
  /* c   : scratch, receives each compare result
   * r   : accumulated "difference exceeds limit" bits (feeds *mask)
   * r3  : accumulated "difference exceeds thresh" bits (feeds *hev)
   * r_k : scratch, holds the per-byte absolute difference
   */
30  uint32_t  c, r, r3, r_k;
  /* s1..s3: temporaries for the abs(p0 - q0) * 2 + abs(p1 - q1) / 2 term */
31  uint32_t  s1, s2, s3;
  /* all-ones source operand for pick.qb (0xFF in every byte) */
32  uint32_t  ones = 0xFFFFFFFF;
33  uint32_t  hev1;
34
  /* First asm statement: accumulate the compare bits for the six neighbouring
   * pixel differences into r (limit) and r3 (thresh).
   *
   * Idiom used for every difference: the two unsigned saturating subtractions
   * a - b and b - a are OR-ed together; one of them is always zero, so the
   * result is abs(a - b) in each byte.  cmpgu.lt.qb then compares the
   * threshold against it byte by byte and the result is OR-ed into r / r3.
   */
35  __asm__ __volatile__ (
36      /* mask |= (abs(p3 - p2) > limit) */
37      "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
38      "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
39      "or             %[r_k], %[r_k],    %[c]         \n\t"
40      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
      /* first term: initialise r from the compare result ($0 | c) */
41      "or             %[r],   $0,        %[c]         \n\t"
42
43      /* mask |= (abs(p2 - p1) > limit) */
44      "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
45      "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
46      "or             %[r_k], %[r_k],    %[c]         \n\t"
47      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
48      "or             %[r],   %[r],      %[c]         \n\t"
49
50      /* mask |= (abs(p1 - p0) > limit)
51       * hev  |= (abs(p1 - p0) > thresh)
52       */
53      "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
54      "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
55      "or             %[r_k], %[r_k],    %[c]         \n\t"
56      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
      /* first hev term: initialise r3 from the compare result */
57      "or             %[r3],  $0,        %[c]         \n\t"
58      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
59      "or             %[r],   %[r],      %[c]         \n\t"
60
61      /* mask |= (abs(q1 - q0) > limit)
62       * hev  |= (abs(q1 - q0) > thresh)
63       */
64      "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
65      "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
66      "or             %[r_k], %[r_k],    %[c]         \n\t"
67      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
68      "or             %[r3],  %[r3],     %[c]         \n\t"
69      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
70      "or             %[r],   %[r],      %[c]         \n\t"
71
72      /* mask |= (abs(q2 - q1) > limit) */
73      "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
74      "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
75      "or             %[r_k], %[r_k],    %[c]         \n\t"
76      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
77      "or             %[r],   %[r],      %[c]         \n\t"
      /* hev bits are complete: shift them up by 24 so that the wrdsp in the
       * second asm statement places them where pick.qb looks for its
       * condition bits.  Unrelated to the surrounding mask terms.
       */
78      "sll            %[r3],    %[r3],    24          \n\t"
79
80      /* mask |= (abs(q3 - q2) > limit) */
81      "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
82      "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
83      "or             %[r_k], %[r_k],    %[c]         \n\t"
84      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
85      "or             %[r],   %[r],      %[c]         \n\t"
86
      /* all outputs are early-clobber (&): they are written while the
       * inputs are still being read */
87      : [c] "=&r" (c), [r_k] "=&r" (r_k),
88        [r] "=&r" (r), [r3] "=&r" (r3)
89      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
90        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
91        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
92  );
93
  /* Second asm statement: add the flimit term to r, then turn the bit sets
   * r3 and r into per-byte 0x00 / 0xFF values with wrdsp + pick.qb.
   */
94  __asm__ __volatile__ (
95      /* abs(p0 - q0) */
96      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
97      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
      /* load the (already shifted) hev bits into the DSP control register;
       * they are consumed by the pick.qb below that produces hev1 */
98      "wrdsp          %[r3]                           \n\t"
99      "or             %[s1],  %[r_k],    %[c]         \n\t"
100
101      /* abs(p1 - q1) */
102      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
      /* s3 = abs(p0 - q0) * 2, saturating per byte */
103      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
      /* hev1 byte = 0xFF (from ones) where the hev bit is set, else 0x00 */
104      "pick.qb        %[hev1], %[ones],  $0           \n\t"
105      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
106      "or             %[s2],   %[r_k],   %[c]         \n\t"
107
108      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit
       *
       * NOTE(review): the code compares the sum against %[flimit] as passed
       * in; it does not compute flimit * 2 + limit here, so that combination
       * is presumably done by the caller -- confirm.
       */
109      "shrl.qb        %[s2],   %[s2],     1           \n\t"
110      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
111      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
112      "or             %[r],    %[r],      %[c]        \n\t"
      /* same shift as applied to r3 above, now for the mask bits */
113      "sll            %[r],    %[r],      24          \n\t"
114
      /* s2 byte = 0x00 where a mask bit is set (some limit exceeded),
       * else 0xFF (from ones) */
115      "wrdsp          %[r]                            \n\t"
116      "pick.qb        %[s2],  $0,         %[ones]     \n\t"
117
      /* r is read-write ("+r"): it carries the bits accumulated by the
       * first asm statement into this one */
118      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
119        [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
120      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
121        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
122  );
123
124  *hev = hev1;
125  *mask = s2;
126}
127
/* Computes the hev, mask and flat values for four pixels at once.
 *
 * Every pixel argument (p3..p0, q0..q3) is handled as four packed bytes by
 * the .qb (quad-byte) DSP instructions used below.  limit, flimit and thresh
 * are compared byte-wise against the pixel differences, so callers presumably
 * pass each threshold replicated into all four bytes -- TODO confirm against
 * the callers.
 *
 * Note the argument order: thresh is the third argument, and p1, p0 come
 * before p3, p2.
 *
 * Outputs, per byte lane (this relies on pick.qb choosing between `ones` and
 * the zero register according to the condition bits loaded by wrdsp):
 *   *hev  - 0xFF where abs(p1 - p0) > thresh or abs(q1 - q0) > thresh,
 *           0x00 elsewhere.
 *   *mask - 0x00 where any of abs(p3 - p2), abs(p2 - p1), abs(p1 - p0),
 *           abs(q1 - q0), abs(q2 - q1), abs(q3 - q2) is greater than limit,
 *           or where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 (computed with
 *           saturating byte adds) is greater than flimit; 0xFF elsewhere.
 *   *flat - 0x00 where any of abs(p1 - p0), abs(q1 - q0), abs(p0 - p2),
 *           abs(q0 - q2), abs(p3 - p0), abs(q3 - q0) is greater than 1
 *           (flat_thresh is 0x01010101); 0xFF elsewhere.
 *
 * Side effect: both asm statements write the DSP control register (wrdsp).
 * NOTE(review): neither asm statement declares a clobber for that register.
 */
128static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
129                                                       uint32_t flimit,
130                                                       uint32_t thresh,
131                                                       uint32_t p1, uint32_t p0,
132                                                       uint32_t p3, uint32_t p2,
133                                                       uint32_t q0, uint32_t q1,
134                                                       uint32_t q2, uint32_t q3,
135                                                       uint32_t *hev,
136                                                       uint32_t *mask,
137                                                       uint32_t *flat) {
  /* c      : scratch, receives each compare result
   * r      : accumulated "difference exceeds limit" bits (feeds *mask)
   * r3     : accumulated "difference exceeds thresh" bits (feeds *hev)
   * r_k    : scratch, holds the per-byte absolute difference
   * r_flat : accumulated "difference exceeds flat_thresh" bits (feeds *flat)
   */
138  uint32_t  c, r, r3, r_k, r_flat;
  /* s1..s3: temporaries for the abs(p0 - q0) * 2 + abs(p1 - q1) / 2 term */
139  uint32_t  s1, s2, s3;
  /* all-ones source operand for pick.qb (0xFF in every byte) */
140  uint32_t  ones = 0xFFFFFFFF;
  /* flatness threshold: the value 1 in each of the four bytes */
141  uint32_t  flat_thresh = 0x01010101;
142  uint32_t  hev1;
143  uint32_t  flat1;
144
  /* First asm statement: accumulate compare bits into r (limit), r3 (thresh)
   * and r_flat (flat_thresh), and materialise flat1.
   *
   * Idiom used for every difference: the two unsigned saturating subtractions
   * a - b and b - a are OR-ed together; one of them is always zero, so the
   * result is abs(a - b) in each byte.  cmpgu.lt.qb then compares the
   * threshold against it byte by byte and the result is OR-ed into the
   * matching accumulator.
   */
145  __asm__ __volatile__ (
146      /* mask |= (abs(p3 - p2) > limit) */
147      "subu_s.qb      %[c],       %[p3],          %[p2]        \n\t"
148      "subu_s.qb      %[r_k],     %[p2],          %[p3]        \n\t"
149      "or             %[r_k],     %[r_k],         %[c]         \n\t"
150      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
      /* first term: initialise r from the compare result ($0 | c) */
151      "or             %[r],       $0,             %[c]         \n\t"
152
153      /* mask |= (abs(p2 - p1) > limit) */
154      "subu_s.qb      %[c],       %[p2],          %[p1]        \n\t"
155      "subu_s.qb      %[r_k],     %[p1],          %[p2]        \n\t"
156      "or             %[r_k],     %[r_k],         %[c]         \n\t"
157      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
158      "or             %[r],       %[r],           %[c]         \n\t"
159
160      /* mask |= (abs(p1 - p0) > limit)
161       * hev  |= (abs(p1 - p0) > thresh)
162       * flat |= (abs(p1 - p0) > flat_thresh)
163       */
164      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
165      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
166      "or             %[r_k],     %[r_k],         %[c]         \n\t"
167      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
      /* first hev term: initialise r3 */
168      "or             %[r3],      $0,             %[c]         \n\t"
169      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
170      "or             %[r],       %[r],           %[c]         \n\t"
171      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
      /* first flat term: initialise r_flat */
172      "or             %[r_flat],  $0,             %[c]         \n\t"
173
174      /* mask |= (abs(q1 - q0) > limit)
175       * hev  |= (abs(q1 - q0) > thresh)
176       * flat |= (abs(q1 - q0) > flat_thresh)
177       */
178      "subu_s.qb      %[c],       %[q1],          %[q0]        \n\t"
179      "subu_s.qb      %[r_k],     %[q0],          %[q1]        \n\t"
180      "or             %[r_k],     %[r_k],         %[c]         \n\t"
181      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
182      "or             %[r3],      %[r3],          %[c]         \n\t"
183      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
184      "or             %[r],       %[r],           %[c]         \n\t"
185      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
186      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
187
188      /* flat |= (abs(p0 - p2) > flat_thresh) */
189      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
190      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
191      "or             %[r_k],     %[r_k],         %[c]         \n\t"
192      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
193      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
194
195      /* flat |= (abs(q0 - q2) > flat_thresh) */
196      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
197      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
198      "or             %[r_k],     %[r_k],         %[c]         \n\t"
199      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
200      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
201
202      /* flat |= (abs(p3 - p0) > flat_thresh) */
203      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
204      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
205      "or             %[r_k],     %[r_k],         %[c]         \n\t"
206      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
207      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
208
209      /* flat |= (abs(q3 - q0) > flat_thresh) */
210      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
211      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
212      "or             %[r_k],     %[r_k],         %[c]         \n\t"
213      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
214      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
      /* shift the flat bits up by 24 so that wrdsp places them where
       * pick.qb looks for its condition bits */
215      "sll            %[r_flat],  %[r_flat],      24           \n\t"
216      /* look at stall here */
217      "wrdsp          %[r_flat]                                \n\t"
      /* flat1 byte = 0x00 where a flat bit is set, else 0xFF (from ones) */
218      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
219
220      /* mask |= (abs(q2 - q1) > limit) */
221      "subu_s.qb      %[c],       %[q2],          %[q1]        \n\t"
222      "subu_s.qb      %[r_k],     %[q1],          %[q2]        \n\t"
223      "or             %[r_k],     %[r_k],         %[c]         \n\t"
224      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
225      "or             %[r],       %[r],           %[c]         \n\t"
      /* hev bits are complete: pre-shift them for the wrdsp in the second
       * asm statement.  Unrelated to the surrounding mask terms. */
226      "sll            %[r3],      %[r3],          24           \n\t"
227
228      /* mask |= (abs(q3 - q2) > limit) */
229      "subu_s.qb      %[c],       %[q3],          %[q2]        \n\t"
230      "subu_s.qb      %[r_k],     %[q2],          %[q3]        \n\t"
231      "or             %[r_k],     %[r_k],         %[c]         \n\t"
232      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
233      "or             %[r],       %[r],           %[c]         \n\t"
234
      /* all outputs are early-clobber (&): they are written while the
       * inputs are still being read */
235      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3),
236        [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1)
237      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
238        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
239        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
240        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
241  );
242
  /* Second asm statement: add the flimit term to r, then turn the bit sets
   * r3 and r into per-byte 0x00 / 0xFF values with wrdsp + pick.qb.
   */
243  __asm__ __volatile__ (
244      /* abs(p0 - q0) */
245      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
246      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
      /* load the (already shifted) hev bits into the DSP control register;
       * they are consumed by the pick.qb below that produces hev1 */
247      "wrdsp          %[r3]                           \n\t"
248      "or             %[s1],  %[r_k],    %[c]         \n\t"
249
250      /* abs(p1 - q1) */
251      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
      /* s3 = abs(p0 - q0) * 2, saturating per byte */
252      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
      /* hev1 byte = 0xFF (from ones) where the hev bit is set, else 0x00 */
253      "pick.qb        %[hev1], %[ones],  $0           \n\t"
254      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
255      "or             %[s2],   %[r_k],   %[c]         \n\t"
256
257      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit
       *
       * NOTE(review): the code compares the sum against %[flimit] as passed
       * in; it does not compute flimit * 2 + limit here, so that combination
       * is presumably done by the caller -- confirm.
       */
258      "shrl.qb        %[s2],   %[s2],     1           \n\t"
259      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
260      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
261      "or             %[r],    %[r],      %[c]        \n\t"
      /* same shift as applied to r3 and r_flat, now for the mask bits */
262      "sll            %[r],    %[r],      24          \n\t"
263
      /* s2 byte = 0x00 where a mask bit is set (some limit exceeded),
       * else 0xFF (from ones) */
264      "wrdsp          %[r]                            \n\t"
265      "pick.qb        %[s2],   $0,        %[ones]     \n\t"
266
      /* r is read-write ("+r"): it carries the bits accumulated by the
       * first asm statement into this one */
267      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
268        [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
269      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
270        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
271  );
272
273  *hev = hev1;
274  *mask = s2;
275  *flat = flat1;
276}
277
/* Computes a flatness value over p4..p0 / q0..q4 for four pixels at once.
 *
 * Every pixel argument is handled as four packed bytes by the .qb
 * (quad-byte) DSP instructions used below.
 *
 * Output, per byte lane (this relies on pick.qb choosing between `ones` and
 * the zero register according to the condition bits loaded by wrdsp):
 *   *flat2 - 0xFF where all of abs(p4 - p0), abs(q4 - q0), abs(p1 - p0),
 *            abs(q1 - q0), abs(p0 - p2), abs(q0 - q2), abs(p3 - p0),
 *            abs(q3 - q0) are at most 1 (flat_thresh is 0x01010101);
 *            0x00 elsewhere.
 *
 * The result is built as the AND of two partial values: flat3 (the p4 / q4
 * terms) and flat1 (the remaining six terms).
 *
 * Side effect: the asm statement writes the DSP control register (wrdsp).
 * NOTE(review): no clobber is declared for that register.
 */
278static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
279                                 uint32_t p2, uint32_t p1,
280                                 uint32_t p0, uint32_t q0,
281                                 uint32_t q1, uint32_t q2,
282                                 uint32_t q3, uint32_t q4,
283                                 uint32_t *flat2) {
  /* c      : scratch, receives each compare result
   * r      : accumulated compare bits for the p4 / q4 terms (feeds flat3)
   * r_k    : scratch, holds the per-byte absolute difference
   * r_flat : accumulated compare bits for the other six terms (feeds flat1)
   */
284  uint32_t  c, r, r_k, r_flat;
  /* all-ones source operand for pick.qb (0xFF in every byte) */
285  uint32_t  ones = 0xFFFFFFFF;
  /* flatness threshold: the value 1 in each of the four bytes */
286  uint32_t  flat_thresh = 0x01010101;
287  uint32_t  flat1, flat3;
288
  /* Idiom used for every difference: the two unsigned saturating subtractions
   * a - b and b - a are OR-ed together; one of them is always zero, so the
   * result is abs(a - b) in each byte.  cmpgu.lt.qb then compares flat_thresh
   * against it byte by byte and the result is OR-ed into r or r_flat.
   */
289  __asm__ __volatile__ (
290      /* flat |= (abs(p4 - p0) > flat_thresh) */
291      "subu_s.qb      %[c],   %[p4],           %[p0]        \n\t"
292      "subu_s.qb      %[r_k], %[p0],           %[p4]        \n\t"
293      "or             %[r_k], %[r_k],          %[c]         \n\t"
294      "cmpgu.lt.qb    %[c],   %[flat_thresh],  %[r_k]       \n\t"
      /* first term: initialise r from the compare result ($0 | c) */
295      "or             %[r],   $0,              %[c]         \n\t"
296
297      /* flat |= (abs(q4 - q0) > flat_thresh) */
298      "subu_s.qb      %[c],     %[q4],           %[q0]     \n\t"
299      "subu_s.qb      %[r_k],   %[q0],           %[q4]     \n\t"
300      "or             %[r_k],   %[r_k],          %[c]      \n\t"
301      "cmpgu.lt.qb    %[c],     %[flat_thresh],  %[r_k]    \n\t"
302      "or             %[r],     %[r],            %[c]      \n\t"
      /* shift the bits up by 24 so that wrdsp places them where pick.qb
       * looks for its condition bits */
303      "sll            %[r],     %[r],            24        \n\t"
304      "wrdsp          %[r]                                 \n\t"
      /* flat3 byte = 0x00 where a p4 / q4 bit is set, else 0xFF (from ones) */
305      "pick.qb        %[flat3], $0,           %[ones]      \n\t"
306
307      /* flat |= (abs(p1 - p0) > flat_thresh) */
308      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
309      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
310      "or             %[r_k],     %[r_k],         %[c]         \n\t"
311      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
      /* first term of the second group: initialise r_flat */
312      "or             %[r_flat],  $0,             %[c]         \n\t"
313
314      /* flat |= (abs(q1 - q0) > flat_thresh) */
315      "subu_s.qb      %[c],      %[q1],           %[q0]        \n\t"
316      "subu_s.qb      %[r_k],    %[q0],           %[q1]        \n\t"
317      "or             %[r_k],    %[r_k],          %[c]         \n\t"
318      "cmpgu.lt.qb    %[c],      %[flat_thresh],  %[r_k]       \n\t"
319      "or             %[r_flat], %[r_flat],       %[c]         \n\t"
320
321      /* flat |= (abs(p0 - p2) > flat_thresh) */
322      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
323      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
324      "or             %[r_k],     %[r_k],         %[c]         \n\t"
325      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
326      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
327
328      /* flat |= (abs(q0 - q2) > flat_thresh) */
329      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
330      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
331      "or             %[r_k],     %[r_k],         %[c]         \n\t"
332      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
333      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
334
335      /* flat |= (abs(p3 - p0) > flat_thresh) */
336      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
337      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
338      "or             %[r_k],     %[r_k],         %[c]         \n\t"
339      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
340      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
341
342      /* flat |= (abs(q3 - q0) > flat_thresh) */
343      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
344      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
345      "or             %[r_k],     %[r_k],         %[c]         \n\t"
346      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
347      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
      /* same shift / wrdsp / pick.qb sequence as for flat3 above */
348      "sll            %[r_flat],  %[r_flat],      24           \n\t"
349      "wrdsp          %[r_flat]                                \n\t"
350      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
351      /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3):
       * combine the p4 / q4 result (flat3) with the six-term result (flat1) */
352      "and            %[flat1],  %[flat3],        %[flat1]     \n\t"
353
      /* all outputs are early-clobber (&): they are written while the
       * inputs are still being read */
354      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r),
355        [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3)
356      : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2),
357        [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1),
358        [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4),
359        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
360  );
361
362  *flat2 = flat1;
363}
364#endif  // #if HAVE_DSPR2
365#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
366