/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/mips/common_dspr2.h"
#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
                                const uint8_t *thresh,
                                int count) {
  uint8_t   i;
  uint32_t  mask;
  uint32_t  hev;
  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t   *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t  thresh_vec, flimit_vec, limit_vec;
  uint32_t  uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__ (
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );
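
  /* replv.qb broadcasts the least-significant byte of the source register
     into all four byte lanes of the destination, so each threshold can be
     compared against four pixels at once; in scalar C this is roughly
     thresh_vec = uthresh * 0x01010101 (an illustrative equivalent, not the
     generated code). */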

  /* prefetch data for store */
  prefetch_store(s);

  /* The loop filter works on bytes so that maximum use can be made of the
     8-bit SIMD instructions. The loop below runs twice, four pixels per
     iteration, to cover the 8-pixel edge (the count argument is unused
     here). */
  for (i = 0; i < 2; i++) {
    sm1 = s - (pitch << 2);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;

    __asm__ __volatile__ (
        "lw     %[p1],  (%[s1])    \n\t"
        "lw     %[p2],  (%[s2])    \n\t"
        "lw     %[p3],  (%[s3])    \n\t"
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );
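
    /* Each lw above fetches four horizontally adjacent pixels from one row,
       so p1..p4 hold the two rows above and the two rows below the edge for
       four columns at a time. */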

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero and
       filtering is not needed */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      __asm__ __volatile__ (
          "lw       %[pm1], (%[sm1])   \n\t"
          "lw       %[p0],  (%[s0])    \n\t"
          "lw       %[p5],  (%[s5])    \n\t"
          "lw       %[p6],  (%[s6])    \n\t"

          : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
            [p6] "=&r" (p6)
          : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
      );

      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
                            pm1, p0, p3, p4, p5, p6,
                            thresh_vec, &hev, &mask);
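
      /* Roughly speaking: mask holds 0xff in byte lanes that pass the
         limit/blimit activity tests (pixels that should be filtered), and
         hev flags lanes whose edge variance exceeds thresh, which selects
         the stronger filtering path inside filter_dspr2. */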

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        __asm__ __volatile__ (
            "sw     %[p1],  (%[s1])    \n\t"
            "sw     %[p2],  (%[s2])    \n\t"
            "sw     %[p3],  (%[s3])    \n\t"
            "sw     %[p4],  (%[s4])    \n\t"

            :
            : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
              [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
        );
      }
    }

    s = s + 4;
  }
}

void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                              int pitch,
                              const uint8_t *blimit,
                              const uint8_t *limit,
                              const uint8_t *thresh,
                              int count) {
  uint8_t   i;
  uint32_t  mask, hev;
  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t   *s1, *s2, *s3, *s4;
  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t  thresh_vec, flimit_vec, limit_vec;
  uint32_t  uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__ (
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s  = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4-byte aligned
     */
    p2  = *((uint32_t *)(s1 - 4));
    p6  = *((uint32_t *)(s1));
    p1  = *((uint32_t *)(s2 - 4));
    p5  = *((uint32_t *)(s2));
    p0  = *((uint32_t *)(s3 - 4));
    p4  = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3  = *((uint32_t *)(s4));
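
    /* Each row contributes two aligned words: the four pixels to the left
       of the vertical edge (sN - 4) and the four pixels to the right of it
       (sN), giving an 8x4 neighborhood in eight registers. */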

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );
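
    /* Net effect of the precrq/precr/append sequence above: the 4x4 byte
       block loaded row-wise into p2/p1/p0/pm1 is transposed so that each
       register now holds one column of four pixels; the matching block
       below does the same for p3..p6. That column-per-register layout is
       what the horizontal-style filter kernel expects. */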

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero and
     * filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
                            p0, p3, p4, p5, p6, thresh_vec,
                            &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* unpack the processed 4x4 neighborhood one byte at a time;
         * a word-wide transposed store can't be used on the output
         * because the destination memory isn't aligned
         */
        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s4])    \n\t"
            "sb     %[p3],   0(%[s4])    \n\t"
            "sb     %[p2],  -1(%[s4])    \n\t"
            "sb     %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s4] "r" (s4)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );
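
        /* After each row of byte stores, shifting every word right by 8
           drops the row just written and exposes the next row's pixel in
           the low byte lane, so the same sb offsets can be reused for
           s3, s2 and s1 below. */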

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s3])    \n\t"
            "sb     %[p3],   0(%[s3])    \n\t"
            "sb     %[p2],  -1(%[s3])    \n\t"
            "sb     %[p1],  -2(%[s3])    \n\t"

            : [p1] "+r" (p1)
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s2])    \n\t"
            "sb     %[p3],   0(%[s2])    \n\t"
            "sb     %[p2],  -1(%[s2])    \n\t"
            "sb     %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s2] "r" (s2)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s1])    \n\t"
            "sb     %[p3],   0(%[s1])    \n\t"
            "sb     %[p2],  -1(%[s1])    \n\t"
            "sb     %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s1] "r" (s1)
        );
      }
    }
  }
}

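/* The _dual variants below filter two adjacent 8-pixel edges, each with
   its own set of thresholds, by invoking the single-edge kernels twice;
   builds with DSPr2 enabled pick these up through the dispatch tables in
   vpx_dsp_rtcd.h. */
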
void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // #if HAVE_DSPR2