/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/mips/common_dspr2.h"
#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
                                const uint8_t *blimit, const uint8_t *limit,
                                const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask;
  uint32_t hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte vectors: replicate each scalar filter parameter into
     all four byte lanes */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s);

  /* the loop filter is designed to work on bytes so that we can make maximum
     use of the 8-bit SIMD instructions. */
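  /* the 8-pixel-wide edge is filtered as two 4-pixel (one word) groups */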
  for (i = 0; i < 2; i++) {
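    /* sm1..s2 address the four rows above the edge (p3..p0); s3..s6 address
       the four rows below it (q0..q3) */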
    sm1 = s - (pitch << 2);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;

    __asm__ __volatile__(
        "lw     %[p1],  (%[s1])    \n\t"
        "lw     %[p2],  (%[s2])    \n\t"
        "lw     %[p3],  (%[s3])    \n\t"
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero and no
       filtering is needed */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      __asm__ __volatile__(
          "lw       %[pm1], (%[sm1])   \n\t"
          "lw       %[p0],  (%[s0])    \n\t"
          "lw       %[p5],  (%[s5])    \n\t"
          "lw       %[p6],  (%[s6])    \n\t"

          : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
          : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));

      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        __asm__ __volatile__(
            "sw     %[p1],  (%[s1])    \n\t"
            "sw     %[p2],  (%[s2])    \n\t"
            "sw     %[p3],  (%[s3])    \n\t"
            "sw     %[p4],  (%[s4])    \n\t"

            :
            : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
              [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
      }
    }

    s = s + 4;
  }
}

void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
                              const uint8_t *blimit, const uint8_t *limit,
                              const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *s1, *s2, *s3, *s4;
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte vectors: replicate each scalar filter parameter into
     all four byte lanes */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s + pitch);

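  /* the 8-pixel-tall vertical edge is processed four rows at a time, in two
     passes */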
  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2 = *((uint32_t *)(s1 - 4));
    p6 = *((uint32_t *)(s1));
    p1 = *((uint32_t *)(s2 - 4));
    p5 = *((uint32_t *)(s2));
    p0 = *((uint32_t *)(s3 - 4));
    p4 = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3 = *((uint32_t *)(s4));
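    /* each word just loaded holds four horizontally adjacent pixels from one
       row; the two transposes below regroup them so that each of pm1..p6
       holds one column of the 4x4 neighborhood (four vertically adjacent
       pixels), matching the layout used by the horizontal filter path */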

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* if (p1 - p4) == 0 and (p2 - p3) == 0, the mask will be zero and no
     * filtering is needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* unpack the processed 4x4 neighborhood with byte stores; the output
         * is not transposed back and written with word stores because the
         * destination memory is not word aligned
         */
        __asm__ __volatile__(
            "sb     %[p4],   1(%[s4])    \n\t"
            "sb     %[p3],   0(%[s4])    \n\t"
            "sb     %[p2],  -1(%[s4])    \n\t"
            "sb     %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s4] "r"(s4));

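        /* sb stores only the low byte of each word; shifting right by 8
           between the store groups exposes the byte for the next row (s3,
           then s2, then s1) */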
        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s3])    \n\t"
            "sb     %[p3],   0(%[s3])    \n\t"
            "sb     %[p2],  -1(%[s3])    \n\t"
            "sb     %[p1],  -2(%[s3])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s3] "r"(s3));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s2])    \n\t"
            "sb     %[p3],   0(%[s2])    \n\t"
            "sb     %[p2],  -1(%[s2])    \n\t"
            "sb     %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s2] "r"(s2));

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s1])    \n\t"
            "sb     %[p3],   0(%[s1])    \n\t"
            "sb     %[p2],  -1(%[s1])    \n\t"
            "sb     %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s1] "r"(s1));
      }
    }
  }
}

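/* Each dual variant filters two adjacent 8-pixel edge segments by calling the
 * corresponding single-edge filter twice: the second call starts 8 pixels to
 * the right for horizontal edges and 8 rows further down for vertical edges.
 */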
void vpx_lpf_horizontal_4_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_horizontal_8_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
}

void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // #if HAVE_DSPR2