/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
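/* Vertical 8-tap convolution for block widths 4, 8, 16 and 32.  Each
 * inner-loop iteration produces four output pixels of one row, using one
 * DSP accumulator ($ac0..$ac3) per output pixel. */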
static void convolve_vert_4_dspr2(const uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int16_t *filter_y,
                                  int32_t w,
                                  int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

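  /* filter_y holds eight int16 taps; each 32-bit read below packs two
   * adjacent taps so a single dpa.w.ph can multiply a pair of vertically
   * adjacent pixels by a tap pair.  vector4a (64 == 1 << (FILTER_BITS - 1))
   * preloads the accumulators with the rounding term for the final
   * right-shift by FILTER_BITS. */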
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

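  /* Back up three rows so the 8-tap window is centered on the output row. */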
  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

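      /* The kernel below loads four horizontal pixels from each of eight
       * rows, unpacks bytes to halfwords (preceu.ph), interleaves
       * vertically adjacent rows (precrq.ph.w/append), multiply-accumulates
       * against the packed tap pairs (dpa.w.ph), then rounds and shifts
       * with extp and clamps through the crop table (lbux) before storing
       * four output bytes. */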
      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

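          /* rows 4..7 of the vertical 8-tap window */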
104          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
105          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
106          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
107          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
108          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
109          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
110          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
111          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
112
113          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
114          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
115          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
116          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
117          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
118          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
119          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
120          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
121
122          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
123          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
124          "extp             %[Temp1],     $ac0,           31              \n\t"
125          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
126          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
127          "extp             %[Temp2],     $ac1,           31              \n\t"
128
129          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
130          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
131          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
132          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
133          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
134          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
135          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
136          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
137
138          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
139          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
140          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
141          "extp             %[Temp1],     $ac2,           31              \n\t"
142
143          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
144          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
145          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
146          "extp             %[Temp2],     $ac3,           31              \n\t"
147
148          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
149          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
150
151          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
152          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
153
154          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
155          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
156
157          : [load1] "=&r" (load1), [load2] "=&r" (load2),
158            [load3] "=&r" (load3), [load4] "=&r" (load4),
159            [p1] "=&r" (p1), [p2] "=&r" (p2),
160            [n1] "=&r" (n1), [n2] "=&r" (n2),
161            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
162            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
163            [store1] "=&r" (store1), [store2] "=&r" (store2),
164            [src_ptr] "+r" (src_ptr)
165          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
166            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
167            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
168            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
169      );
170    }
171
172    /* Next row... */
173    src += src_stride;
174    dst += dst_stride;
175  }
176}
177
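/* Same kernel as convolve_vert_4_dspr2, specialized for w == 64, with an
 * extra prefetch for the second half of the destination row. */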
static void convolve_vert_64_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_y,
                                   int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac3,           31              \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

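/* Dispatch for the vertical 8-tap convolution: route the identity filter
 * to a copy, short filters to the 2-tap kernel, standard unscaled block
 * widths to the dspr2 kernels, and everything else to the C fallback. */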
void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
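  /* On the little-endian layout this code assumes, taps 2..3 reading as
   * 0x800000 (tap 3 == 128, tap 2 == 0) identify the identity filter,
   * which reduces to a plain copy, and a zero first tap pair identifies a
   * short filter that the cheaper 2-tap kernel can handle. */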
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vp9_convolve_copy(src, src_stride,
                      dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_vert_dspr2(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
  } else {
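    /* The dspr2 kernels assume a unit vertical step; scaled convolutions
     * (y_step_q4 != 16) fall back to the generic C implementation. */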
    if (16 == y_step_q4) {
      uint32_t pos = 38;

      /* Set the DSP control "pos" field so that the extp instructions in
         the kernels extract the rounded, FILTER_BITS-shifted sum from the
         64-bit accumulators. */
      __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
      );

      vp9_prefetch_store(dst);

      switch (w) {
        case 4 :
        case 8 :
        case 16 :
        case 32 :
          convolve_vert_4_dspr2(src, src_stride,
                                dst, dst_stride,
                                filter_y, w, h);
          break;
        case 64 :
          vp9_prefetch_store(dst + 32);
          convolve_vert_64_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_y, h);
          break;
        default:
          vp9_convolve8_vert_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
          break;
      }
    } else {
      vp9_convolve8_vert_c(src, src_stride,
                           dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
    }
  }
}

#endif