/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
#if HAVE_DSPR2
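/* Vertical 2-tap (bilinear) convolution that averages the filtered
 * result into the destination, optimized with MIPS DSPr2 ASE
 * instructions.  Each inner-loop iteration produces four output
 * pixels using the four DSP accumulators $ac0-$ac3. */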
static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
                                         int32_t src_stride,
                                         uint8_t *dst,
                                         int32_t dst_stride,
                                         const int16_t *filter_y,
                                         int32_t w,
                                         int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vp9_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2;
  uint32_t      p1, p2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       Temp1, Temp2;
  const int16_t *filter = &filter_y[3];
  uint32_t      filter45;

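  /* For a bilinear kernel only taps 3 and 4 of the 8-tap filter are
   * non-zero; the two adjacent 16-bit taps are read as one 32-bit
   * word so a single dpa.w.ph can apply both of them at once. */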
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
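          /* Load the same 4-byte column from two vertically adjacent
             rows; ulw tolerates unaligned addresses. */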
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"

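          /* Seed each accumulator with the rounding constant
             64 = 1 << (FILTER_BITS - 1). */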
          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

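          /* Expand the byte pairs of both rows to halfwords (qbr for
             pixels 1-2, qbl for pixels 3-4) and interleave them so
             each register pairs the two vertical samples of one
             pixel, then multiply-accumulate against the packed
             two-tap filter with dpa.w.ph. */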
          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 4 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 3 */

          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"

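          /* Extract the 32-bit result ending at the DSPControl
             position set by wrdsp (a right shift by FILTER_BITS == 7
             including the rounding constant), clamp through the crop
             table with lbux, then average with the existing dst pixel
             via addqh_r.w: (a + b + 1) >> 1. */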
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm),
            [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
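/* For reference, a scalar sketch of what the assembly above computes
 * per output pixel (FILTER_BITS == 7; clip_pixel() from
 * vp9_common.h clamps to [0, 255]):
 *
 *   int sum = 64 + src[x] * filter[0] + src[x + src_stride] * filter[1];
 *   uint8_t val = clip_pixel(sum >> 7);
 *   dst[x] = (dst[x] + val + 1) >> 1;
 *
 * where filter[0] and filter[1] are the two taps packed into
 * filter45. */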
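/* Same vertical 2-tap average loop as above, specialized for w == 64
 * so the width is a compile-time constant and a second prefetch,
 * 32 bytes ahead, can be issued per row. */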
static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_y,
                                          int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vp9_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2;
  uint32_t      p1, p2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       Temp1, Temp2;
  const int16_t *filter = &filter_y[3];
  uint32_t      filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 4 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 3 */

          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"

          "extp             %[Temp1],     $ac0,           31              \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm),
            [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  if (16 == y_step_q4) {
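    /* y_step_q4 == 16 means one source row per output row (no
       scaling); only that case is handled by the DSPr2 kernels. */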
    uint32_t pos = 38;

    /* bit position for extract from acc */
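    /* With pos == 38, each 32-bit extp extracts accumulator bits
       38..7, i.e. the filtered sum shifted right by FILTER_BITS. */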
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );

    vp9_prefetch_store(dst);

    switch (w) {
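      /* Widths 4 through 32 share the generic kernel, which steps
         across the row four pixels at a time. */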
      case 4:
      case 8:
      case 16:
      case 32:
        convolve_bi_avg_vert_4_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_y, w, h);
        break;
      case 64:
        vp9_prefetch_store(dst + 32);
        convolve_bi_avg_vert_64_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_y, h);
        break;
      default:
        vp9_convolve8_avg_vert_c(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
        break;
    }
  } else {
    vp9_convolve8_avg_vert_c(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
  }
}
#endif
