/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
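/* Vertical 8-tap filter-and-average for 4-pixel-wide column groups.  A
 * rough scalar sketch of what the assembly computes for each output
 * pixel (for reference only; clip_pixel/ROUND_POWER_OF_TWO as in
 * vp9_common.h):
 *
 *   sum = 64;                                  // rounding term (vector4a)
 *   for (k = 0; k < 8; ++k)
 *     sum += src[(k - 3) * src_stride] * filter_y[k];
 *   filtered = clip_pixel(sum >> 7);           // extp + crop-table lookup
 *   dst[x] = ROUND_POWER_OF_TWO(dst[x] + filtered, 1);   // addqh_r.w
 *
 * Four adjacent pixels are produced per inner iteration using the four
 * DSP accumulators $ac0..$ac3.
 */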
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
                                      int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vp9_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;

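  /* Each 32-bit read picks up two adjacent 16-bit taps, so vector1b..
   * vector4b hold tap pairs {0,1}, {2,3}, {4,5} and {6,7}; dpa.w.ph
   * consumes one pair per instruction. */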
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

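  /* Start three rows above the output row so the 8-tap window is centered. */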
  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

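          /* Seed each accumulator with the rounding constant (64) so the
             final extp rounds to nearest. */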
          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

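          /* Expand bytes to halfwords and pair the same pixel from two
             consecutive rows: precrq.ph.w gathers the upper pixel of each
             byte pair, append the lower one. */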
          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

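          /* Accumulate the last four taps, then extract: with the DSP
             control position set to 38 (see vp9_convolve8_avg_vert_dspr2),
             extp with size 31 yields (sum + 64) >> 7. */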
          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

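          /* lbux clamps the filtered value through the crop table;
             addqh_r.w then averages it with the byte already in dst,
             i.e. (dst + filtered + 1) >> 1. */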
          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

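/* Same filter-and-average as convolve_avg_vert_4_dspr2, specialized for
 * w == 64 so each row's prefetch can cover both halves of the
 * destination line. */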
static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vp9_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

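/* Dispatch on the vertical filter: a second packed word of 0x800000
 * (taps 2,3 = {0, 128}) identifies the copy filter, handled by plain
 * averaging; a first word of zero (taps 0,1) selects the 2-tap bilinear
 * path; everything else takes the 8-tap path. */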
void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vp9_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
    if (16 == y_step_q4) {
      uint32_t pos = 38;

      /* bit position for extract from acc */
      __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
      );

      vp9_prefetch_store(dst);

      switch (w) {
        case 4:
        case 8:
        case 16:
        case 32:
          convolve_avg_vert_4_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_y, w, h);
          break;
        case 64:
          vp9_prefetch_store(dst + 32);
          convolve_avg_vert_64_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_y, h);
          break;
        default:
          vp9_convolve8_avg_vert_c(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, x_step_q4,
                                   filter_y, y_step_q4,
                                   w, h);
          break;
      }
    } else {
      vp9_convolve8_avg_vert_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
    }
  }
}

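/* Two-pass 8-tap convolve-and-average: filter horizontally into a
 * 64-column intermediate buffer, then filter that vertically while
 * averaging into dst. */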
void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);

  if (intermediate_height < h)
    intermediate_height = h;

  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vp9_convolve8_avg_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);

  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);

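  /* Skip the three rows of top border in the intermediate buffer so the
   * vertical pass reads a centered 8-tap window. */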
  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
}

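/* Width-specialized averaging of src into dst: adduh_r.qb computes a
 * per-byte (a + b + 1) >> 1, four pixels per word, so each case issues
 * w / 4 word-wide operations per row. */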
void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      vp9_prefetch_load(src + 64);
      vp9_prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_load(src + src_stride + 64);
        vp9_prefetch_store(dst + dst_stride);
        vp9_prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         32(%[dst])     \n\t"
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         36(%[src])     \n\t"
            "ulw              %[tp4],         36(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         40(%[src])     \n\t"
            "ulw              %[tp2],         40(%[dst])     \n\t"
            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         44(%[src])     \n\t"
            "ulw              %[tp4],         44(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         48(%[src])     \n\t"
            "ulw              %[tp2],         48(%[dst])     \n\t"
            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         52(%[src])     \n\t"
            "ulw              %[tp4],         52(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         56(%[src])     \n\t"
            "ulw              %[tp2],         56(%[dst])     \n\t"
            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         60(%[src])     \n\t"
            "ulw              %[tp4],         60(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
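    /* Scalar fallback for widths without a word-wide unrolled case. */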
    default:
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif