/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
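/* Vertical 8-tap convolution with averaging for widths that are multiples
 * of 4 (4 to 32).  Each inner-loop iteration filters one 4-pixel-wide
 * column group: accumulators $ac0..$ac3 collect the 8-tap sum for columns
 * 0..3, and each rounded, clamped result is averaged with the pixel
 * already in dst.
 */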
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
                                      int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vpx_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;

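  /* Read the eight 16-bit taps as four 32-bit words; each word packs two
     adjacent taps, so a single dpa.w.ph applies a tap pair per operand. */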
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

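  /* The 8-tap filter reads rows y - 3 .. y + 4 for output row y. */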
  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

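          /* Seed each accumulator with 64, the rounding term for the
             final >> 7 (FILTER_BITS) shift. */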
65          "mtlo             %[vector4a],  $ac0                            \n\t"
66          "mtlo             %[vector4a],  $ac1                            \n\t"
67          "mtlo             %[vector4a],  $ac2                            \n\t"
68          "mtlo             %[vector4a],  $ac3                            \n\t"
69          "mthi             $zero,        $ac0                            \n\t"
70          "mthi             $zero,        $ac1                            \n\t"
71          "mthi             $zero,        $ac2                            \n\t"
72          "mthi             $zero,        $ac3                            \n\t"
73
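          /* Repack the two low bytes (columns 0 and 1) of four rows into
             per-column halfword pairs: p1/p2 feed column 0 ($ac0), n1/n2
             column 1 ($ac1); the qbl block below does columns 2 and 3. */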
74          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
75          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
76          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
77          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
78          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
79          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
80          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
81          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
82
83          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
84          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
85          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
86          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
87
88          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
89          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
90          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
91          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
92          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
93          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
94          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
95          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
96
97          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
98          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
99          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
100          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
101
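          /* Step down to rows 4..7 of the 8-tap window and apply the
             remaining four taps. */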
102          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
103          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
104          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
105          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
106          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
107          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
108          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
109          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
110
111          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
112          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
113          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
114          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
115          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
116          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
117          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
118          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
119
120          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
121          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
122          "extp             %[Temp1],     $ac0,           31              \n\t"
123          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
124          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
125          "extp             %[Temp2],     $ac1,           31              \n\t"
126
127          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
128          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
129          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
130          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
131          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
132          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
133          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
134          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
135          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
136          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
137
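          /* Clamp each filtered value to 0..255 through the crop table,
             then addqh_r.w averages it with the dst pixel:
             (filtered + dst + 1) >> 1. */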
138          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
139          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
140          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
141          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
142          "extp             %[Temp1],     $ac2,           31              \n\t"
143
144          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
145          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
146          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
147          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
148          "extp             %[Temp2],     $ac3,           31              \n\t"
149          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
150
151          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
152          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
153          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
154
155          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
156          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
157          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
158          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
159
160          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
161          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
162
163          : [load1] "=&r" (load1), [load2] "=&r" (load2),
164            [load3] "=&r" (load3), [load4] "=&r" (load4),
165            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
166            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
167            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
168            [store1] "=&r" (store1), [store2] "=&r" (store2),
169            [src_ptr] "+r" (src_ptr)
170          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
171            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
172            [vector4a] "r" (vector4a),
173            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
174      );
175    }
176
177    /* Next row... */
178    src += src_stride;
179    dst += dst_stride;
180  }
181}
182
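/* Same as convolve_avg_vert_4_dspr2 but with the width fixed at 64, which
 * allows an extra cache-line prefetch for the second half of each row.
 */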
static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vpx_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);
    prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

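/* Dispatch for the vertical convolve-and-average: the DSPr2 paths handle
 * the common block widths at y_step_q4 == 16; everything else falls back
 * to the C implementation.
 */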
void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

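  /* A zero first tap pair indicates a bilinear-class filter; route it to
     the cheaper 2-tap path. */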
  if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
    uint32_t pos = 38;

    /* bit position for extract from acc: extp pulls bits 38..7,
       i.e. acc >> 7 */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );

    prefetch_store(dst);

    switch (w) {
      case 4:
      case 8:
      case 16:
      case 32:
        convolve_avg_vert_4_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, w, h);
        break;
      case 64:
        prefetch_store(dst + 32);
        convolve_avg_vert_64_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_y, h);
        break;
      default:
        vpx_convolve8_avg_vert_c(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
        break;
    }
  }
}

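/* Two-pass 8-tap convolve with averaging: filter horizontally into a
 * 64-column temp buffer, then filter that vertically into dst.  The
 * intermediate pass produces h + 7 rows (3 above and 4 below each output
 * row) to cover the vertical 8-tap window.
 */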
void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);

  if (intermediate_height < h)
    intermediate_height = h;

  vpx_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);

  vpx_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
}

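/* Pure convolve-average (no filtering): each output byte is the rounded
 * mean of src and dst.  adduh_r.qb averages four byte lanes per
 * instruction; the switch unrolls by width.
 */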
void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

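        /* adduh_r.qb computes (src + dst + 1) >> 1 independently in each
           of the four byte lanes. */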
        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         32(%[dst])     \n\t"
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         36(%[src])     \n\t"
            "ulw              %[tp4],         36(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         40(%[src])     \n\t"
            "ulw              %[tp2],         40(%[dst])     \n\t"
            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         44(%[src])     \n\t"
            "ulw              %[tp4],         44(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         48(%[src])     \n\t"
            "ulw              %[tp2],         48(%[dst])     \n\t"
            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         52(%[src])     \n\t"
            "ulw              %[tp4],         52(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         56(%[src])     \n\t"
            "ulw              %[tp2],         56(%[dst])     \n\t"
            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         60(%[src])     \n\t"
            "ulw              %[tp4],         60(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    default:
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif