1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/mips/convolve_common_dspr2.h"
16#include "vpx_dsp/vpx_dsp_common.h"
17#include "vpx_dsp/vpx_filter.h"
18#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
21static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
22                                              int32_t src_stride, uint8_t *dst,
23                                              int32_t dst_stride,
24                                              const int16_t *filter_x0,
25                                              int32_t h) {
26  int32_t y;
27  uint8_t *cm = vpx_ff_cropTbl;
28  uint8_t *dst_ptr;
29  int32_t vector1b, vector2b, vector3b, vector4b;
30  int32_t Temp1, Temp2, Temp3, Temp4;
31  uint32_t vector4a = 64;
32  uint32_t tp1, tp2;
33  uint32_t p1, p2, p3, p4;
34  uint32_t tn1, tn2;
35
36  vector1b = ((const int32_t *)filter_x0)[0];
37  vector2b = ((const int32_t *)filter_x0)[1];
38  vector3b = ((const int32_t *)filter_x0)[2];
39  vector4b = ((const int32_t *)filter_x0)[3];
40
41  for (y = h; y--;) {
42    dst_ptr = dst;
43    /* prefetch data to cache memory */
44    prefetch_load(src + src_stride);
45    prefetch_load(src + src_stride + 32);
46
47    __asm__ __volatile__(
48        "ulw              %[tp1],         0(%[src])                      \n\t"
49        "ulw              %[tp2],         4(%[src])                      \n\t"
50
51        /* even 1. pixel */
52        "mtlo             %[vector4a],    $ac3                           \n\t"
53        "mthi             $zero,          $ac3                           \n\t"
54        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
55        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
56        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
57        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
58        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
59        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
60        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
61        "ulw              %[tn2],         8(%[src])                      \n\t"
62        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
63        "extp             %[Temp1],       $ac3,           31             \n\t"
64
65        /* even 2. pixel */
66        "mtlo             %[vector4a],    $ac2                           \n\t"
67        "mthi             $zero,          $ac2                           \n\t"
68        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
69        "balign           %[tn1],         %[tn2],         3              \n\t"
70        "balign           %[tn2],         %[tp2],         3              \n\t"
71        "balign           %[tp2],         %[tp1],         3              \n\t"
72        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
73        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
74        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
75        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
76        "extp             %[Temp3],       $ac2,           31             \n\t"
77
78        /* odd 1. pixel */
79        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
80        "mtlo             %[vector4a],    $ac3                           \n\t"
81        "mthi             $zero,          $ac3                           \n\t"
82        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
83        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
84        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
85        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
86        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
87        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
88        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
89        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
90        "extp             %[Temp2],       $ac3,           31             \n\t"
91
92        /* odd 2. pixel */
93        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
94        "mtlo             %[vector4a],    $ac2                           \n\t"
95        "mthi             $zero,          $ac2                           \n\t"
96        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
97        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
98        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
99        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
100        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
101        "extp             %[Temp4],       $ac2,           31             \n\t"
102
103        /* clamp */
104        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
105        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
106
107        /* store bytes */
108        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
109        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
110
111        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
112        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
113
114        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
115        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
116
117        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
118        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
119
120        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
121          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
122          [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
123          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
124        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
125          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
126          [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
127          [dst_stride] "r"(dst_stride));
128
129    /* Next row... */
130    src += src_stride;
131    dst += 1;
132  }
133}
134
135static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
136                                              int32_t src_stride, uint8_t *dst,
137                                              int32_t dst_stride,
138                                              const int16_t *filter_x0,
139                                              int32_t h) {
140  int32_t y;
141  uint8_t *cm = vpx_ff_cropTbl;
142  uint8_t *dst_ptr;
143  uint32_t vector4a = 64;
144  int32_t vector1b, vector2b, vector3b, vector4b;
145  int32_t Temp1, Temp2, Temp3;
146  uint32_t tp1, tp2, tp3;
147  uint32_t p1, p2, p3, p4, n1;
148  uint8_t *odd_dst;
149  uint32_t dst_pitch_2 = (dst_stride << 1);
150
151  vector1b = ((const int32_t *)filter_x0)[0];
152  vector2b = ((const int32_t *)filter_x0)[1];
153  vector3b = ((const int32_t *)filter_x0)[2];
154  vector4b = ((const int32_t *)filter_x0)[3];
155
156  for (y = h; y--;) {
157    /* prefetch data to cache memory */
158    prefetch_load(src + src_stride);
159    prefetch_load(src + src_stride + 32);
160
161    dst_ptr = dst;
162    odd_dst = (dst_ptr + dst_stride);
163
164    __asm__ __volatile__(
165        "ulw              %[tp2],         0(%[src])                       \n\t"
166        "ulw              %[tp1],         4(%[src])                       \n\t"
167
168        /* even 1. pixel */
169        "mtlo             %[vector4a],    $ac3                            \n\t"
170        "mthi             $zero,          $ac3                            \n\t"
171        "mtlo             %[vector4a],    $ac2                            \n\t"
172        "mthi             $zero,          $ac2                            \n\t"
173        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
174        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
175        "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
176        "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
177        "ulw              %[tp3],         8(%[src])                       \n\t"
178        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
179        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
180        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
181        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
182        "extp             %[Temp1],       $ac3,           31              \n\t"
183
184        /* even 2. pixel */
185        "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
186        "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
187        "ulw              %[tp2],         12(%[src])                      \n\t"
188        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
189        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
190        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
191        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
192        "extp             %[Temp3],       $ac2,           31              \n\t"
193
194        /* even 3. pixel */
195        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
196        "mtlo             %[vector4a],    $ac1                            \n\t"
197        "mthi             $zero,          $ac1                            \n\t"
198        "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
199        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
200        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
201        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
202        "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
203        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
204        "extp             %[p3],          $ac1,           31              \n\t"
205
206        /* even 4. pixel */
207        "mtlo             %[vector4a],    $ac2                            \n\t"
208        "mthi             $zero,          $ac2                            \n\t"
209        "mtlo             %[vector4a],    $ac3                            \n\t"
210        "mthi             $zero,          $ac3                            \n\t"
211        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
212        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
213        "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
214        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
215
216        "ulw              %[tp1],         1(%[src])                       \n\t"
217        "ulw              %[tp3],         5(%[src])                       \n\t"
218
219        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
220        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
221        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
222        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
223        "extp             %[Temp3],       $ac2,           31              \n\t"
224
225        "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
226
227        /* odd 1. pixel */
228        "mtlo             %[vector4a],    $ac1                            \n\t"
229        "mthi             $zero,          $ac1                            \n\t"
230        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
231        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
232        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
233        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
234        "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
235        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
236        "ulw              %[tp2],         9(%[src])                       \n\t"
237
238        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
239        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
240        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
241        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
242        "extp             %[Temp2],       $ac3,           31              \n\t"
243
244        /* odd 2. pixel */
245        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
246        "mtlo             %[vector4a],    $ac3                            \n\t"
247        "mthi             $zero,          $ac3                            \n\t"
248        "mtlo             %[vector4a],    $ac2                            \n\t"
249        "mthi             $zero,          $ac2                            \n\t"
250        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
251        "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
252        "ulw              %[Temp1],       13(%[src])                      \n\t"
253        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
254        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
255        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
256        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
257        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
258        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
259        "extp             %[Temp3],       $ac1,           31              \n\t"
260
261        /* odd 3. pixel */
262        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
263        "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
264        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
265        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
266        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
267        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
268        "extp             %[Temp2],       $ac3,           31              \n\t"
269
270        /* odd 4. pixel */
271        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
272        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
273        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
274        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
275        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
276        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
277        "extp             %[Temp1],       $ac2,           31              \n\t"
278
279        /* clamp */
280        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
281        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
282        "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
283
284        /* store bytes */
285        "sb               %[p4],          0(%[odd_dst])                   \n\t"
286        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
287
288        "sb               %[p2],          0(%[odd_dst])                   \n\t"
289        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
290
291        "sb               %[n1],          0(%[odd_dst])                   \n\t"
292
293        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
294          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
295          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
296          [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
297        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
298          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
299          [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
300          [dst_pitch_2] "r"(dst_pitch_2));
301
302    /* Next row... */
303    src += src_stride;
304    dst += 1;
305  }
306}
307
308static void convolve_horiz_16_transposed_dspr2(
309    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
310    int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
311  int32_t c, y;
312  const uint8_t *src;
313  uint8_t *dst;
314  uint8_t *cm = vpx_ff_cropTbl;
315  uint32_t vector_64 = 64;
316  int32_t filter12, filter34, filter56, filter78;
317  int32_t Temp1, Temp2, Temp3;
318  uint32_t qload1, qload2;
319  uint32_t p1, p2, p3, p4, p5;
320  uint32_t st1, st2, st3;
321  uint32_t dst_pitch_2 = (dst_stride << 1);
322  uint8_t *odd_dst;
323
324  filter12 = ((const int32_t *)filter_x0)[0];
325  filter34 = ((const int32_t *)filter_x0)[1];
326  filter56 = ((const int32_t *)filter_x0)[2];
327  filter78 = ((const int32_t *)filter_x0)[3];
328
329  for (y = h; y--;) {
330    /* prefetch data to cache memory */
331    prefetch_load(src_ptr + src_stride);
332    prefetch_load(src_ptr + src_stride + 32);
333
334    src = src_ptr;
335    dst = dst_ptr;
336
337    odd_dst = (dst + dst_stride);
338
339    for (c = 0; c < count; c++) {
340      __asm__ __volatile__(
341          "ulw              %[qload1],        0(%[src])                       "
342          "\n\t"
343          "ulw              %[qload2],        4(%[src])                       "
344          "\n\t"
345
346          /* even 1. pixel */
347          "mtlo             %[vector_64],     $ac1                            "
348          "\n\t" /* even 1 */
349          "mthi             $zero,            $ac1                            "
350          "\n\t"
351          "mtlo             %[vector_64],     $ac2                            "
352          "\n\t" /* even 2 */
353          "mthi             $zero,            $ac2                            "
354          "\n\t"
355          "preceu.ph.qbr    %[p3],            %[qload2]                       "
356          "\n\t"
357          "preceu.ph.qbl    %[p4],            %[qload2]                       "
358          "\n\t"
359          "preceu.ph.qbr    %[p1],            %[qload1]                       "
360          "\n\t"
361          "preceu.ph.qbl    %[p2],            %[qload1]                       "
362          "\n\t"
363          "ulw              %[qload2],        8(%[src])                       "
364          "\n\t"
365          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
366          "\n\t" /* even 1 */
367          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
368          "\n\t" /* even 1 */
369          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
370          "\n\t" /* even 1 */
371          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
372          "\n\t" /* even 1 */
373          "extp             %[Temp1],         $ac1,           31              "
374          "\n\t" /* even 1 */
375
376          /* even 2. pixel */
377          "mtlo             %[vector_64],     $ac3                            "
378          "\n\t" /* even 3 */
379          "mthi             $zero,            $ac3                            "
380          "\n\t"
381          "preceu.ph.qbr    %[p1],            %[qload2]                       "
382          "\n\t"
383          "preceu.ph.qbl    %[p5],            %[qload2]                       "
384          "\n\t"
385          "ulw              %[qload1],        12(%[src])                      "
386          "\n\t"
387          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
388          "\n\t" /* even 1 */
389          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
390          "\n\t" /* even 1 */
391          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
392          "\n\t" /* even 1 */
393          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
394          "\n\t" /* even 1 */
395          "lbux             %[st1],           %[Temp1](%[cm])                 "
396          "\n\t" /* even 1 */
397          "extp             %[Temp2],         $ac2,           31              "
398          "\n\t" /* even 1 */
399
400          /* even 3. pixel */
401          "mtlo             %[vector_64],     $ac1                            "
402          "\n\t" /* even 4 */
403          "mthi             $zero,            $ac1                            "
404          "\n\t"
405          "preceu.ph.qbr    %[p2],            %[qload1]                       "
406          "\n\t"
407          "sb               %[st1],           0(%[dst])                       "
408          "\n\t" /* even 1 */
409          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
410          "          \n\t"
411          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
412          "\n\t" /* even 3 */
413          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
414          "\n\t" /* even 3 */
415          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
416          "\n\t" /* even 3 */
417          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
418          "\n\t" /* even 3 */
419          "extp             %[Temp3],         $ac3,           31              "
420          "\n\t" /* even 3 */
421          "lbux             %[st2],           %[Temp2](%[cm])                 "
422          "\n\t" /* even 1 */
423
424          /* even 4. pixel */
425          "mtlo             %[vector_64],     $ac2                            "
426          "\n\t" /* even 5 */
427          "mthi             $zero,            $ac2                            "
428          "\n\t"
429          "preceu.ph.qbl    %[p3],            %[qload1]                       "
430          "\n\t"
431          "sb               %[st2],           0(%[dst])                       "
432          "\n\t" /* even 2 */
433          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
434          "\n\t"
435          "ulw              %[qload2],        16(%[src])                      "
436          "\n\t"
437          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
438          "\n\t" /* even 4 */
439          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
440          "\n\t" /* even 4 */
441          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
442          "\n\t" /* even 4 */
443          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
444          "\n\t" /* even 4 */
445          "extp             %[Temp1],         $ac1,           31              "
446          "\n\t" /* even 4 */
447          "lbux             %[st3],           %[Temp3](%[cm])                 "
448          "\n\t" /* even 3 */
449
450          /* even 5. pixel */
451          "mtlo             %[vector_64],     $ac3                            "
452          "\n\t" /* even 6 */
453          "mthi             $zero,            $ac3                            "
454          "\n\t"
455          "preceu.ph.qbr    %[p4],            %[qload2]                       "
456          "\n\t"
457          "sb               %[st3],           0(%[dst])                       "
458          "\n\t" /* even 3 */
459          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
460          "\n\t"
461          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
462          "\n\t" /* even 5 */
463          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
464          "\n\t" /* even 5 */
465          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
466          "\n\t" /* even 5 */
467          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
468          "\n\t" /* even 5 */
469          "extp             %[Temp2],         $ac2,           31              "
470          "\n\t" /* even 5 */
471          "lbux             %[st1],           %[Temp1](%[cm])                 "
472          "\n\t" /* even 4 */
473
474          /* even 6. pixel */
475          "mtlo             %[vector_64],     $ac1                            "
476          "\n\t" /* even 7 */
477          "mthi             $zero,            $ac1                            "
478          "\n\t"
479          "preceu.ph.qbl    %[p1],            %[qload2]                       "
480          "\n\t"
481          "sb               %[st1],           0(%[dst])                       "
482          "\n\t" /* even 4 */
483          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
484          "\n\t"
485          "ulw              %[qload1],        20(%[src])                      "
486          "\n\t"
487          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
488          "\n\t" /* even 6 */
489          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
490          "\n\t" /* even 6 */
491          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
492          "\n\t" /* even 6 */
493          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
494          "\n\t" /* even 6 */
495          "extp             %[Temp3],         $ac3,           31              "
496          "\n\t" /* even 6 */
497          "lbux             %[st2],           %[Temp2](%[cm])                 "
498          "\n\t" /* even 5 */
499
500          /* even 7. pixel */
501          "mtlo             %[vector_64],     $ac2                            "
502          "\n\t" /* even 8 */
503          "mthi             $zero,            $ac2                            "
504          "\n\t"
505          "preceu.ph.qbr    %[p5],            %[qload1]                       "
506          "\n\t"
507          "sb               %[st2],           0(%[dst])                       "
508          "\n\t" /* even 5 */
509          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
510          "\n\t"
511          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
512          "\n\t" /* even 7 */
513          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
514          "\n\t" /* even 7 */
515          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
516          "\n\t" /* even 7 */
517          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
518          "\n\t" /* even 7 */
519          "extp             %[Temp1],         $ac1,           31              "
520          "\n\t" /* even 7 */
521          "lbux             %[st3],           %[Temp3](%[cm])                 "
522          "\n\t" /* even 6 */
523
524          /* even 8. pixel */
525          "mtlo             %[vector_64],     $ac3                            "
526          "\n\t" /* odd 1 */
527          "mthi             $zero,            $ac3                            "
528          "\n\t"
529          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
530          "\n\t" /* even 8 */
531          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
532          "\n\t" /* even 8 */
533          "sb               %[st3],           0(%[dst])                       "
534          "\n\t" /* even 6 */
535          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
536          "\n\t"
537          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
538          "\n\t" /* even 8 */
539          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
540          "\n\t" /* even 8 */
541          "extp             %[Temp2],         $ac2,           31              "
542          "\n\t" /* even 8 */
543          "lbux             %[st1],           %[Temp1](%[cm])                 "
544          "\n\t" /* even 7 */
545
546          /* ODD pixels */
547          "ulw              %[qload1],        1(%[src])                       "
548          "\n\t"
549          "ulw              %[qload2],        5(%[src])                       "
550          "\n\t"
551
552          /* odd 1. pixel */
553          "mtlo             %[vector_64],     $ac1                            "
554          "\n\t" /* odd 2 */
555          "mthi             $zero,            $ac1                            "
556          "\n\t"
557          "preceu.ph.qbr    %[p1],            %[qload1]                       "
558          "\n\t"
559          "preceu.ph.qbl    %[p2],            %[qload1]                       "
560          "\n\t"
561          "preceu.ph.qbr    %[p3],            %[qload2]                       "
562          "\n\t"
563          "preceu.ph.qbl    %[p4],            %[qload2]                       "
564          "\n\t"
565          "sb               %[st1],           0(%[dst])                       "
566          "\n\t" /* even 7 */
567          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
568          "\n\t"
569          "ulw              %[qload2],        9(%[src])                       "
570          "\n\t"
571          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
572          "\n\t" /* odd 1 */
573          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
574          "\n\t" /* odd 1 */
575          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
576          "\n\t" /* odd 1 */
577          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
578          "\n\t" /* odd 1 */
579          "extp             %[Temp3],         $ac3,           31              "
580          "\n\t" /* odd 1 */
581          "lbux             %[st2],           %[Temp2](%[cm])                 "
582          "\n\t" /* even 8 */
583
584          /* odd 2. pixel */
585          "mtlo             %[vector_64],     $ac2                            "
586          "\n\t" /* odd 3 */
587          "mthi             $zero,            $ac2                            "
588          "\n\t"
589          "preceu.ph.qbr    %[p1],            %[qload2]                       "
590          "\n\t"
591          "preceu.ph.qbl    %[p5],            %[qload2]                       "
592          "\n\t"
593          "sb               %[st2],           0(%[dst])                       "
594          "\n\t" /* even 8 */
595          "ulw              %[qload1],        13(%[src])                      "
596          "\n\t"
597          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
598          "\n\t" /* odd 2 */
599          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
600          "\n\t" /* odd 2 */
601          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
602          "\n\t" /* odd 2 */
603          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
604          "\n\t" /* odd 2 */
605          "extp             %[Temp1],         $ac1,           31              "
606          "\n\t" /* odd 2 */
607          "lbux             %[st3],           %[Temp3](%[cm])                 "
608          "\n\t" /* odd 1 */
609
610          /* odd 3. pixel */
611          "mtlo             %[vector_64],     $ac3                            "
612          "\n\t" /* odd 4 */
613          "mthi             $zero,            $ac3                            "
614          "\n\t"
615          "preceu.ph.qbr    %[p2],            %[qload1]                       "
616          "\n\t"
617          "sb               %[st3],           0(%[odd_dst])                   "
618          "\n\t" /* odd 1 */
619          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
620          "\n\t"
621          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
622          "\n\t" /* odd 3 */
623          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
624          "\n\t" /* odd 3 */
625          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
626          "\n\t" /* odd 3 */
627          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
628          "\n\t" /* odd 3 */
629          "extp             %[Temp2],         $ac2,           31              "
630          "\n\t" /* odd 3 */
631          "lbux             %[st1],           %[Temp1](%[cm])                 "
632          "\n\t" /* odd 2 */
633
634          /* odd 4. pixel */
635          "mtlo             %[vector_64],     $ac1                            "
636          "\n\t" /* odd 5 */
637          "mthi             $zero,            $ac1                            "
638          "\n\t"
639          "preceu.ph.qbl    %[p3],            %[qload1]                       "
640          "\n\t"
641          "sb               %[st1],           0(%[odd_dst])                   "
642          "\n\t" /* odd 2 */
643          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
644          "\n\t"
645          "ulw              %[qload2],        17(%[src])                      "
646          "\n\t"
647          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
648          "\n\t" /* odd 4 */
649          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
650          "\n\t" /* odd 4 */
651          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
652          "\n\t" /* odd 4 */
653          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
654          "\n\t" /* odd 4 */
655          "extp             %[Temp3],         $ac3,           31              "
656          "\n\t" /* odd 4 */
657          "lbux             %[st2],           %[Temp2](%[cm])                 "
658          "\n\t" /* odd 3 */
659
660          /* odd 5. pixel */
661          "mtlo             %[vector_64],     $ac2                            "
662          "\n\t" /* odd 6 */
663          "mthi             $zero,            $ac2                            "
664          "\n\t"
665          "preceu.ph.qbr    %[p4],            %[qload2]                       "
666          "\n\t"
667          "sb               %[st2],           0(%[odd_dst])                   "
668          "\n\t" /* odd 3 */
669          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
670          "\n\t"
671          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
672          "\n\t" /* odd 5 */
673          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
674          "\n\t" /* odd 5 */
675          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
676          "\n\t" /* odd 5 */
677          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
678          "\n\t" /* odd 5 */
679          "extp             %[Temp1],         $ac1,           31              "
680          "\n\t" /* odd 5 */
681          "lbux             %[st3],           %[Temp3](%[cm])                 "
682          "\n\t" /* odd 4 */
683
684          /* odd 6. pixel */
685          "mtlo             %[vector_64],     $ac3                            "
686          "\n\t" /* odd 7 */
687          "mthi             $zero,            $ac3                            "
688          "\n\t"
689          "preceu.ph.qbl    %[p1],            %[qload2]                       "
690          "\n\t"
691          "sb               %[st3],           0(%[odd_dst])                   "
692          "\n\t" /* odd 4 */
693          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
694          "\n\t"
695          "ulw              %[qload1],        21(%[src])                      "
696          "\n\t"
697          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
698          "\n\t" /* odd 6 */
699          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
700          "\n\t" /* odd 6 */
701          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
702          "\n\t" /* odd 6 */
703          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
704          "\n\t" /* odd 6 */
705          "extp             %[Temp2],         $ac2,           31              "
706          "\n\t" /* odd 6 */
707          "lbux             %[st1],           %[Temp1](%[cm])                 "
708          "\n\t" /* odd 5 */
709
710          /* odd 7. pixel */
711          "mtlo             %[vector_64],     $ac1                            "
712          "\n\t" /* odd 8 */
713          "mthi             $zero,            $ac1                            "
714          "\n\t"
715          "preceu.ph.qbr    %[p5],            %[qload1]                       "
716          "\n\t"
717          "sb               %[st1],           0(%[odd_dst])                   "
718          "\n\t" /* odd 5 */
719          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
720          "\n\t"
721          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
722          "\n\t" /* odd 7 */
723          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
724          "\n\t" /* odd 7 */
725          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
726          "\n\t" /* odd 7 */
727          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
728          "\n\t" /* odd 7 */
729          "extp             %[Temp3],         $ac3,           31              "
730          "\n\t" /* odd 7 */
731
732          /* odd 8. pixel */
733          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
734          "\n\t" /* odd 8 */
735          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
736          "\n\t" /* odd 8 */
737          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
738          "\n\t" /* odd 8 */
739          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
740          "\n\t" /* odd 8 */
741          "extp             %[Temp1],         $ac1,           31              "
742          "\n\t" /* odd 8 */
743
744          "lbux             %[st2],           %[Temp2](%[cm])                 "
745          "\n\t" /* odd 6 */
746          "lbux             %[st3],           %[Temp3](%[cm])                 "
747          "\n\t" /* odd 7 */
748          "lbux             %[st1],           %[Temp1](%[cm])                 "
749          "\n\t" /* odd 8 */
750
751          "sb               %[st2],           0(%[odd_dst])                   "
752          "\n\t" /* odd 6 */
753          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
754          "\n\t"
755
756          "sb               %[st3],           0(%[odd_dst])                   "
757          "\n\t" /* odd 7 */
758          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
759          "\n\t"
760
761          "sb               %[st1],           0(%[odd_dst])                   "
762          "\n\t" /* odd 8 */
763
764          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
765            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
766            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
767            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
768            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
769          : [filter12] "r"(filter12), [filter34] "r"(filter34),
770            [filter56] "r"(filter56), [filter78] "r"(filter78),
771            [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
772            [dst_pitch_2] "r"(dst_pitch_2));
773
774      src += 16;
775      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
776      odd_dst = (dst + dst_stride);
777    }
778
779    /* Next row... */
780    src_ptr += src_stride;
781
782    dst_ptr += 1;
783  }
784}
785
786static void convolve_horiz_64_transposed_dspr2(
787    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
788    int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
789  int32_t c, y;
790  const uint8_t *src;
791  uint8_t *dst;
792  uint8_t *cm = vpx_ff_cropTbl;
793  uint32_t vector_64 = 64;
794  int32_t filter12, filter34, filter56, filter78;
795  int32_t Temp1, Temp2, Temp3;
796  uint32_t qload1, qload2;
797  uint32_t p1, p2, p3, p4, p5;
798  uint32_t st1, st2, st3;
799  uint32_t dst_pitch_2 = (dst_stride << 1);
800  uint8_t *odd_dst;
801
802  filter12 = ((const int32_t *)filter_x0)[0];
803  filter34 = ((const int32_t *)filter_x0)[1];
804  filter56 = ((const int32_t *)filter_x0)[2];
805  filter78 = ((const int32_t *)filter_x0)[3];
806
807  for (y = h; y--;) {
808    /* prefetch data to cache memory */
809    prefetch_load(src_ptr + src_stride);
810    prefetch_load(src_ptr + src_stride + 32);
811    prefetch_load(src_ptr + src_stride + 64);
812
813    src = src_ptr;
814    dst = dst_ptr;
815
816    odd_dst = (dst + dst_stride);
817
818    for (c = 0; c < 4; c++) {
819      __asm__ __volatile__(
820          "ulw              %[qload1],        0(%[src])                       "
821          "\n\t"
822          "ulw              %[qload2],        4(%[src])                       "
823          "\n\t"
824
825          /* even 1. pixel */
826          "mtlo             %[vector_64],     $ac1                            "
827          "\n\t" /* even 1 */
828          "mthi             $zero,            $ac1                            "
829          "\n\t"
830          "mtlo             %[vector_64],     $ac2                            "
831          "\n\t" /* even 2 */
832          "mthi             $zero,            $ac2                            "
833          "\n\t"
834          "preceu.ph.qbr    %[p3],            %[qload2]                       "
835          "\n\t"
836          "preceu.ph.qbl    %[p4],            %[qload2]                       "
837          "\n\t"
838          "preceu.ph.qbr    %[p1],            %[qload1]                       "
839          "\n\t"
840          "preceu.ph.qbl    %[p2],            %[qload1]                       "
841          "\n\t"
842          "ulw              %[qload2],        8(%[src])                       "
843          "\n\t"
844          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
845          "\n\t" /* even 1 */
846          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
847          "\n\t" /* even 1 */
848          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
849          "\n\t" /* even 1 */
850          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
851          "\n\t" /* even 1 */
852          "extp             %[Temp1],         $ac1,           31              "
853          "\n\t" /* even 1 */
854
855          /* even 2. pixel */
856          "mtlo             %[vector_64],     $ac3                            "
857          "\n\t" /* even 3 */
858          "mthi             $zero,            $ac3                            "
859          "\n\t"
860          "preceu.ph.qbr    %[p1],            %[qload2]                       "
861          "\n\t"
862          "preceu.ph.qbl    %[p5],            %[qload2]                       "
863          "\n\t"
864          "ulw              %[qload1],        12(%[src])                      "
865          "\n\t"
866          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
867          "\n\t" /* even 1 */
868          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
869          "\n\t" /* even 1 */
870          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
871          "\n\t" /* even 1 */
872          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
873          "\n\t" /* even 1 */
874          "lbux             %[st1],           %[Temp1](%[cm])                 "
875          "\n\t" /* even 1 */
876          "extp             %[Temp2],         $ac2,           31              "
877          "\n\t" /* even 1 */
878
879          /* even 3. pixel */
880          "mtlo             %[vector_64],     $ac1                            "
881          "\n\t" /* even 4 */
882          "mthi             $zero,            $ac1                            "
883          "\n\t"
884          "preceu.ph.qbr    %[p2],            %[qload1]                       "
885          "\n\t"
886          "sb               %[st1],           0(%[dst])                       "
887          "\n\t" /* even 1 */
888          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
889          "          \n\t"
890          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
891          "\n\t" /* even 3 */
892          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
893          "\n\t" /* even 3 */
894          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
895          "\n\t" /* even 3 */
896          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
897          "\n\t" /* even 3 */
898          "extp             %[Temp3],         $ac3,           31              "
899          "\n\t" /* even 3 */
900          "lbux             %[st2],           %[Temp2](%[cm])                 "
901          "\n\t" /* even 1 */
902
903          /* even 4. pixel */
904          "mtlo             %[vector_64],     $ac2                            "
905          "\n\t" /* even 5 */
906          "mthi             $zero,            $ac2                            "
907          "\n\t"
908          "preceu.ph.qbl    %[p3],            %[qload1]                       "
909          "\n\t"
910          "sb               %[st2],           0(%[dst])                       "
911          "\n\t" /* even 2 */
912          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
913          "\n\t"
914          "ulw              %[qload2],        16(%[src])                      "
915          "\n\t"
916          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
917          "\n\t" /* even 4 */
918          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
919          "\n\t" /* even 4 */
920          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
921          "\n\t" /* even 4 */
922          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
923          "\n\t" /* even 4 */
924          "extp             %[Temp1],         $ac1,           31              "
925          "\n\t" /* even 4 */
926          "lbux             %[st3],           %[Temp3](%[cm])                 "
927          "\n\t" /* even 3 */
928
929          /* even 5. pixel */
930          "mtlo             %[vector_64],     $ac3                            "
931          "\n\t" /* even 6 */
932          "mthi             $zero,            $ac3                            "
933          "\n\t"
934          "preceu.ph.qbr    %[p4],            %[qload2]                       "
935          "\n\t"
936          "sb               %[st3],           0(%[dst])                       "
937          "\n\t" /* even 3 */
938          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
939          "\n\t"
940          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
941          "\n\t" /* even 5 */
942          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
943          "\n\t" /* even 5 */
944          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
945          "\n\t" /* even 5 */
946          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
947          "\n\t" /* even 5 */
948          "extp             %[Temp2],         $ac2,           31              "
949          "\n\t" /* even 5 */
950          "lbux             %[st1],           %[Temp1](%[cm])                 "
951          "\n\t" /* even 4 */
952
953          /* even 6. pixel */
954          "mtlo             %[vector_64],     $ac1                            "
955          "\n\t" /* even 7 */
956          "mthi             $zero,            $ac1                            "
957          "\n\t"
958          "preceu.ph.qbl    %[p1],            %[qload2]                       "
959          "\n\t"
960          "sb               %[st1],           0(%[dst])                       "
961          "\n\t" /* even 4 */
962          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
963          "\n\t"
964          "ulw              %[qload1],        20(%[src])                      "
965          "\n\t"
966          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
967          "\n\t" /* even 6 */
968          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
969          "\n\t" /* even 6 */
970          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
971          "\n\t" /* even 6 */
972          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
973          "\n\t" /* even 6 */
974          "extp             %[Temp3],         $ac3,           31              "
975          "\n\t" /* even 6 */
976          "lbux             %[st2],           %[Temp2](%[cm])                 "
977          "\n\t" /* even 5 */
978
979          /* even 7. pixel */
980          "mtlo             %[vector_64],     $ac2                            "
981          "\n\t" /* even 8 */
982          "mthi             $zero,            $ac2                            "
983          "\n\t"
984          "preceu.ph.qbr    %[p5],            %[qload1]                       "
985          "\n\t"
986          "sb               %[st2],           0(%[dst])                       "
987          "\n\t" /* even 5 */
988          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
989          "\n\t"
990          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
991          "\n\t" /* even 7 */
992          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
993          "\n\t" /* even 7 */
994          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
995          "\n\t" /* even 7 */
996          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
997          "\n\t" /* even 7 */
998          "extp             %[Temp1],         $ac1,           31              "
999          "\n\t" /* even 7 */
1000          "lbux             %[st3],           %[Temp3](%[cm])                 "
1001          "\n\t" /* even 6 */
1002
1003          /* even 8. pixel */
1004          "mtlo             %[vector_64],     $ac3                            "
1005          "\n\t" /* odd 1 */
1006          "mthi             $zero,            $ac3                            "
1007          "\n\t"
1008          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
1009          "\n\t" /* even 8 */
1010          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
1011          "\n\t" /* even 8 */
1012          "sb               %[st3],           0(%[dst])                       "
1013          "\n\t" /* even 6 */
1014          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
1015          "\n\t"
1016          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
1017          "\n\t" /* even 8 */
1018          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
1019          "\n\t" /* even 8 */
1020          "extp             %[Temp2],         $ac2,           31              "
1021          "\n\t" /* even 8 */
1022          "lbux             %[st1],           %[Temp1](%[cm])                 "
1023          "\n\t" /* even 7 */
1024
1025          /* ODD pixels */
1026          "ulw              %[qload1],        1(%[src])                       "
1027          "\n\t"
1028          "ulw              %[qload2],        5(%[src])                       "
1029          "\n\t"
1030
1031          /* odd 1. pixel */
1032          "mtlo             %[vector_64],     $ac1                            "
1033          "\n\t" /* odd 2 */
1034          "mthi             $zero,            $ac1                            "
1035          "\n\t"
1036          "preceu.ph.qbr    %[p1],            %[qload1]                       "
1037          "\n\t"
1038          "preceu.ph.qbl    %[p2],            %[qload1]                       "
1039          "\n\t"
1040          "preceu.ph.qbr    %[p3],            %[qload2]                       "
1041          "\n\t"
1042          "preceu.ph.qbl    %[p4],            %[qload2]                       "
1043          "\n\t"
1044          "sb               %[st1],           0(%[dst])                       "
1045          "\n\t" /* even 7 */
1046          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
1047          "\n\t"
1048          "ulw              %[qload2],        9(%[src])                       "
1049          "\n\t"
1050          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
1051          "\n\t" /* odd 1 */
1052          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
1053          "\n\t" /* odd 1 */
1054          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
1055          "\n\t" /* odd 1 */
1056          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
1057          "\n\t" /* odd 1 */
1058          "extp             %[Temp3],         $ac3,           31              "
1059          "\n\t" /* odd 1 */
1060          "lbux             %[st2],           %[Temp2](%[cm])                 "
1061          "\n\t" /* even 8 */
1062
1063          /* odd 2. pixel */
1064          "mtlo             %[vector_64],     $ac2                            "
1065          "\n\t" /* odd 3 */
1066          "mthi             $zero,            $ac2                            "
1067          "\n\t"
1068          "preceu.ph.qbr    %[p1],            %[qload2]                       "
1069          "\n\t"
1070          "preceu.ph.qbl    %[p5],            %[qload2]                       "
1071          "\n\t"
1072          "sb               %[st2],           0(%[dst])                       "
1073          "\n\t" /* even 8 */
1074          "ulw              %[qload1],        13(%[src])                      "
1075          "\n\t"
1076          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
1077          "\n\t" /* odd 2 */
1078          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
1079          "\n\t" /* odd 2 */
1080          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
1081          "\n\t" /* odd 2 */
1082          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
1083          "\n\t" /* odd 2 */
1084          "extp             %[Temp1],         $ac1,           31              "
1085          "\n\t" /* odd 2 */
1086          "lbux             %[st3],           %[Temp3](%[cm])                 "
1087          "\n\t" /* odd 1 */
1088
1089          /* odd 3. pixel */
1090          "mtlo             %[vector_64],     $ac3                            "
1091          "\n\t" /* odd 4 */
1092          "mthi             $zero,            $ac3                            "
1093          "\n\t"
1094          "preceu.ph.qbr    %[p2],            %[qload1]                       "
1095          "\n\t"
1096          "sb               %[st3],           0(%[odd_dst])                   "
1097          "\n\t" /* odd 1 */
1098          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1099          "\n\t"
1100          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
1101          "\n\t" /* odd 3 */
1102          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
1103          "\n\t" /* odd 3 */
1104          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
1105          "\n\t" /* odd 3 */
1106          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
1107          "\n\t" /* odd 3 */
1108          "extp             %[Temp2],         $ac2,           31              "
1109          "\n\t" /* odd 3 */
1110          "lbux             %[st1],           %[Temp1](%[cm])                 "
1111          "\n\t" /* odd 2 */
1112
1113          /* odd 4. pixel */
1114          "mtlo             %[vector_64],     $ac1                            "
1115          "\n\t" /* odd 5 */
1116          "mthi             $zero,            $ac1                            "
1117          "\n\t"
1118          "preceu.ph.qbl    %[p3],            %[qload1]                       "
1119          "\n\t"
1120          "sb               %[st1],           0(%[odd_dst])                   "
1121          "\n\t" /* odd 2 */
1122          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1123          "\n\t"
1124          "ulw              %[qload2],        17(%[src])                      "
1125          "\n\t"
1126          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
1127          "\n\t" /* odd 4 */
1128          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
1129          "\n\t" /* odd 4 */
1130          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
1131          "\n\t" /* odd 4 */
1132          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
1133          "\n\t" /* odd 4 */
1134          "extp             %[Temp3],         $ac3,           31              "
1135          "\n\t" /* odd 4 */
1136          "lbux             %[st2],           %[Temp2](%[cm])                 "
1137          "\n\t" /* odd 3 */
1138
1139          /* odd 5. pixel */
1140          "mtlo             %[vector_64],     $ac2                            "
1141          "\n\t" /* odd 6 */
1142          "mthi             $zero,            $ac2                            "
1143          "\n\t"
1144          "preceu.ph.qbr    %[p4],            %[qload2]                       "
1145          "\n\t"
1146          "sb               %[st2],           0(%[odd_dst])                   "
1147          "\n\t" /* odd 3 */
1148          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1149          "\n\t"
1150          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
1151          "\n\t" /* odd 5 */
1152          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
1153          "\n\t" /* odd 5 */
1154          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
1155          "\n\t" /* odd 5 */
1156          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
1157          "\n\t" /* odd 5 */
1158          "extp             %[Temp1],         $ac1,           31              "
1159          "\n\t" /* odd 5 */
1160          "lbux             %[st3],           %[Temp3](%[cm])                 "
1161          "\n\t" /* odd 4 */
1162
1163          /* odd 6. pixel */
1164          "mtlo             %[vector_64],     $ac3                            "
1165          "\n\t" /* odd 7 */
1166          "mthi             $zero,            $ac3                            "
1167          "\n\t"
1168          "preceu.ph.qbl    %[p1],            %[qload2]                       "
1169          "\n\t"
1170          "sb               %[st3],           0(%[odd_dst])                   "
1171          "\n\t" /* odd 4 */
1172          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1173          "\n\t"
1174          "ulw              %[qload1],        21(%[src])                      "
1175          "\n\t"
1176          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
1177          "\n\t" /* odd 6 */
1178          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
1179          "\n\t" /* odd 6 */
1180          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
1181          "\n\t" /* odd 6 */
1182          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
1183          "\n\t" /* odd 6 */
1184          "extp             %[Temp2],         $ac2,           31              "
1185          "\n\t" /* odd 6 */
1186          "lbux             %[st1],           %[Temp1](%[cm])                 "
1187          "\n\t" /* odd 5 */
1188
1189          /* odd 7. pixel */
1190          "mtlo             %[vector_64],     $ac1                            "
1191          "\n\t" /* odd 8 */
1192          "mthi             $zero,            $ac1                            "
1193          "\n\t"
1194          "preceu.ph.qbr    %[p5],            %[qload1]                       "
1195          "\n\t"
1196          "sb               %[st1],           0(%[odd_dst])                   "
1197          "\n\t" /* odd 5 */
1198          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1199          "\n\t"
1200          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
1201          "\n\t" /* odd 7 */
1202          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
1203          "\n\t" /* odd 7 */
1204          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
1205          "\n\t" /* odd 7 */
1206          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
1207          "\n\t" /* odd 7 */
1208          "extp             %[Temp3],         $ac3,           31              "
1209          "\n\t" /* odd 7 */
1210
1211          /* odd 8. pixel */
1212          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
1213          "\n\t" /* odd 8 */
1214          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
1215          "\n\t" /* odd 8 */
1216          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
1217          "\n\t" /* odd 8 */
1218          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
1219          "\n\t" /* odd 8 */
1220          "extp             %[Temp1],         $ac1,           31              "
1221          "\n\t" /* odd 8 */
1222
1223          "lbux             %[st2],           %[Temp2](%[cm])                 "
1224          "\n\t" /* odd 6 */
1225          "lbux             %[st3],           %[Temp3](%[cm])                 "
1226          "\n\t" /* odd 7 */
1227          "lbux             %[st1],           %[Temp1](%[cm])                 "
1228          "\n\t" /* odd 8 */
1229
1230          "sb               %[st2],           0(%[odd_dst])                   "
1231          "\n\t" /* odd 6 */
1232          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1233          "\n\t"
1234
1235          "sb               %[st3],           0(%[odd_dst])                   "
1236          "\n\t" /* odd 7 */
1237          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1238          "\n\t"
1239
1240          "sb               %[st1],           0(%[odd_dst])                   "
1241          "\n\t" /* odd 8 */
1242
1243          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
1244            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
1245            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
1246            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1247            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
1248          : [filter12] "r"(filter12), [filter34] "r"(filter34),
1249            [filter56] "r"(filter56), [filter78] "r"(filter78),
1250            [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
1251            [dst_pitch_2] "r"(dst_pitch_2));
1252
1253      src += 16;
1254      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
1255      odd_dst = (dst + dst_stride);
1256    }
1257
1258    /* Next row... */
1259    src_ptr += src_stride;
1260
1261    dst_ptr += 1;
1262  }
1263}
1264
1265void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
1266                               uint8_t *dst, ptrdiff_t dst_stride,
1267                               const int16_t *filter, int w, int h) {
1268  int x, y, k;
1269
1270  for (y = 0; y < h; ++y) {
1271    for (x = 0; x < w; ++x) {
1272      int sum = 0;
1273
1274      for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
1275
1276      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
1277    }
1278
1279    src += src_stride;
1280    dst += 1;
1281  }
1282}
1283
/*
 * Transposed copy: byte x of source row y is written to
 * dst[x * dst_stride + y], so each of the h source rows (w bytes wide)
 * becomes one destination column.
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int row;

  for (row = 0; row < h; ++row) {
    /* Offset of the current output byte inside the destination column. */
    ptrdiff_t out_off = 0;
    int col;

    for (col = 0; col < w; ++col) {
      dst[out_off] = src[col];
      out_off += dst_stride;
    }

    /* Next source row feeds the next destination column. */
    src += src_stride;
    ++dst;
  }
}
1297
1298void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1299                         ptrdiff_t dst_stride, const InterpKernel *filter,
1300                         int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4,
1301                         int w, int h) {
1302  const int16_t *const filter_x = filter[x0_q4];
1303  const int16_t *const filter_y = filter[y0_q4];
1304  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
1305  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
1306  uint32_t pos = 38;
1307
1308  assert(x_step_q4 == 16);
1309  assert(y_step_q4 == 16);
1310  assert(((const int32_t *)filter_x)[1] != 0x800000);
1311  assert(((const int32_t *)filter_y)[1] != 0x800000);
1312  (void)x_step_q4;
1313
1314  /* bit positon for extract from acc */
1315  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
1316                       :
1317                       : [pos] "r"(pos));
1318
1319  if (intermediate_height < h) intermediate_height = h;
1320
1321  /* copy the src to dst */
1322  if (filter_x[3] == 0x80) {
1323    copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
1324                          intermediate_height, w, intermediate_height);
1325  } else if (((const int32_t *)filter_x)[0] == 0) {
1326    vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
1327                        intermediate_height, filter_x, w, intermediate_height);
1328  } else {
1329    src -= (src_stride * 3 + 3);
1330
1331    /* prefetch data to cache memory */
1332    prefetch_load(src);
1333    prefetch_load(src + 32);
1334
1335    switch (w) {
1336      case 4:
1337        convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
1338                                          intermediate_height, filter_x,
1339                                          intermediate_height);
1340        break;
1341      case 8:
1342        convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
1343                                          intermediate_height, filter_x,
1344                                          intermediate_height);
1345        break;
1346      case 16:
1347      case 32:
1348        convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
1349                                           intermediate_height, filter_x,
1350                                           intermediate_height, (w / 16));
1351        break;
1352      case 64:
1353        prefetch_load(src + 32);
1354        convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
1355                                           intermediate_height, filter_x,
1356                                           intermediate_height);
1357        break;
1358      default:
1359        convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
1360                                  filter_x, w, intermediate_height);
1361        break;
1362    }
1363  }
1364
1365  /* copy the src to dst */
1366  if (filter_y[3] == 0x80) {
1367    copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
1368  } else if (((const int32_t *)filter_y)[0] == 0) {
1369    vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
1370                        filter_y, h, w);
1371  } else {
1372    switch (h) {
1373      case 4:
1374        convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
1375                                          dst_stride, filter_y, w);
1376        break;
1377      case 8:
1378        convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
1379                                          dst_stride, filter_y, w);
1380        break;
1381      case 16:
1382      case 32:
1383        convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
1384                                           dst_stride, filter_y, w, (h / 16));
1385        break;
1386      case 64:
1387        convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
1388                                           dst_stride, filter_y, w);
1389        break;
1390      default:
1391        convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
1392                                  filter_y, h, w);
1393        break;
1394    }
1395  }
1396}
1397
1398void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
1399                             uint8_t *dst, ptrdiff_t dst_stride,
1400                             const InterpKernel *filter, int x0_q4,
1401                             int x_step_q4, int y0_q4, int y_step_q4, int w,
1402                             int h) {
1403  int x, y;
1404  (void)filter;
1405  (void)x0_q4;
1406  (void)x_step_q4;
1407  (void)y0_q4;
1408  (void)y_step_q4;
1409
1410  /* prefetch data to cache memory */
1411  prefetch_load(src);
1412  prefetch_load(src + 32);
1413  prefetch_store(dst);
1414
1415  switch (w) {
1416    case 4: {
1417      uint32_t tp1;
1418
1419      /* 1 word storage */
1420      for (y = h; y--;) {
1421        prefetch_load(src + src_stride);
1422        prefetch_load(src + src_stride + 32);
1423        prefetch_store(dst + dst_stride);
1424
1425        __asm__ __volatile__(
1426            "ulw              %[tp1],         (%[src])      \n\t"
1427            "sw               %[tp1],         (%[dst])      \n\t" /* store */
1428
1429            : [tp1] "=&r"(tp1)
1430            : [src] "r"(src), [dst] "r"(dst));
1431
1432        src += src_stride;
1433        dst += dst_stride;
1434      }
1435      break;
1436    }
1437    case 8: {
1438      uint32_t tp1, tp2;
1439
1440      /* 2 word storage */
1441      for (y = h; y--;) {
1442        prefetch_load(src + src_stride);
1443        prefetch_load(src + src_stride + 32);
1444        prefetch_store(dst + dst_stride);
1445
1446        __asm__ __volatile__(
1447            "ulw              %[tp1],         0(%[src])      \n\t"
1448            "ulw              %[tp2],         4(%[src])      \n\t"
1449            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
1450            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
1451
1452            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
1453            : [src] "r"(src), [dst] "r"(dst));
1454
1455        src += src_stride;
1456        dst += dst_stride;
1457      }
1458      break;
1459    }
1460    case 16: {
1461      uint32_t tp1, tp2, tp3, tp4;
1462
1463      /* 4 word storage */
1464      for (y = h; y--;) {
1465        prefetch_load(src + src_stride);
1466        prefetch_load(src + src_stride + 32);
1467        prefetch_store(dst + dst_stride);
1468
1469        __asm__ __volatile__(
1470            "ulw              %[tp1],         0(%[src])      \n\t"
1471            "ulw              %[tp2],         4(%[src])      \n\t"
1472            "ulw              %[tp3],         8(%[src])      \n\t"
1473            "ulw              %[tp4],         12(%[src])     \n\t"
1474
1475            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
1476            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
1477            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
1478            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
1479
1480            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
1481              [tp4] "=&r"(tp4)
1482            : [src] "r"(src), [dst] "r"(dst));
1483
1484        src += src_stride;
1485        dst += dst_stride;
1486      }
1487      break;
1488    }
1489    case 32: {
1490      uint32_t tp1, tp2, tp3, tp4;
1491      uint32_t tp5, tp6, tp7, tp8;
1492
1493      /* 8 word storage */
1494      for (y = h; y--;) {
1495        prefetch_load(src + src_stride);
1496        prefetch_load(src + src_stride + 32);
1497        prefetch_store(dst + dst_stride);
1498
1499        __asm__ __volatile__(
1500            "ulw              %[tp1],         0(%[src])      \n\t"
1501            "ulw              %[tp2],         4(%[src])      \n\t"
1502            "ulw              %[tp3],         8(%[src])      \n\t"
1503            "ulw              %[tp4],         12(%[src])     \n\t"
1504            "ulw              %[tp5],         16(%[src])     \n\t"
1505            "ulw              %[tp6],         20(%[src])     \n\t"
1506            "ulw              %[tp7],         24(%[src])     \n\t"
1507            "ulw              %[tp8],         28(%[src])     \n\t"
1508
1509            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
1510            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
1511            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
1512            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
1513            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
1514            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
1515            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
1516            "sw               %[tp8],         28(%[dst])     \n\t" /* store */
1517
1518            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
1519              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
1520              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
1521            : [src] "r"(src), [dst] "r"(dst));
1522
1523        src += src_stride;
1524        dst += dst_stride;
1525      }
1526      break;
1527    }
1528    case 64: {
1529      uint32_t tp1, tp2, tp3, tp4;
1530      uint32_t tp5, tp6, tp7, tp8;
1531
1532      prefetch_load(src + 64);
1533      prefetch_store(dst + 32);
1534
1535      /* 16 word storage */
1536      for (y = h; y--;) {
1537        prefetch_load(src + src_stride);
1538        prefetch_load(src + src_stride + 32);
1539        prefetch_load(src + src_stride + 64);
1540        prefetch_store(dst + dst_stride);
1541        prefetch_store(dst + dst_stride + 32);
1542
1543        __asm__ __volatile__(
1544            "ulw              %[tp1],         0(%[src])      \n\t"
1545            "ulw              %[tp2],         4(%[src])      \n\t"
1546            "ulw              %[tp3],         8(%[src])      \n\t"
1547            "ulw              %[tp4],         12(%[src])     \n\t"
1548            "ulw              %[tp5],         16(%[src])     \n\t"
1549            "ulw              %[tp6],         20(%[src])     \n\t"
1550            "ulw              %[tp7],         24(%[src])     \n\t"
1551            "ulw              %[tp8],         28(%[src])     \n\t"
1552
1553            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
1554            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
1555            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
1556            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
1557            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
1558            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
1559            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
1560            "sw               %[tp8],         28(%[dst])     \n\t" /* store */
1561
1562            "ulw              %[tp1],         32(%[src])     \n\t"
1563            "ulw              %[tp2],         36(%[src])     \n\t"
1564            "ulw              %[tp3],         40(%[src])     \n\t"
1565            "ulw              %[tp4],         44(%[src])     \n\t"
1566            "ulw              %[tp5],         48(%[src])     \n\t"
1567            "ulw              %[tp6],         52(%[src])     \n\t"
1568            "ulw              %[tp7],         56(%[src])     \n\t"
1569            "ulw              %[tp8],         60(%[src])     \n\t"
1570
1571            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
1572            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
1573            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
1574            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
1575            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
1576            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
1577            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
1578            "sw               %[tp8],         60(%[dst])     \n\t" /* store */
1579
1580            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
1581              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
1582              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
1583            : [src] "r"(src), [dst] "r"(dst));
1584
1585        src += src_stride;
1586        dst += dst_stride;
1587      }
1588      break;
1589    }
1590    default:
1591      for (y = h; y--;) {
1592        for (x = 0; x < w; ++x) {
1593          dst[x] = src[x];
1594        }
1595
1596        src += src_stride;
1597        dst += dst_stride;
1598      }
1599      break;
1600  }
1601}
1602#endif
1603