1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_config.h"
15#include "./vp9_rtcd.h"
16#include "vp9/common/vp9_common.h"
17#include "vpx/vpx_integer.h"
18#include "vpx_ports/mem.h"
19#include "vp9/common/vp9_filter.h"
20#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21
22#if HAVE_DSPR2
23uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
24uint8_t *vp9_ff_cropTbl;
25
26void vp9_dsputil_static_init(void) {
27  int i;
28
29  for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i;
30
31  for (i = 0; i < CROP_WIDTH; i++) {
32    vp9_ff_cropTbl_a[i] = 0;
33    vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
34  }
35
36  vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH];
37}
38
39static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
40                                              int32_t src_stride,
41                                              uint8_t *dst,
42                                              int32_t dst_stride,
43                                              const int16_t *filter_x0,
44                                              int32_t h) {
45  int32_t y;
46  uint8_t *cm = vp9_ff_cropTbl;
47  uint8_t *dst_ptr;
48  int32_t vector1b, vector2b, vector3b, vector4b;
49  int32_t Temp1, Temp2, Temp3, Temp4;
50  uint32_t vector4a = 64;
51  uint32_t tp1, tp2;
52  uint32_t p1, p2, p3, p4;
53  uint32_t tn1, tn2;
54
55  vector1b = ((const int32_t *)filter_x0)[0];
56  vector2b = ((const int32_t *)filter_x0)[1];
57  vector3b = ((const int32_t *)filter_x0)[2];
58  vector4b = ((const int32_t *)filter_x0)[3];
59
60  for (y = h; y--;) {
61    dst_ptr = dst;
62    /* prefetch data to cache memory */
63    vp9_prefetch_load(src + src_stride);
64    vp9_prefetch_load(src + src_stride + 32);
65
66    __asm__ __volatile__ (
67        "ulw              %[tp1],         0(%[src])                      \n\t"
68        "ulw              %[tp2],         4(%[src])                      \n\t"
69
70        /* even 1. pixel */
71        "mtlo             %[vector4a],    $ac3                           \n\t"
72        "mthi             $zero,          $ac3                           \n\t"
73        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
74        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
75        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
76        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
77        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
78        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
79        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
80        "ulw              %[tn2],         8(%[src])                      \n\t"
81        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
82        "extp             %[Temp1],       $ac3,           31             \n\t"
83
84        /* even 2. pixel */
85        "mtlo             %[vector4a],    $ac2                           \n\t"
86        "mthi             $zero,          $ac2                           \n\t"
87        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
88        "balign           %[tn1],         %[tn2],         3              \n\t"
89        "balign           %[tn2],         %[tp2],         3              \n\t"
90        "balign           %[tp2],         %[tp1],         3              \n\t"
91        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
92        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
93        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
94        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
95        "extp             %[Temp3],       $ac2,           31             \n\t"
96
97        /* odd 1. pixel */
98        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
99        "mtlo             %[vector4a],    $ac3                           \n\t"
100        "mthi             $zero,          $ac3                           \n\t"
101        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
102        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
103        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
104        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
105        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
106        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
107        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
108        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
109        "extp             %[Temp2],       $ac3,           31             \n\t"
110
111        /* odd 2. pixel */
112        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
113        "mtlo             %[vector4a],    $ac2                           \n\t"
114        "mthi             $zero,          $ac2                           \n\t"
115        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
116        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
117        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
118        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
119        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
120        "extp             %[Temp4],       $ac2,           31             \n\t"
121
122        /* clamp */
123        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
124        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
125
126        /* store bytes */
127        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
128        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
129
130        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
131        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
132
133        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
134        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
135
136        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
137        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
138
139        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
140          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
141          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
142          [dst_ptr] "+r" (dst_ptr)
143        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
144          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
145          [vector4a] "r" (vector4a),
146          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
147    );
148
149    /* Next row... */
150    src += src_stride;
151    dst += 1;
152  }
153}
154
155static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
156                                              int32_t src_stride,
157                                              uint8_t *dst,
158                                              int32_t dst_stride,
159                                              const int16_t *filter_x0,
160                                              int32_t h) {
161  int32_t y;
162  uint8_t *cm = vp9_ff_cropTbl;
163  uint8_t *dst_ptr;
164  uint32_t vector4a = 64;
165  int32_t vector1b, vector2b, vector3b, vector4b;
166  int32_t Temp1, Temp2, Temp3;
167  uint32_t tp1, tp2, tp3;
168  uint32_t p1, p2, p3, p4, n1;
169  uint8_t *odd_dst;
170  uint32_t dst_pitch_2 = (dst_stride << 1);
171
172  vector1b = ((const int32_t *)filter_x0)[0];
173  vector2b = ((const int32_t *)filter_x0)[1];
174  vector3b = ((const int32_t *)filter_x0)[2];
175  vector4b = ((const int32_t *)filter_x0)[3];
176
177  for (y = h; y--;) {
178    /* prefetch data to cache memory */
179    vp9_prefetch_load(src + src_stride);
180    vp9_prefetch_load(src + src_stride + 32);
181
182    dst_ptr = dst;
183    odd_dst = (dst_ptr + dst_stride);
184
185    __asm__ __volatile__ (
186        "ulw              %[tp2],         0(%[src])                       \n\t"
187        "ulw              %[tp1],         4(%[src])                       \n\t"
188
189        /* even 1. pixel */
190        "mtlo             %[vector4a],    $ac3                            \n\t"
191        "mthi             $zero,          $ac3                            \n\t"
192        "mtlo             %[vector4a],    $ac2                            \n\t"
193        "mthi             $zero,          $ac2                            \n\t"
194        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
195        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
196        "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
197        "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
198        "ulw              %[tp3],         8(%[src])                       \n\t"
199        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
200        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
201        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
202        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
203        "extp             %[Temp1],       $ac3,           31              \n\t"
204
205        /* even 2. pixel */
206        "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
207        "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
208        "ulw              %[tp2],         12(%[src])                      \n\t"
209        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
210        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
211        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
212        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
213        "extp             %[Temp3],       $ac2,           31              \n\t"
214
215        /* even 3. pixel */
216        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
217        "mtlo             %[vector4a],    $ac1                            \n\t"
218        "mthi             $zero,          $ac1                            \n\t"
219        "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
220        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
221        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
222        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
223        "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
224        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
225        "extp             %[p3],          $ac1,           31              \n\t"
226
227        /* even 4. pixel */
228        "mtlo             %[vector4a],    $ac2                            \n\t"
229        "mthi             $zero,          $ac2                            \n\t"
230        "mtlo             %[vector4a],    $ac3                            \n\t"
231        "mthi             $zero,          $ac3                            \n\t"
232        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
233        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
234        "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
235        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
236
237        "ulw              %[tp1],         1(%[src])                       \n\t"
238        "ulw              %[tp3],         5(%[src])                       \n\t"
239
240        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
241        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
242        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
243        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
244        "extp             %[Temp3],       $ac2,           31              \n\t"
245
246        "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
247
248        /* odd 1. pixel */
249        "mtlo             %[vector4a],    $ac1                            \n\t"
250        "mthi             $zero,          $ac1                            \n\t"
251        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
252        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
253        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
254        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
255        "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
256        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
257        "ulw              %[tp2],         9(%[src])                       \n\t"
258
259        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
260        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
261        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
262        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
263        "extp             %[Temp2],       $ac3,           31              \n\t"
264
265        /* odd 2. pixel */
266        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
267        "mtlo             %[vector4a],    $ac3                            \n\t"
268        "mthi             $zero,          $ac3                            \n\t"
269        "mtlo             %[vector4a],    $ac2                            \n\t"
270        "mthi             $zero,          $ac2                            \n\t"
271        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
272        "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
273        "ulw              %[Temp1],       13(%[src])                      \n\t"
274        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
275        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
276        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
277        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
278        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
279        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
280        "extp             %[Temp3],       $ac1,           31              \n\t"
281
282        /* odd 3. pixel */
283        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
284        "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
285        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
286        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
287        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
288        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
289        "extp             %[Temp2],       $ac3,           31              \n\t"
290
291        /* odd 4. pixel */
292        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
293        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
294        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
295        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
296        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
297        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
298        "extp             %[Temp1],       $ac2,           31              \n\t"
299
300        /* clamp */
301        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
302        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
303        "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
304
305        /* store bytes */
306        "sb               %[p4],          0(%[odd_dst])                   \n\t"
307        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
308
309        "sb               %[p2],          0(%[odd_dst])                   \n\t"
310        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
311
312        "sb               %[n1],          0(%[odd_dst])                   \n\t"
313
314        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
315          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
316          [n1] "=&r" (n1),
317          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
318          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
319        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
320          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
321          [vector4a] "r" (vector4a), [cm] "r" (cm),
322          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
323    );
324
325    /* Next row... */
326    src += src_stride;
327    dst += 1;
328  }
329}
330
331static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
332                                               int32_t src_stride,
333                                               uint8_t *dst_ptr,
334                                               int32_t dst_stride,
335                                               const int16_t *filter_x0,
336                                               int32_t h,
337                                               int32_t count) {
338  int32_t c, y;
339  const uint8_t *src;
340  uint8_t *dst;
341  uint8_t *cm = vp9_ff_cropTbl;
342  uint32_t vector_64 = 64;
343  int32_t  filter12, filter34, filter56, filter78;
344  int32_t  Temp1, Temp2, Temp3;
345  uint32_t qload1, qload2;
346  uint32_t p1, p2, p3, p4, p5;
347  uint32_t st1, st2, st3;
348  uint32_t dst_pitch_2 = (dst_stride << 1);
349  uint8_t  *odd_dst;
350
351  filter12 = ((const int32_t *)filter_x0)[0];
352  filter34 = ((const int32_t *)filter_x0)[1];
353  filter56 = ((const int32_t *)filter_x0)[2];
354  filter78 = ((const int32_t *)filter_x0)[3];
355
356  for (y = h; y--;) {
357    /* prefetch data to cache memory */
358    vp9_prefetch_load(src_ptr + src_stride);
359    vp9_prefetch_load(src_ptr + src_stride + 32);
360
361    src = src_ptr;
362    dst = dst_ptr;
363
364    odd_dst = (dst + dst_stride);
365
366    for (c = 0; c < count; c++) {
367      __asm__ __volatile__ (
368          "ulw              %[qload1],        0(%[src])                       \n\t"
369          "ulw              %[qload2],        4(%[src])                       \n\t"
370
371          /* even 1. pixel */
372          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
373          "mthi             $zero,            $ac1                            \n\t"
374          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
375          "mthi             $zero,            $ac2                            \n\t"
376          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
377          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
378          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
379          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
380          "ulw              %[qload2],        8(%[src])                       \n\t"
381          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
382          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
383          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
384          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
385          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
386
387          /* even 2. pixel */
388          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
389          "mthi             $zero,            $ac3                            \n\t"
390          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
391          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
392          "ulw              %[qload1],        12(%[src])                      \n\t"
393          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
394          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
395          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
396          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
397          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
398          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
399
400          /* even 3. pixel */
401          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
402          "mthi             $zero,            $ac1                            \n\t"
403          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
404          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
405          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
406          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
407          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
408          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
409          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
410          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
411          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
412
413          /* even 4. pixel */
414          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
415          "mthi             $zero,            $ac2                            \n\t"
416          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
417          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
418          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
419          "ulw              %[qload2],        16(%[src])                      \n\t"
420          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
421          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
422          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
423          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
424          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
425          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
426
427          /* even 5. pixel */
428          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
429          "mthi             $zero,            $ac3                            \n\t"
430          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
431          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
432          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
433          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
434          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
435          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
436          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
437          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
438          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
439
440          /* even 6. pixel */
441          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
442          "mthi             $zero,            $ac1                            \n\t"
443          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
444          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
445          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
446          "ulw              %[qload1],        20(%[src])                      \n\t"
447          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
448          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
449          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
450          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
451          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
452          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
453
454          /* even 7. pixel */
455          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
456          "mthi             $zero,            $ac2                            \n\t"
457          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
458          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
459          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
460          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
461          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
462          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
463          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
464          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
465          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
466
467          /* even 8. pixel */
468          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
469          "mthi             $zero,            $ac3                            \n\t"
470          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
471          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
472          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
473          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
474          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
475          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
476          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
477          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
478
479          /* ODD pixels */
480          "ulw              %[qload1],        1(%[src])                       \n\t"
481          "ulw              %[qload2],        5(%[src])                       \n\t"
482
483          /* odd 1. pixel */
484          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
485          "mthi             $zero,            $ac1                            \n\t"
486          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
487          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
488          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
489          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
490          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
491          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
492          "ulw              %[qload2],        9(%[src])                       \n\t"
493          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
494          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
495          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
496          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
497          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
498          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
499
500          /* odd 2. pixel */
501          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
502          "mthi             $zero,            $ac2                            \n\t"
503          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
504          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
505          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
506          "ulw              %[qload1],        13(%[src])                      \n\t"
507          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
508          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
509          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
510          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
511          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
512          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
513
514          /* odd 3. pixel */
515          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
516          "mthi             $zero,            $ac3                            \n\t"
517          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
518          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
519          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
520          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
521          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
522          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
523          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
524          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
525          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
526
527          /* odd 4. pixel */
528          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
529          "mthi             $zero,            $ac1                            \n\t"
530          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
531          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
532          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
533          "ulw              %[qload2],        17(%[src])                      \n\t"
534          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
535          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
536          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
537          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
538          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
539          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
540
541          /* odd 5. pixel */
542          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
543          "mthi             $zero,            $ac2                            \n\t"
544          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
545          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
546          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
547          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
548          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
549          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
550          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
551          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
552          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
553
554          /* odd 6. pixel */
555          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
556          "mthi             $zero,            $ac3                            \n\t"
557          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
558          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
559          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
560          "ulw              %[qload1],        21(%[src])                      \n\t"
561          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
562          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
563          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
564          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
565          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
566          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
567
568          /* odd 7. pixel */
569          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
570          "mthi             $zero,            $ac1                            \n\t"
571          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
572          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
573          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
574          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
575          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
576          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
577          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
578          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
579
580          /* odd 8. pixel */
581          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
582          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
583          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
584          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
585          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
586
587          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
588          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
589          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
590
591          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
592          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
593
594          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
595          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
596
597          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
598
599          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
600            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
601            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
602            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
603            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
604          : [filter12] "r" (filter12), [filter34] "r" (filter34),
605            [filter56] "r" (filter56), [filter78] "r" (filter78),
606            [vector_64] "r" (vector_64), [cm] "r" (cm),
607            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
608      );
609
610      src += 16;
611      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
612      odd_dst = (dst + dst_stride);
613    }
614
615    /* Next row... */
616    src_ptr += src_stride;
617
618    dst_ptr += 1;
619  }
620}
621
/*
 * Horizontal 8-tap convolution of a 64-pixel-wide block, written transposed.
 *
 * For each of the h source rows the inner loop runs 4 times, and every pass
 * produces 16 output pixels in the inline assembly below: 8 "even" pixels
 * stored through dst and 8 "odd" pixels stored through odd_dst, both stepping
 * by 2 * dst_stride. Consecutive outputs of one source row therefore land
 * dst_stride bytes apart, and each new source row starts one byte further
 * along in the destination (dst_ptr += 1).
 *
 * src_ptr    - first source row; each pass loads words at offsets 0..21
 *              from its current position with ulw
 * src_stride - distance in bytes between source rows
 * dst_ptr    - start of the transposed destination
 * dst_stride - distance in bytes between consecutive outputs of one row
 * filter_x0  - filter taps, read here as four 32-bit words (tap pairs)
 * h          - number of source rows to process
 *
 * NOTE(review): extp relies on the extract position held in DSPControl; the
 * visible caller vp9_convolve8_dspr2 sets it with wrdsp before dispatching -
 * confirm any other caller does the same.
 */
622static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
623                                               int32_t src_stride,
624                                               uint8_t *dst_ptr,
625                                               int32_t dst_stride,
626                                               const int16_t *filter_x0,
627                                               int32_t h) {
628  int32_t c, y;
629  const uint8_t *src;
630  uint8_t *dst;
  /* lookup table indexed (via lbux) by each extracted sum to get the byte
   * that is stored */
631  uint8_t *cm = vp9_ff_cropTbl;
  /* every accumulator is preloaded with this value before the multiply-adds;
   * presumably the rounding term for the filter shift - TODO confirm */
632  uint32_t vector_64 = 64;
633  int32_t  filter12, filter34, filter56, filter78;
634  int32_t  Temp1, Temp2, Temp3;
635  uint32_t qload1, qload2;
636  uint32_t p1, p2, p3, p4, p5;
637  uint32_t st1, st2, st3;
  /* even and odd outputs interleave, so each stream advances two strides */
638  uint32_t dst_pitch_2 = (dst_stride << 1);
639  uint8_t  *odd_dst;
640
  /* fetch the taps two at a time as 32-bit words */
641  filter12 = ((const int32_t *)filter_x0)[0];
642  filter34 = ((const int32_t *)filter_x0)[1];
643  filter56 = ((const int32_t *)filter_x0)[2];
644  filter78 = ((const int32_t *)filter_x0)[3];
645
646  for (y = h; y--;) {
647    /* prefetch data to cache memory */
648    vp9_prefetch_load(src_ptr + src_stride);
649    vp9_prefetch_load(src_ptr + src_stride + 32);
650    vp9_prefetch_load(src_ptr + src_stride + 64);
651
652    src = src_ptr;
653    dst = dst_ptr;
654
    /* odd-indexed outputs start one dst_stride after the even ones */
655    odd_dst = (dst + dst_stride);
656
    /* 4 passes of 16 output pixels each cover the 64-pixel row */
657    for (c = 0; c < 4; c++) {
      /* The stages below are interleaved: while one accumulator is being
       * filled, the previous result is looked up in cm and the one before
       * that is stored, so the trailing stage labels name the pixel each
       * instruction belongs to rather than the enclosing section. */
658      __asm__ __volatile__ (
659          "ulw              %[qload1],        0(%[src])                       \n\t"
660          "ulw              %[qload2],        4(%[src])                       \n\t"
661
662          /* even 1. pixel */
663          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
664          "mthi             $zero,            $ac1                            \n\t"
665          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
666          "mthi             $zero,            $ac2                            \n\t"
667          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
668          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
669          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
670          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
671          "ulw              %[qload2],        8(%[src])                       \n\t"
672          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
673          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
674          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
675          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
676          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
677
678          /* even 2. pixel */
679          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
680          "mthi             $zero,            $ac3                            \n\t"
681          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
682          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
683          "ulw              %[qload1],        12(%[src])                      \n\t"
684          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 2 */
685          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 2 */
686          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 2 */
687          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 2 */
688          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
689          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 2 */
690
691          /* even 3. pixel */
692          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
693          "mthi             $zero,            $ac1                            \n\t"
694          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
695          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
696          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
697          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
698          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
699          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
700          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
701          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
702          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 2 */
703
704          /* even 4. pixel */
705          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
706          "mthi             $zero,            $ac2                            \n\t"
707          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
708          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
709          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
710          "ulw              %[qload2],        16(%[src])                      \n\t"
711          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
712          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
713          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
714          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
715          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
716          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
717
718          /* even 5. pixel */
719          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
720          "mthi             $zero,            $ac3                            \n\t"
721          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
722          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
723          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
724          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
725          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
726          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
727          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
728          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
729          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
730
731          /* even 6. pixel */
732          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
733          "mthi             $zero,            $ac1                            \n\t"
734          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
735          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
736          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
737          "ulw              %[qload1],        20(%[src])                      \n\t"
738          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
739          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
740          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
741          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
742          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
743          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
744
745          /* even 7. pixel */
746          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
747          "mthi             $zero,            $ac2                            \n\t"
748          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
749          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
750          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
751          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
752          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
753          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
754          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
755          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
756          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
757
758          /* even 8. pixel */
759          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
760          "mthi             $zero,            $ac3                            \n\t"
761          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
762          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
763          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
764          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
765          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
766          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
767          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
768          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
769
770          /* ODD pixels */
771          "ulw              %[qload1],        1(%[src])                       \n\t"
772          "ulw              %[qload2],        5(%[src])                       \n\t"
773
774          /* odd 1. pixel */
775          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
776          "mthi             $zero,            $ac1                            \n\t"
777          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
778          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
779          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
780          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
781          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
782          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
783          "ulw              %[qload2],        9(%[src])                       \n\t"
784          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
785          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
786          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
787          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
788          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
789          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
790
791          /* odd 2. pixel */
792          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
793          "mthi             $zero,            $ac2                            \n\t"
794          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
795          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
796          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
797          "ulw              %[qload1],        13(%[src])                      \n\t"
798          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
799          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
800          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
801          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
802          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
803          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
804
805          /* odd 3. pixel */
806          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
807          "mthi             $zero,            $ac3                            \n\t"
808          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
809          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
810          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
811          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
812          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
813          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
814          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
815          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
816          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
817
818          /* odd 4. pixel */
819          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
820          "mthi             $zero,            $ac1                            \n\t"
821          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
822          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
823          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
824          "ulw              %[qload2],        17(%[src])                      \n\t"
825          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
826          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
827          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
828          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
829          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
830          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
831
832          /* odd 5. pixel */
833          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
834          "mthi             $zero,            $ac2                            \n\t"
835          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
836          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
837          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
838          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
839          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
840          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
841          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
842          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
843          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
844
845          /* odd 6. pixel */
846          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
847          "mthi             $zero,            $ac3                            \n\t"
848          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
849          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
850          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
851          "ulw              %[qload1],        21(%[src])                      \n\t"
852          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
853          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
854          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
855          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
856          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
857          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
858
859          /* odd 7. pixel */
860          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
861          "mthi             $zero,            $ac1                            \n\t"
862          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
863          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
864          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
865          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
866          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
867          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
868          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
869          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
870
871          /* odd 8. pixel */
872          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
873          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
874          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
875          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
876          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
877
878          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
879          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
880          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
881
882          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
883          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
884
885          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
886          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
887
888          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
889
890          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
891            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
892            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
893            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
894            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
895          : [filter12] "r" (filter12), [filter34] "r" (filter34),
896            [filter56] "r" (filter56), [filter78] "r" (filter78),
897            [vector_64] "r" (vector_64), [cm] "r" (cm),
898            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
899      );
900
      /* advance to the next 16 source pixels; dst is recomputed from dst_ptr
       * because the asm moved it while storing the even outputs */
901      src += 16;
902      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
903      odd_dst = (dst + dst_stride);
904    }
905
906    /* Next row... */
907    src_ptr += src_stride;
908
    /* transposed output: the next source row starts one byte further along */
909    dst_ptr += 1;
910  }
911}
912
913void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
914                               uint8_t *dst, ptrdiff_t dst_stride,
915                               const int16_t *filter, int w, int h) {
916  int x, y, k;
917
918  for (y = 0; y < h; ++y) {
919    for (x = 0; x < w; ++x) {
920      int sum = 0;
921
922      for (k = 0; k < 8; ++k)
923        sum += src[x + k] * filter[k];
924
925      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
926    }
927
928    src += src_stride;
929    dst += 1;
930  }
931}
932
/*
 * Copies a w x h block from src to dst with rows and columns swapped:
 * the byte at column `col` of source row `row` is written to
 * dst[col * dst_stride + row].
 *
 * src/src_stride - source block and the distance between its rows
 * dst/dst_stride - destination and the distance between its rows
 * w, h           - width and height of the source block
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           int w, int h) {
  int row, col;

  for (row = 0; row < h; ++row) {
    const uint8_t *in_row = src + row * src_stride;
    /* each source row fills one column of the destination */
    uint8_t *out_col = dst + row;

    for (col = 0; col < w; ++col) {
      out_col[col * dst_stride] = in_row[col];
    }
  }
}
947
948void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
949                         uint8_t *dst, ptrdiff_t dst_stride,
950                         const int16_t *filter_x, int x_step_q4,
951                         const int16_t *filter_y, int y_step_q4,
952                         int w, int h) {
953  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
954  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
955  uint32_t pos = 38;
956
957  /* bit position for extract from acc */
958  __asm__ __volatile__ (
959    "wrdsp      %[pos],     1           \n\t"
960    :
961    : [pos] "r" (pos)
962  );
963
964  if (intermediate_height < h)
965    intermediate_height = h;
966
967  if (x_step_q4 != 16 || y_step_q4 != 16)
968    return vp9_convolve8_c(src, src_stride,
969                           dst, dst_stride,
970                           filter_x, x_step_q4,
971                           filter_y, y_step_q4,
972                           w, h);
973
974  if ((((const int32_t *)filter_x)[1] == 0x800000)
975      && (((const int32_t *)filter_y)[1] == 0x800000))
976    return vp9_convolve_copy(src, src_stride,
977                             dst, dst_stride,
978                             filter_x, x_step_q4,
979                             filter_y, y_step_q4,
980                             w, h);
981
982  /* copy the src to dst */
983  if (filter_x[3] == 0x80) {
984    copy_horiz_transposed(src - src_stride * 3, src_stride,
985                          temp, intermediate_height,
986                          w, intermediate_height);
987  } else if (((const int32_t *)filter_x)[0] == 0) {
988    vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
989                        temp, intermediate_height,
990                        filter_x,
991                        w, intermediate_height);
992  } else {
993    src -= (src_stride * 3 + 3);
994
995    /* prefetch data to cache memory */
996    vp9_prefetch_load(src);
997    vp9_prefetch_load(src + 32);
998
999    switch (w) {
1000      case 4:
1001        convolve_horiz_4_transposed_dspr2(src, src_stride,
1002                                          temp, intermediate_height,
1003                                          filter_x, intermediate_height);
1004        break;
1005      case 8:
1006        convolve_horiz_8_transposed_dspr2(src, src_stride,
1007                                          temp, intermediate_height,
1008                                          filter_x, intermediate_height);
1009        break;
1010      case 16:
1011      case 32:
1012        convolve_horiz_16_transposed_dspr2(src, src_stride,
1013                                           temp, intermediate_height,
1014                                           filter_x, intermediate_height,
1015                                           (w/16));
1016        break;
1017      case 64:
1018        vp9_prefetch_load(src + 32);
1019        convolve_horiz_64_transposed_dspr2(src, src_stride,
1020                                           temp, intermediate_height,
1021                                           filter_x, intermediate_height);
1022        break;
1023      default:
1024        convolve_horiz_transposed(src, src_stride,
1025                                  temp, intermediate_height,
1026                                  filter_x, w, intermediate_height);
1027        break;
1028    }
1029  }
1030
1031  /* copy the src to dst */
1032  if (filter_y[3] == 0x80) {
1033    copy_horiz_transposed(temp + 3, intermediate_height,
1034                          dst, dst_stride,
1035                          h, w);
1036  } else if (((const int32_t *)filter_y)[0] == 0) {
1037    vp9_convolve2_dspr2(temp + 3, intermediate_height,
1038                        dst, dst_stride,
1039                        filter_y,
1040                        h, w);
1041  } else {
1042    switch (h) {
1043      case 4:
1044        convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
1045                                          dst, dst_stride,
1046                                          filter_y, w);
1047        break;
1048      case 8:
1049        convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
1050                                          dst, dst_stride,
1051                                          filter_y, w);
1052        break;
1053      case 16:
1054      case 32:
1055        convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
1056                                           dst, dst_stride,
1057                                           filter_y, w, (h/16));
1058        break;
1059      case 64:
1060        convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
1061                                           dst, dst_stride,
1062                                           filter_y, w);
1063        break;
1064      default:
1065        convolve_horiz_transposed(temp, intermediate_height,
1066                                  dst, dst_stride,
1067                                  filter_y, h, w);
1068        break;
1069    }
1070  }
1071}
1072
/*
 * Copies a w x h block of bytes from src to dst, one row at a time.
 *
 * Widths 4, 8, 16, 32 and 64 use unrolled inline assembly that reads each
 * row with unaligned word loads (ulw) and writes it with word stores (sw).
 * Any other width falls back to a plain byte-by-byte C loop.
 *
 *   src, src_stride   source block and the offset between consecutive rows
 *   dst, dst_stride   destination block and the offset between its rows
 *   filter_x, filter_x_stride, filter_y, filter_y_stride
 *                     not referenced by this function
 *   w, h              block width and height in bytes / rows
 *
 * NOTE(review): the fast paths store with `sw`, so dst rows are presumably
 * expected to be word aligned for those widths - confirm against callers.
 *
 * Every asm statement carries a "memory" clobber: the loads and stores go
 * through pointers that are passed only as register inputs, so without the
 * clobber the compiler is not told that *src is read and *dst is written
 * and may cache or reorder ordinary memory accesses across the asm.
 */
void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  /* prefetch the first source and destination row */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      {
      uint32_t tp1;

      /* 1 word per row */
      for (y = h; y--; ) {
        /* prefetch the next row while the current one is copied */
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    case 8:
      {
      uint32_t tp1, tp2;

      /* 2 words per row */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    case 16:
      {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 words per row */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    case 32:
      {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 words per row */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    case 64:
      {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* a 64-byte row reaches further than the prefetches issued above */
      vp9_prefetch_load(src + 64);
      vp9_prefetch_store(dst + 32);

      /* 16 words per row, copied as two groups of 8 reusing tp1..tp8 */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_load(src + src_stride + 64);
        vp9_prefetch_store(dst + dst_stride);
        vp9_prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
            "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
            "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
            "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
            "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
            "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
            "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
            "sw               %[tp8],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    default:
      /* generic width: plain byte copy, no assembly */
      for (y = h; y--; ) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
1284#endif
1285