convolve8_horiz_dspr2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/mips/convolve_common_dspr2.h"
16#include "vpx_dsp/vpx_dsp_common.h"
17#include "vpx_dsp/vpx_filter.h"
18#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
/*
 * Horizontal 8-tap convolution producing 4 output pixels per row, written
 * with MIPS DSP ASE (DSPr2) inline assembly.
 *
 * For each of the h rows the asm block loads src[0..11] with three unaligned
 * word loads, forms four sums of eight pixel-by-tap products in the DSP
 * accumulators $ac2/$ac3 (each pre-loaded with 64), extracts each sum with
 * extp, maps it through the vpx_ff_cropTbl lookup table (the "clamp" step)
 * and stores the four resulting bytes to dst[0..3].
 *
 * Parameters:
 *   src        - first byte read for the current row.
 *   src_stride - added to src after every row.
 *   dst        - first byte written for the current row.
 *   dst_stride - added to dst after every row.
 *   filter_x0  - eight int16_t taps, read here as four 32-bit words.
 *   h          - number of rows to process.
 *
 * NOTE(review): extp takes its extraction position from the DSPControl pos
 * field, which this function never sets - presumably the caller configures
 * it (and the 64 is the matching rounding term); confirm against the caller.
 * NOTE(review): filter_x0 is dereferenced through a const int32_t pointer,
 * which assumes the tap array is 32-bit aligned - TODO confirm.
 * NOTE(review): the asm statement stores to dst and overwrites $ac2/$ac3 but
 * declares no clobbers ("memory" or accumulators); verify this is safe for
 * the compilers/flags in use.
 */
21static void convolve_horiz_4_dspr2(const uint8_t *src,
22                                   int32_t src_stride,
23                                   uint8_t *dst,
24                                   int32_t dst_stride,
25                                   const int16_t *filter_x0,
26                                   int32_t h) {
27  int32_t y;
28  uint8_t *cm = vpx_ff_cropTbl;
29  int32_t vector1b, vector2b, vector3b, vector4b;
30  int32_t Temp1, Temp2, Temp3, Temp4;
31  uint32_t vector4a = 64;
32  uint32_t tp1, tp2;
33  uint32_t p1, p2, p3, p4;
34  uint32_t n1, n2, n3, n4;
35  uint32_t tn1, tn2;
36
  /* Each vectorNb holds one pair of adjacent 16-bit taps, the operand shape
   * dpa.w.ph expects (two halfword products accumulated per instruction). */
37  vector1b = ((const int32_t *)filter_x0)[0];
38  vector2b = ((const int32_t *)filter_x0)[1];
39  vector3b = ((const int32_t *)filter_x0)[2];
40  vector4b = ((const int32_t *)filter_x0)[3];
41
  /* Counts y down from h; the body runs exactly h times for h > 0. */
42  for (y = h; y--;) {
43    /* prefetch data to cache memory */
44    prefetch_load(src + src_stride);
45    prefetch_load(src + src_stride + 32);
46    prefetch_store(dst + dst_stride);
47
    /* Outputs are computed in the order dst[0], dst[2] ("even"), then
     * dst[1], dst[3] ("odd"); lookups of earlier results are interleaved
     * with the accumulation of later ones. */
48    __asm__ __volatile__ (
49        "ulw              %[tp1],      0(%[src])                      \n\t"
50        "ulw              %[tp2],      4(%[src])                      \n\t"
51
52        /* even 1. pixel */
53        "mtlo             %[vector4a], $ac3                           \n\t"
54        "mthi             $zero,       $ac3                           \n\t"
55        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
56        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
57        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
58        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
59        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
60        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
61        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
62        "ulw              %[tn2],      8(%[src])                      \n\t"
63        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
64        "extp             %[Temp1],    $ac3,           31             \n\t"
65
66        /* even 2. pixel */
67        "mtlo             %[vector4a], $ac2                           \n\t"
68        "mthi             $zero,       $ac2                           \n\t"
69        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
        /* The three balign ops below rebuild tn1/tn2/tp2 from neighbouring
         * words so the odd pixels can reuse the same unpack sequence on a
         * window shifted by one byte (byte order assumed little-endian -
         * TODO confirm). tn1 has no prior value here; only its two low
         * bytes are consumed later (preceu.ph.qbr in "odd 2"). */
70        "balign           %[tn1],      %[tn2],         3              \n\t"
71        "balign           %[tn2],      %[tp2],         3              \n\t"
72        "balign           %[tp2],      %[tp1],         3              \n\t"
73        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
74        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
75        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
76        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
77        "extp             %[Temp3],    $ac2,           31             \n\t"
78
79        /* odd 1. pixel */
80        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
81        "mtlo             %[vector4a], $ac3                           \n\t"
82        "mthi             $zero,       $ac3                           \n\t"
83        "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
84        "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
85        "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
86        "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
87        "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
88        "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
89        "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
90        "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
91        "extp             %[Temp2],    $ac3,           31             \n\t"
92
93        /* odd 2. pixel */
94        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
95        "mtlo             %[vector4a], $ac2                           \n\t"
96        "mthi             $zero,       $ac2                           \n\t"
97        "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
98        "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
99        "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
100        "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
101        "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
102        "extp             %[Temp4],    $ac2,           31             \n\t"
103
104        /* clamp */
105        "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
106        "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
107
108        /* store bytes */
109        "sb               %[tp1],      0(%[dst])                      \n\t"
110        "sb               %[tn1],      1(%[dst])                      \n\t"
111        "sb               %[tp2],      2(%[dst])                      \n\t"
112        "sb               %[n2],       3(%[dst])                      \n\t"
113
        /* All scratch registers are early-clobber outputs ("=&r") so they
         * are never allocated on top of the read-only inputs below. */
114        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
115          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
116          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
117          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
118          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
119          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
120        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
121          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
122          [vector4a] "r" (vector4a),
123          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
124    );
125
126    /* Next row... */
127    src += src_stride;
128    dst += dst_stride;
129  }
130}
131
/*
 * Horizontal 8-tap convolution producing 8 output pixels per row, written
 * with MIPS DSP ASE (DSPr2) inline assembly.
 *
 * For each of the h rows the asm block loads src[0..15] with four unaligned
 * word loads and writes dst[0..7]. The even outputs (dst[0], dst[2], dst[4],
 * dst[6]) are computed first, then the odd outputs (dst[1], dst[3], dst[5],
 * dst[7]). Every sum starts from an accumulator pre-loaded with 64, is
 * extracted with extp, and is mapped through the vpx_ff_cropTbl lookup table
 * before being stored.
 *
 * The work is interleaved across $ac1..$ac3: an accumulator is sometimes
 * seeded (mtlo/mthi) in the section before the one that accumulates into it,
 * and table lookups/stores for earlier pixels are mixed into later sections,
 * so the "even N"/"odd N" labels are approximate markers only.
 *
 * Parameters:
 *   src        - first byte read for the current row.
 *   src_stride - added to src after every row.
 *   dst        - first byte written for the current row.
 *   dst_stride - added to dst after every row.
 *   filter_x0  - eight int16_t taps, read here as four 32-bit words.
 *   h          - number of rows to process.
 *
 * NOTE(review): extp takes its extraction position from the DSPControl pos
 * field, which this function never sets - presumably the caller configures
 * it (and the 64 is the matching rounding term); confirm against the caller.
 * NOTE(review): filter_x0 is dereferenced through a const int32_t pointer,
 * which assumes the tap array is 32-bit aligned - TODO confirm.
 * NOTE(review): the asm statement stores to dst and overwrites $ac1..$ac3
 * but declares no clobbers ("memory" or accumulators); verify this is safe
 * for the compilers/flags in use.
 */
132static void convolve_horiz_8_dspr2(const uint8_t *src,
133                                   int32_t src_stride,
134                                   uint8_t *dst,
135                                   int32_t dst_stride,
136                                   const int16_t *filter_x0,
137                                   int32_t h) {
138  int32_t y;
139  uint8_t *cm = vpx_ff_cropTbl;
140  uint32_t vector4a = 64;
141  int32_t vector1b, vector2b, vector3b, vector4b;
142  int32_t Temp1, Temp2, Temp3;
143  uint32_t tp1, tp2;
144  uint32_t p1, p2, p3, p4, n1;
145  uint32_t tn1, tn2, tn3;
146  uint32_t st0, st1;
147
  /* Each vectorNb holds one pair of adjacent 16-bit taps, the operand shape
   * dpa.w.ph expects (two halfword products accumulated per instruction). */
148  vector1b = ((const int32_t *)filter_x0)[0];
149  vector2b = ((const int32_t *)filter_x0)[1];
150  vector3b = ((const int32_t *)filter_x0)[2];
151  vector4b = ((const int32_t *)filter_x0)[3];
152
  /* Counts y down from h; the body runs exactly h times for h > 0. */
153  for (y = h; y--;) {
154    /* prefetch data to cache memory */
155    prefetch_load(src + src_stride);
156    prefetch_load(src + src_stride + 32);
157    prefetch_store(dst + dst_stride);
158
159    __asm__ __volatile__ (
160        "ulw              %[tp1],      0(%[src])                      \n\t"
161        "ulw              %[tp2],      4(%[src])                      \n\t"
162
163        /* even 1. pixel */
164        "mtlo             %[vector4a], $ac3                           \n\t"
165        "mthi             $zero,       $ac3                           \n\t"
166        "mtlo             %[vector4a], $ac2                           \n\t"
167        "mthi             $zero,       $ac2                           \n\t"
168        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
169        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
170        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
171        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
172        "ulw              %[tn2],      8(%[src])                      \n\t"
173        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
174        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
175        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
176        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
177        "extp             %[Temp1],    $ac3,           31             \n\t"
178
179        /* even 2. pixel */
180        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
181        "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
182        "ulw              %[tn1],      12(%[src])                     \n\t"
183        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
184        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
185        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
186        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
187        "extp             %[Temp3],    $ac2,           31             \n\t"
188
189        /* even 3. pixel */
190        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
191        "mtlo             %[vector4a], $ac1                           \n\t"
192        "mthi             $zero,       $ac1                           \n\t"
193        "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
194        "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
195        "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
196        "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
197        "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
198        "extp             %[Temp1],    $ac1,           31             \n\t"
199
200        /* even 4. pixel */
201        "mtlo             %[vector4a], $ac2                           \n\t"
202        "mthi             $zero,       $ac2                           \n\t"
203        "mtlo             %[vector4a], $ac3                           \n\t"
204        "mthi             $zero,       $ac3                           \n\t"
205        "sb               %[st0],      0(%[dst])                      \n\t"
206        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
207
        /* The four balign ops below rebuild tn3/tn1/tn2/tp2 from
         * neighbouring words so the odd pixels can reuse the same unpack
         * sequence on a window shifted by one byte (byte order assumed
         * little-endian - TODO confirm). tn3 has no prior value here; only
         * its two low bytes are consumed later (preceu.ph.qbr in "odd 3"). */
208        "balign           %[tn3],      %[tn1],         3              \n\t"
209        "balign           %[tn1],      %[tn2],         3              \n\t"
210        "balign           %[tn2],      %[tp2],         3              \n\t"
211        "balign           %[tp2],      %[tp1],         3              \n\t"
212
213        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
214        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
215        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
216        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
217        "extp             %[Temp3],    $ac2,           31             \n\t"
218
219        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
220
221        /* odd 1. pixel */
        /* $ac3 (seeded in the previous section) accumulates this pixel;
         * $ac1 is seeded here for the next one. */
222        "mtlo             %[vector4a], $ac1                           \n\t"
223        "mthi             $zero,       $ac1                           \n\t"
224        "sb               %[st1],      2(%[dst])                      \n\t"
225        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
226        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
227        "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
228        "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
229        "sb               %[st0],      4(%[dst])                      \n\t"
230        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
231        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
232        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
233        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
234        "extp             %[Temp2],    $ac3,           31             \n\t"
235
236        /* odd 2. pixel */
        /* $ac1 accumulates this pixel; $ac3 and $ac2 are re-seeded here for
         * the two sections that follow. */
237        "mtlo             %[vector4a], $ac3                           \n\t"
238        "mthi             $zero,       $ac3                           \n\t"
239        "mtlo             %[vector4a], $ac2                           \n\t"
240        "mthi             $zero,       $ac2                           \n\t"
241        "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
242        "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
243        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
244        "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
245        "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
246        "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
247        "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
248        "extp             %[Temp3],    $ac1,           31             \n\t"
249
250        /* odd 3. pixel */
251        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
252        "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
253        "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
254        "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
255        "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
256        "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
257        "extp             %[Temp2],    $ac3,           31             \n\t"
258
259        /* odd 4. pixel */
260        "sb               %[st1],      1(%[dst])                      \n\t"
261        "sb               %[st0],      6(%[dst])                      \n\t"
262        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
263        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
264        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
265        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
266        "extp             %[Temp1],    $ac2,           31             \n\t"
267
268        /* clamp */
        /* p4/p2/n1 are reused as plain byte holders for the last three
         * results; their pixel-pair contents are no longer needed. */
269        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
270        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
271        "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
272
273        /* store bytes */
274        "sb               %[p4],       3(%[dst])                      \n\t"
275        "sb               %[p2],       5(%[dst])                      \n\t"
276        "sb               %[n1],       7(%[dst])                      \n\t"
277
        /* All scratch registers are early-clobber outputs ("=&r") so they
         * are never allocated on top of the read-only inputs below. */
278        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
279          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
280          [st0] "=&r" (st0), [st1] "=&r" (st1),
281          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
282          [n1] "=&r" (n1),
283          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
284        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
285          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
286          [vector4a] "r" (vector4a),
287          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
288    );
289
290    /* Next row... */
291    src += src_stride;
292    dst += dst_stride;
293  }
294}
295
296static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
297                                    int32_t src_stride,
298                                    uint8_t *dst_ptr,
299                                    int32_t dst_stride,
300                                    const int16_t *filter_x0,
301                                    int32_t h,
302                                    int32_t count) {
303  int32_t y, c;
304  const uint8_t *src;
305  uint8_t *dst;
306  uint8_t *cm = vpx_ff_cropTbl;
307  uint32_t vector_64 = 64;
308  int32_t filter12, filter34, filter56, filter78;
309  int32_t Temp1, Temp2, Temp3;
310  uint32_t qload1, qload2, qload3;
311  uint32_t p1, p2, p3, p4, p5;
312  uint32_t st1, st2, st3;
313
314  filter12 = ((const int32_t *)filter_x0)[0];
315  filter34 = ((const int32_t *)filter_x0)[1];
316  filter56 = ((const int32_t *)filter_x0)[2];
317  filter78 = ((const int32_t *)filter_x0)[3];
318
319  for (y = h; y--;) {
320    src = src_ptr;
321    dst = dst_ptr;
322
323    /* prefetch data to cache memory */
324    prefetch_load(src_ptr + src_stride);
325    prefetch_load(src_ptr + src_stride + 32);
326    prefetch_store(dst_ptr + dst_stride);
327
328    for (c = 0; c < count; c++) {
329      __asm__ __volatile__ (
330          "ulw              %[qload1],    0(%[src])                    \n\t"
331          "ulw              %[qload2],    4(%[src])                    \n\t"
332
333          /* even 1. pixel */
334          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
335          "mthi             $zero,        $ac1                         \n\t"
336          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
337          "mthi             $zero,        $ac2                         \n\t"
338          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
339          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
340          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
341          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
342          "ulw              %[qload3],    8(%[src])                    \n\t"
343          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
344          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
345          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
346          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
347          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
348
349          /* even 2. pixel */
350          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
351          "mthi             $zero,        $ac3                         \n\t"
352          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
353          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
354          "ulw              %[qload1],    12(%[src])                   \n\t"
355          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
356          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
357          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
358          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
359          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
360          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
361
362          /* even 3. pixel */
363          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
364          "mthi             $zero,        $ac1                         \n\t"
365          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
366          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
367          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
368          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
369          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
370          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
371          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
372          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
373
374          /* even 4. pixel */
375          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
376          "mthi             $zero,        $ac2                         \n\t"
377          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
378          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
379          "ulw              %[qload2],    16(%[src])                   \n\t"
380          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
381          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
382          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
383          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
384          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
385          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
386
387          /* even 5. pixel */
388          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
389          "mthi             $zero,        $ac3                         \n\t"
390          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
391          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
392          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
393          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
394          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
395          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
396          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
397          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
398
399          /* even 6. pixel */
400          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
401          "mthi             $zero,        $ac1                         \n\t"
402          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
403          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
404          "ulw              %[qload3],    20(%[src])                   \n\t"
405          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
406          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
407          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
408          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
409          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
410          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
411
412          /* even 7. pixel */
413          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
414          "mthi             $zero,        $ac2                         \n\t"
415          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
416          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
417          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
418          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
419          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
420          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
421          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
422          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
423
424          /* even 8. pixel */
425          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
426          "mthi             $zero,        $ac3                         \n\t"
427          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
428          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
429          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
430          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
431          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
432          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
433          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
434
435          /* ODD pixels */
436          "ulw              %[qload1],    1(%[src])                    \n\t"
437          "ulw              %[qload2],    5(%[src])                    \n\t"
438
439          /* odd 1. pixel */
440          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
441          "mthi             $zero,        $ac1                         \n\t"
442          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
443          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
444          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
445          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
446          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
447          "ulw              %[qload3],    9(%[src])                    \n\t"
448          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
449          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
450          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
451          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
452          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
453          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
454
455          /* odd 2. pixel */
456          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
457          "mthi             $zero,        $ac2                         \n\t"
458          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
459          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
460          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
461          "ulw              %[qload1],    13(%[src])                   \n\t"
462          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
463          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
464          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
465          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
466          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
467          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
468
469          /* odd 3. pixel */
470          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
471          "mthi             $zero,        $ac3                         \n\t"
472          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
473          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
474          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
475          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
476          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
477          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
478          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
479          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
480
481          /* odd 4. pixel */
482          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
483          "mthi             $zero,        $ac1                         \n\t"
484          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
485          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
486          "ulw              %[qload2],    17(%[src])                   \n\t"
487          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
488          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
489          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
490          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
491          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
492          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
493
494          /* odd 5. pixel */
495          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
496          "mthi             $zero,        $ac2                         \n\t"
497          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
498          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
499          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
500          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
501          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
502          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
503          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
504          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
505
506          /* odd 6. pixel */
507          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
508          "mthi             $zero,        $ac3                         \n\t"
509          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
510          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
511          "ulw              %[qload3],    21(%[src])                   \n\t"
512          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
513          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
514          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
515          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
516          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
517          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
518
519          /* odd 7. pixel */
520          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
521          "mthi             $zero,        $ac1                         \n\t"
522          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
523          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
524          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
525          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
526          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
527          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
528          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
529
530          /* odd 8. pixel */
531          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
532          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
533          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
534          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
535          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
536
537          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
538          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
539          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
540
541          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
542          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
543          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
544
545          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
546            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
547            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
548            [p5] "=&r" (p5),
549            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
550          : [filter12] "r" (filter12), [filter34] "r" (filter34),
551            [filter56] "r" (filter56), [filter78] "r" (filter78),
552            [vector_64] "r" (vector_64),
553            [cm] "r" (cm), [dst] "r" (dst),
554            [src] "r" (src)
555      );
556
557      src += 16;
558      dst += 16;
559    }
560
561    /* Next row... */
562    src_ptr += src_stride;
563    dst_ptr += dst_stride;
564  }
565}
566
/*
 * Horizontal 8-tap convolution of a block that is 64 pixels wide and h rows
 * tall, written with MIPS DSPr2 inline assembly.
 *
 * src_ptr    - first source byte read for output pixel 0 of the first row.
 *              Each group of 16 outputs reads source bytes 0..24 relative to
 *              its group start (loads at offsets 0..21, four bytes each).
 * src_stride - distance in bytes between consecutive source rows.
 * dst_ptr    - first destination byte of the first row.
 * dst_stride - distance in bytes between consecutive destination rows.
 * filter_x0  - eight 16-bit taps, read here as four 32-bit words
 *              (NOTE(review): presumably requires 4-byte alignment - confirm).
 * h          - number of rows to process.
 *
 * Every row is handled as four groups of 16 output pixels.  Inside a group
 * the eight even outputs (dst offsets 0, 2, ..., 14) are produced first and
 * the eight odd outputs (dst offsets 1, 3, ..., 15) second.  The three DSP
 * accumulators $ac1..$ac3 are used in rotation so that the accumulator for a
 * later pixel is seeded while an earlier pixel is still being extracted,
 * clamped and stored.
 *
 * Each result is extracted with "extp", used as an index into the
 * vpx_ff_cropTbl lookup table ("lbux") and stored as a single byte ("sb").
 * The extraction position used by "extp" is taken from the DSP control
 * register, which vpx_convolve8_horiz_dspr2() programs with "wrdsp" before
 * calling this function.
 */
567static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
568                                    int32_t src_stride,
569                                    uint8_t *dst_ptr,
570                                    int32_t dst_stride,
571                                    const int16_t *filter_x0,
572                                    int32_t h) {
573  int32_t y, c;
574  const uint8_t *src;
575  uint8_t *dst;
576  uint8_t *cm = vpx_ff_cropTbl;
  /* Value loaded into the low word of every accumulator before the taps are
   * applied (NOTE(review): presumably the rounding term - confirm). */
577  uint32_t vector_64 = 64;
578  int32_t filter12, filter34, filter56, filter78;
579  int32_t Temp1, Temp2, Temp3;
580  uint32_t qload1, qload2, qload3;
581  uint32_t p1, p2, p3, p4, p5;
582  uint32_t st1, st2, st3;
583
  /* Fetch the taps two at a time: each 32-bit word holds a pair of 16-bit
   * coefficients, matching the paired-halfword operands of dpa.w.ph. */
584  filter12 = ((const int32_t *)filter_x0)[0];
585  filter34 = ((const int32_t *)filter_x0)[1];
586  filter56 = ((const int32_t *)filter_x0)[2];
587  filter78 = ((const int32_t *)filter_x0)[3];
588
589  for (y = h; y--;) {
590    src = src_ptr;
591    dst = dst_ptr;
592
593    /* prefetch the next row's source and destination data to cache memory */
594    prefetch_load(src_ptr + src_stride);
595    prefetch_load(src_ptr + src_stride + 32);
596    prefetch_load(src_ptr + src_stride + 64);
597    prefetch_store(dst_ptr + dst_stride);
598    prefetch_store(dst_ptr + dst_stride + 32);
599
    /* four groups of 16 output pixels make up the 64-pixel row */
600    for (c = 0; c < 4; c++) {
601      __asm__ __volatile__ (
602          "ulw              %[qload1],    0(%[src])                    \n\t"
603          "ulw              %[qload2],    4(%[src])                    \n\t"
604
605          /* even 1. pixel */
606          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
607          "mthi             $zero,        $ac1                         \n\t"
608          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
609          "mthi             $zero,        $ac2                         \n\t"
610          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
611          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
612          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
613          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
614          "ulw              %[qload3],    8(%[src])                    \n\t"
615          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
616          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
617          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
618          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
619          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
620
621          /* even 2. pixel */
622          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
623          "mthi             $zero,        $ac3                         \n\t"
624          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
625          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
626          "ulw              %[qload1],    12(%[src])                   \n\t"
627          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 2 */
628          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 2 */
629          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 2 */
630          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 2 */
631          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
632          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
633
634          /* even 3. pixel */
635          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
636          "mthi             $zero,        $ac1                         \n\t"
637          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
638          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
639          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
640          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
641          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
642          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
643          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
644          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
645
646          /* even 4. pixel */
647          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
648          "mthi             $zero,        $ac2                         \n\t"
649          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
650          "sb               %[st2],       2(%[dst])                    \n\t" /* even 2 */
651          "ulw              %[qload2],    16(%[src])                   \n\t"
652          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
653          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
654          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
655          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
656          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
657          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
658
659          /* even 5. pixel */
660          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
661          "mthi             $zero,        $ac3                         \n\t"
662          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
663          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
664          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
665          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
666          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
667          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
668          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
669          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
670
671          /* even 6. pixel */
672          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
673          "mthi             $zero,        $ac1                         \n\t"
674          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
675          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
676          "ulw              %[qload3],    20(%[src])                   \n\t"
677          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
678          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
679          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
680          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
681          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
682          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
683
684          /* even 7. pixel */
685          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
686          "mthi             $zero,        $ac2                         \n\t"
687          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
688          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
689          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
690          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
691          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
692          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
693          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
694          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
695
696          /* even 8. pixel */
697          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
698          "mthi             $zero,        $ac3                         \n\t"
699          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
700          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
701          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
702          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
703          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
704          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
705          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
706
707          /* ODD pixels: same sequence, source loads shifted by one byte */
708          "ulw              %[qload1],    1(%[src])                    \n\t"
709          "ulw              %[qload2],    5(%[src])                    \n\t"
710
711          /* odd 1. pixel */
712          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
713          "mthi             $zero,        $ac1                         \n\t"
714          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
715          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
716          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
717          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
718          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
719          "ulw              %[qload3],    9(%[src])                    \n\t"
720          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
721          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
722          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
723          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
724          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
725          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
726
727          /* odd 2. pixel */
728          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
729          "mthi             $zero,        $ac2                         \n\t"
730          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
731          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
732          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
733          "ulw              %[qload1],    13(%[src])                   \n\t"
734          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
735          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
736          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
737          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
738          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
739          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
740
741          /* odd 3. pixel */
742          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
743          "mthi             $zero,        $ac3                         \n\t"
744          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
745          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
746          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
747          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
748          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
749          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
750          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
751          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
752
753          /* odd 4. pixel */
754          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
755          "mthi             $zero,        $ac1                         \n\t"
756          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
757          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
758          "ulw              %[qload2],    17(%[src])                   \n\t"
759          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
760          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
761          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
762          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
763          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
764          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
765
766          /* odd 5. pixel */
767          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
768          "mthi             $zero,        $ac2                         \n\t"
769          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
770          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
771          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
772          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
773          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
774          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
775          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
776          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
777
778          /* odd 6. pixel */
779          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
780          "mthi             $zero,        $ac3                         \n\t"
781          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
782          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
783          "ulw              %[qload3],    21(%[src])                   \n\t"
784          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
785          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
786          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
787          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
788          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
789          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
790
791          /* odd 7. pixel */
792          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
793          "mthi             $zero,        $ac1                         \n\t"
794          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
795          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
796          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
797          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
798          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
799          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
800          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
801
802          /* odd 8. pixel */
803          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
804          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
805          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
806          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
807          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
808
          /* drain the pipeline: look up and store the last three odd pixels */
809          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
810          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
811          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
812
813          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
814          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
815          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
816
          /* outputs: every scratch register is early-clobber ("=&r") so it is
           * never allocated on top of one of the input operands below */
817          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
818            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
819            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
820            [p5] "=&r" (p5),
821            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
822          : [filter12] "r" (filter12), [filter34] "r" (filter34),
823            [filter56] "r" (filter56), [filter78] "r" (filter78),
824            [vector_64] "r" (vector_64),
825            [cm] "r" (cm), [dst] "r" (dst),
826            [src] "r" (src)
827      );
828
      /* advance to the next group of 16 pixels within the row */
829      src += 16;
830      dst += 16;
831    }
832
833    /* Next row... */
834    src_ptr += src_stride;
835    dst_ptr += dst_stride;
836  }
837}
838
/*
 * DSPr2 entry point for the horizontal 8-tap convolution.
 *
 * Only unscaled filtering is expected here: x_step_q4 is asserted to be 16,
 * and the second 32-bit word of the filter is asserted not to be 0x800000.
 * When the first 32-bit word of the filter (its first two taps) is zero, the
 * request is forwarded unchanged to vpx_convolve2_horiz_dspr2().  Otherwise
 * widths 4, 8, 16, 32 and 64 are served by the DSPr2 kernels in this file and
 * every other width is served by vpx_convolve8_horiz_c().
 */
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  /* The 16-bit taps, viewed two at a time as 32-bit words. */
  const int32_t *const tap_pairs = (const int32_t *)filter_x;
  const int32_t in_stride = (int32_t)src_stride;
  const int32_t out_stride = (int32_t)dst_stride;
  const int32_t rows = (int32_t)h;
  const uint8_t *kernel_src;
  uint32_t extract_pos = 38;

  assert(x_step_q4 == 16);
  assert(tap_pairs[1] != 0x800000);

  /* First two taps are zero: hand over to the convolve2 implementation. */
  if (tap_pairs[0] == 0) {
    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
    return;
  }

  prefetch_load((const uint8_t *)filter_x);

  /* The DSPr2 kernels are given a source pointer three bytes before src;
   * the C fallback below receives src itself. */
  kernel_src = src - 3;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  /* prefetch data to cache memory */
  prefetch_load(kernel_src);
  prefetch_load(kernel_src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_horiz_4_dspr2(kernel_src, in_stride, dst, out_stride,
                             filter_x, rows);
      break;
    case 8:
      convolve_horiz_8_dspr2(kernel_src, in_stride, dst, out_stride,
                             filter_x, rows);
      break;
    case 16:
    case 32:
      /* The 16-wide kernel takes the number of 16-pixel groups per row:
       * 1 for w == 16, 2 for w == 32. */
      convolve_horiz_16_dspr2(kernel_src, in_stride, dst, out_stride,
                              filter_x, rows, w / 16);
      break;
    case 64:
      /* A 64-pixel row spans more data, so warm the cache a little further. */
      prefetch_load(kernel_src + 64);
      prefetch_store(dst + 32);

      convolve_horiz_64_dspr2(kernel_src, in_stride, dst, out_stride,
                              filter_x, rows);
      break;
    default:
      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
      break;
  }
}
910#endif
911