1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/mips/convolve_common_dspr2.h"
16#include "vpx_dsp/vpx_dsp_common.h"
17#include "vpx_dsp/vpx_filter.h"
18#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
21static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
22                                   uint8_t *dst, int32_t dst_stride,
23                                   const int16_t *filter_x0, int32_t h) {
24  int32_t y;
25  uint8_t *cm = vpx_ff_cropTbl;
26  int32_t vector1b, vector2b, vector3b, vector4b;
27  int32_t Temp1, Temp2, Temp3, Temp4;
28  uint32_t vector4a = 64;
29  uint32_t tp1, tp2;
30  uint32_t p1, p2, p3, p4;
31  uint32_t n1, n2, n3, n4;
32  uint32_t tn1, tn2;
33
34  vector1b = ((const int32_t *)filter_x0)[0];
35  vector2b = ((const int32_t *)filter_x0)[1];
36  vector3b = ((const int32_t *)filter_x0)[2];
37  vector4b = ((const int32_t *)filter_x0)[3];
38
39  for (y = h; y--;) {
40    /* prefetch data to cache memory */
41    prefetch_load(src + src_stride);
42    prefetch_load(src + src_stride + 32);
43    prefetch_store(dst + dst_stride);
44
45    __asm__ __volatile__(
46        "ulw              %[tp1],      0(%[src])                      \n\t"
47        "ulw              %[tp2],      4(%[src])                      \n\t"
48
49        /* even 1. pixel */
50        "mtlo             %[vector4a], $ac3                           \n\t"
51        "mthi             $zero,       $ac3                           \n\t"
52        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
53        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
54        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
55        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
56        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
57        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
58        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
59        "ulw              %[tn2],      8(%[src])                      \n\t"
60        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
61        "extp             %[Temp1],    $ac3,           31             \n\t"
62
63        /* even 2. pixel */
64        "mtlo             %[vector4a], $ac2                           \n\t"
65        "mthi             $zero,       $ac2                           \n\t"
66        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
67        "balign           %[tn1],      %[tn2],         3              \n\t"
68        "balign           %[tn2],      %[tp2],         3              \n\t"
69        "balign           %[tp2],      %[tp1],         3              \n\t"
70        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
71        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
72        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
73        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
74        "extp             %[Temp3],    $ac2,           31             \n\t"
75
76        /* odd 1. pixel */
77        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
78        "mtlo             %[vector4a], $ac3                           \n\t"
79        "mthi             $zero,       $ac3                           \n\t"
80        "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
81        "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
82        "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
83        "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
84        "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
85        "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
86        "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
87        "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
88        "extp             %[Temp2],    $ac3,           31             \n\t"
89
90        /* odd 2. pixel */
91        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
92        "mtlo             %[vector4a], $ac2                           \n\t"
93        "mthi             $zero,       $ac2                           \n\t"
94        "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
95        "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
96        "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
97        "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
98        "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
99        "extp             %[Temp4],    $ac2,           31             \n\t"
100
101        /* clamp */
102        "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
103        "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
104
105        /* store bytes */
106        "sb               %[tp1],      0(%[dst])                      \n\t"
107        "sb               %[tn1],      1(%[dst])                      \n\t"
108        "sb               %[tp2],      2(%[dst])                      \n\t"
109        "sb               %[n2],       3(%[dst])                      \n\t"
110
111        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
112          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
113          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
114          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
115          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
116        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
117          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
118          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
119          [src] "r"(src));
120
121    /* Next row... */
122    src += src_stride;
123    dst += dst_stride;
124  }
125}
126
127static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
128                                   uint8_t *dst, int32_t dst_stride,
129                                   const int16_t *filter_x0, int32_t h) {
130  int32_t y;
131  uint8_t *cm = vpx_ff_cropTbl;
132  uint32_t vector4a = 64;
133  int32_t vector1b, vector2b, vector3b, vector4b;
134  int32_t Temp1, Temp2, Temp3;
135  uint32_t tp1, tp2;
136  uint32_t p1, p2, p3, p4, n1;
137  uint32_t tn1, tn2, tn3;
138  uint32_t st0, st1;
139
140  vector1b = ((const int32_t *)filter_x0)[0];
141  vector2b = ((const int32_t *)filter_x0)[1];
142  vector3b = ((const int32_t *)filter_x0)[2];
143  vector4b = ((const int32_t *)filter_x0)[3];
144
145  for (y = h; y--;) {
146    /* prefetch data to cache memory */
147    prefetch_load(src + src_stride);
148    prefetch_load(src + src_stride + 32);
149    prefetch_store(dst + dst_stride);
150
151    __asm__ __volatile__(
152        "ulw              %[tp1],      0(%[src])                      \n\t"
153        "ulw              %[tp2],      4(%[src])                      \n\t"
154
155        /* even 1. pixel */
156        "mtlo             %[vector4a], $ac3                           \n\t"
157        "mthi             $zero,       $ac3                           \n\t"
158        "mtlo             %[vector4a], $ac2                           \n\t"
159        "mthi             $zero,       $ac2                           \n\t"
160        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
161        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
162        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
163        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
164        "ulw              %[tn2],      8(%[src])                      \n\t"
165        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
166        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
167        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
168        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
169        "extp             %[Temp1],    $ac3,           31             \n\t"
170
171        /* even 2. pixel */
172        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
173        "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
174        "ulw              %[tn1],      12(%[src])                     \n\t"
175        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
176        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
177        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
178        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
179        "extp             %[Temp3],    $ac2,           31             \n\t"
180
181        /* even 3. pixel */
182        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
183        "mtlo             %[vector4a], $ac1                           \n\t"
184        "mthi             $zero,       $ac1                           \n\t"
185        "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
186        "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
187        "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
188        "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
189        "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
190        "extp             %[Temp1],    $ac1,           31             \n\t"
191
192        /* even 4. pixel */
193        "mtlo             %[vector4a], $ac2                           \n\t"
194        "mthi             $zero,       $ac2                           \n\t"
195        "mtlo             %[vector4a], $ac3                           \n\t"
196        "mthi             $zero,       $ac3                           \n\t"
197        "sb               %[st0],      0(%[dst])                      \n\t"
198        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
199
200        "balign           %[tn3],      %[tn1],         3              \n\t"
201        "balign           %[tn1],      %[tn2],         3              \n\t"
202        "balign           %[tn2],      %[tp2],         3              \n\t"
203        "balign           %[tp2],      %[tp1],         3              \n\t"
204
205        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
206        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
207        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
208        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
209        "extp             %[Temp3],    $ac2,           31             \n\t"
210
211        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
212
213        /* odd 1. pixel */
214        "mtlo             %[vector4a], $ac1                           \n\t"
215        "mthi             $zero,       $ac1                           \n\t"
216        "sb               %[st1],      2(%[dst])                      \n\t"
217        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
218        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
219        "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
220        "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
221        "sb               %[st0],      4(%[dst])                      \n\t"
222        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
223        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
224        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
225        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
226        "extp             %[Temp2],    $ac3,           31             \n\t"
227
228        /* odd 2. pixel */
229        "mtlo             %[vector4a], $ac3                           \n\t"
230        "mthi             $zero,       $ac3                           \n\t"
231        "mtlo             %[vector4a], $ac2                           \n\t"
232        "mthi             $zero,       $ac2                           \n\t"
233        "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
234        "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
235        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
236        "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
237        "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
238        "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
239        "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
240        "extp             %[Temp3],    $ac1,           31             \n\t"
241
242        /* odd 3. pixel */
243        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
244        "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
245        "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
246        "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
247        "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
248        "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
249        "extp             %[Temp2],    $ac3,           31             \n\t"
250
251        /* odd 4. pixel */
252        "sb               %[st1],      1(%[dst])                      \n\t"
253        "sb               %[st0],      6(%[dst])                      \n\t"
254        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
255        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
256        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
257        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
258        "extp             %[Temp1],    $ac2,           31             \n\t"
259
260        /* clamp */
261        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
262        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
263        "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
264
265        /* store bytes */
266        "sb               %[p4],       3(%[dst])                      \n\t"
267        "sb               %[p2],       5(%[dst])                      \n\t"
268        "sb               %[n1],       7(%[dst])                      \n\t"
269
270        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
271          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
272          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
273          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
274          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
275        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
276          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
277          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
278          [src] "r"(src));
279
280    /* Next row... */
281    src += src_stride;
282    dst += dst_stride;
283  }
284}
285
286static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
287                                    uint8_t *dst_ptr, int32_t dst_stride,
288                                    const int16_t *filter_x0, int32_t h,
289                                    int32_t count) {
290  int32_t y, c;
291  const uint8_t *src;
292  uint8_t *dst;
293  uint8_t *cm = vpx_ff_cropTbl;
294  uint32_t vector_64 = 64;
295  int32_t filter12, filter34, filter56, filter78;
296  int32_t Temp1, Temp2, Temp3;
297  uint32_t qload1, qload2, qload3;
298  uint32_t p1, p2, p3, p4, p5;
299  uint32_t st1, st2, st3;
300
301  filter12 = ((const int32_t *)filter_x0)[0];
302  filter34 = ((const int32_t *)filter_x0)[1];
303  filter56 = ((const int32_t *)filter_x0)[2];
304  filter78 = ((const int32_t *)filter_x0)[3];
305
306  for (y = h; y--;) {
307    src = src_ptr;
308    dst = dst_ptr;
309
310    /* prefetch data to cache memory */
311    prefetch_load(src_ptr + src_stride);
312    prefetch_load(src_ptr + src_stride + 32);
313    prefetch_store(dst_ptr + dst_stride);
314
315    for (c = 0; c < count; c++) {
316      __asm__ __volatile__(
317          "ulw              %[qload1],    0(%[src])                    \n\t"
318          "ulw              %[qload2],    4(%[src])                    \n\t"
319
320          /* even 1. pixel */
321          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
322          "mthi             $zero,        $ac1                         \n\t"
323          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
324          "mthi             $zero,        $ac2                         \n\t"
325          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
326          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
327          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
328          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
329          "ulw              %[qload3],    8(%[src])                    \n\t"
330          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
331          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
332          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
333          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
334          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
335
336          /* even 2. pixel */
337          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
338          "mthi             $zero,        $ac3                         \n\t"
339          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
340          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
341          "ulw              %[qload1],    12(%[src])                   \n\t"
342          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
343          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
344          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
345          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
346          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
347          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
348
349          /* even 3. pixel */
350          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
351          "mthi             $zero,        $ac1                         \n\t"
352          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
353          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
354          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
355          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
356          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
357          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
358          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
359          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
360
361          /* even 4. pixel */
362          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
363          "mthi             $zero,        $ac2                         \n\t"
364          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
365          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
366          "ulw              %[qload2],    16(%[src])                   \n\t"
367          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
368          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
369          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
370          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
371          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
372          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
373
374          /* even 5. pixel */
375          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
376          "mthi             $zero,        $ac3                         \n\t"
377          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
378          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
379          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
380          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
381          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
382          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
383          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
384          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
385
386          /* even 6. pixel */
387          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
388          "mthi             $zero,        $ac1                         \n\t"
389          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
390          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
391          "ulw              %[qload3],    20(%[src])                   \n\t"
392          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
393          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
394          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
395          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
396          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
397          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
398
399          /* even 7. pixel */
400          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
401          "mthi             $zero,        $ac2                         \n\t"
402          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
403          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
404          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
405          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
406          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
407          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
408          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
409          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
410
411          /* even 8. pixel */
412          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
413          "mthi             $zero,        $ac3                         \n\t"
414          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
415          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
416          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
417          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
418          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
419          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
420          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
421
422          /* ODD pixels */
423          "ulw              %[qload1],    1(%[src])                    \n\t"
424          "ulw              %[qload2],    5(%[src])                    \n\t"
425
426          /* odd 1. pixel */
427          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
428          "mthi             $zero,        $ac1                         \n\t"
429          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
430          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
431          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
432          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
433          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
434          "ulw              %[qload3],    9(%[src])                    \n\t"
435          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
436          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
437          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
438          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
439          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
440          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
441
442          /* odd 2. pixel */
443          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
444          "mthi             $zero,        $ac2                         \n\t"
445          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
446          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
447          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
448          "ulw              %[qload1],    13(%[src])                   \n\t"
449          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
450          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
451          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
452          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
453          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
454          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
455
456          /* odd 3. pixel */
457          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
458          "mthi             $zero,        $ac3                         \n\t"
459          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
460          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
461          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
462          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
463          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
464          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
465          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
466          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
467
468          /* odd 4. pixel */
469          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
470          "mthi             $zero,        $ac1                         \n\t"
471          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
472          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
473          "ulw              %[qload2],    17(%[src])                   \n\t"
474          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
475          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
476          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
477          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
478          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
479          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
480
481          /* odd 5. pixel */
482          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
483          "mthi             $zero,        $ac2                         \n\t"
484          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
485          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
486          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
487          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
488          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
489          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
490          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
491          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
492
493          /* odd 6. pixel */
494          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
495          "mthi             $zero,        $ac3                         \n\t"
496          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
497          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
498          "ulw              %[qload3],    21(%[src])                   \n\t"
499          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
500          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
501          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
502          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
503          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
504          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
505
506          /* odd 7. pixel */
507          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
508          "mthi             $zero,        $ac1                         \n\t"
509          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
510          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
511          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
512          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
513          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
514          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
515          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
516
517          /* odd 8. pixel */
518          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
519          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
520          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
521          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
522          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
523
524          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
525          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
526          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
527
528          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
529          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
530          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
531
532          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
533            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
534            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
535            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
536            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
537          : [filter12] "r"(filter12), [filter34] "r"(filter34),
538            [filter56] "r"(filter56), [filter78] "r"(filter78),
539            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
540            [src] "r"(src));
541
542      src += 16;
543      dst += 16;
544    }
545
546    /* Next row... */
547    src_ptr += src_stride;
548    dst_ptr += dst_stride;
549  }
550}
551
552static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
553                                    uint8_t *dst_ptr, int32_t dst_stride,
554                                    const int16_t *filter_x0, int32_t h) {
555  int32_t y, c;
556  const uint8_t *src;
557  uint8_t *dst;
558  uint8_t *cm = vpx_ff_cropTbl;
559  uint32_t vector_64 = 64;
560  int32_t filter12, filter34, filter56, filter78;
561  int32_t Temp1, Temp2, Temp3;
562  uint32_t qload1, qload2, qload3;
563  uint32_t p1, p2, p3, p4, p5;
564  uint32_t st1, st2, st3;
565
566  filter12 = ((const int32_t *)filter_x0)[0];
567  filter34 = ((const int32_t *)filter_x0)[1];
568  filter56 = ((const int32_t *)filter_x0)[2];
569  filter78 = ((const int32_t *)filter_x0)[3];
570
571  for (y = h; y--;) {
572    src = src_ptr;
573    dst = dst_ptr;
574
575    /* prefetch data to cache memory */
576    prefetch_load(src_ptr + src_stride);
577    prefetch_load(src_ptr + src_stride + 32);
578    prefetch_load(src_ptr + src_stride + 64);
579    prefetch_store(dst_ptr + dst_stride);
580    prefetch_store(dst_ptr + dst_stride + 32);
581
582    for (c = 0; c < 4; c++) {
583      __asm__ __volatile__(
584          "ulw              %[qload1],    0(%[src])                    \n\t"
585          "ulw              %[qload2],    4(%[src])                    \n\t"
586
587          /* even 1. pixel */
588          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
589          "mthi             $zero,        $ac1                         \n\t"
590          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
591          "mthi             $zero,        $ac2                         \n\t"
592          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
593          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
594          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
595          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
596          "ulw              %[qload3],    8(%[src])                    \n\t"
597          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
598          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
599          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
600          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
601          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
602
603          /* even 2. pixel */
604          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
605          "mthi             $zero,        $ac3                         \n\t"
606          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
607          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
608          "ulw              %[qload1],    12(%[src])                   \n\t"
609          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
610          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
611          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
612          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
613          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
614          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
615
616          /* even 3. pixel */
617          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
618          "mthi             $zero,        $ac1                         \n\t"
619          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
620          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
621          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
622          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
623          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
624          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
625          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
626          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
627
628          /* even 4. pixel */
629          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
630          "mthi             $zero,        $ac2                         \n\t"
631          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
632          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
633          "ulw              %[qload2],    16(%[src])                   \n\t"
634          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
635          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
636          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
637          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
638          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
639          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
640
641          /* even 5. pixel */
642          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
643          "mthi             $zero,        $ac3                         \n\t"
644          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
645          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
646          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
647          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
648          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
649          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
650          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
651          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
652
653          /* even 6. pixel */
654          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
655          "mthi             $zero,        $ac1                         \n\t"
656          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
657          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
658          "ulw              %[qload3],    20(%[src])                   \n\t"
659          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
660          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
661          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
662          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
663          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
664          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
665
666          /* even 7. pixel */
667          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
668          "mthi             $zero,        $ac2                         \n\t"
669          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
670          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
671          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
672          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
673          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
674          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
675          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
676          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
677
678          /* even 8. pixel */
679          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
680          "mthi             $zero,        $ac3                         \n\t"
681          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
682          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
683          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
684          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
685          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
686          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
687          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
688
689          /* ODD pixels */
690          "ulw              %[qload1],    1(%[src])                    \n\t"
691          "ulw              %[qload2],    5(%[src])                    \n\t"
692
693          /* odd 1. pixel */
694          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
695          "mthi             $zero,        $ac1                         \n\t"
696          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
697          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
698          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
699          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
700          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
701          "ulw              %[qload3],    9(%[src])                    \n\t"
702          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
703          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
704          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
705          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
706          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
707          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
708
709          /* odd 2. pixel */
710          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
711          "mthi             $zero,        $ac2                         \n\t"
712          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
713          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
714          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
715          "ulw              %[qload1],    13(%[src])                   \n\t"
716          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
717          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
718          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
719          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
720          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
721          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
722
723          /* odd 3. pixel */
724          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
725          "mthi             $zero,        $ac3                         \n\t"
726          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
727          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
728          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
729          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
730          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
731          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
732          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
733          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
734
735          /* odd 4. pixel */
736          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
737          "mthi             $zero,        $ac1                         \n\t"
738          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
739          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
740          "ulw              %[qload2],    17(%[src])                   \n\t"
741          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
742          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
743          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
744          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
745          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
746          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
747
748          /* odd 5. pixel */
749          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
750          "mthi             $zero,        $ac2                         \n\t"
751          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
752          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
753          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
754          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
755          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
756          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
757          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
758          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
759
760          /* odd 6. pixel */
761          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
762          "mthi             $zero,        $ac3                         \n\t"
763          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
764          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
765          "ulw              %[qload3],    21(%[src])                   \n\t"
766          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
767          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
768          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
769          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
770          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
771          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
772
773          /* odd 7. pixel */
774          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
775          "mthi             $zero,        $ac1                         \n\t"
776          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
777          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
778          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
779          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
780          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
781          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
782          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
783
784          /* odd 8. pixel */
785          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
786          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
787          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
788          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
789          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
790
791          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
792          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
793          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
794
795          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
796          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
797          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
798
799          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
800            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
801            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
802            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
803            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
804          : [filter12] "r"(filter12), [filter34] "r"(filter34),
805            [filter56] "r"(filter56), [filter78] "r"(filter78),
806            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
807            [src] "r"(src));
808
809      src += 16;
810      dst += 16;
811    }
812
813    /* Next row... */
814    src_ptr += src_stride;
815    dst_ptr += dst_stride;
816  }
817}
818
819void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
820                               uint8_t *dst, ptrdiff_t dst_stride,
821                               const int16_t *filter_x, int x_step_q4,
822                               const int16_t *filter_y, int y_step_q4, int w,
823                               int h) {
824  assert(x_step_q4 == 16);
825  assert(((const int32_t *)filter_x)[1] != 0x800000);
826
827  if (((const int32_t *)filter_x)[0] == 0) {
828    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
829                              x_step_q4, filter_y, y_step_q4, w, h);
830  } else {
831    uint32_t pos = 38;
832
833    prefetch_load((const uint8_t *)filter_x);
834    src -= 3;
835
836    /* bit positon for extract from acc */
837    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
838                         :
839                         : [pos] "r"(pos));
840
841    /* prefetch data to cache memory */
842    prefetch_load(src);
843    prefetch_load(src + 32);
844    prefetch_store(dst);
845
846    switch (w) {
847      case 4:
848        convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
849                               (int32_t)dst_stride, filter_x, (int32_t)h);
850        break;
851      case 8:
852        convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
853                               (int32_t)dst_stride, filter_x, (int32_t)h);
854        break;
855      case 16:
856        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
857                                (int32_t)dst_stride, filter_x, (int32_t)h, 1);
858        break;
859      case 32:
860        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
861                                (int32_t)dst_stride, filter_x, (int32_t)h, 2);
862        break;
863      case 64:
864        prefetch_load(src + 64);
865        prefetch_store(dst + 32);
866
867        convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
868                                (int32_t)dst_stride, filter_x, (int32_t)h);
869        break;
870      default:
871        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
872                              x_step_q4, filter_y, y_step_q4, w, h);
873        break;
874    }
875  }
876}
877#endif
878