1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_config.h"
15#include "./vp9_rtcd.h"
16#include "vp9/common/vp9_common.h"
17#include "vpx/vpx_integer.h"
18#include "vpx_ports/mem.h"
19#include "vp9/common/vp9_convolve.h"
20#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21
22#if HAVE_DSPR2
23static void convolve_horiz_4_dspr2(const uint8_t *src,
24                                   int32_t src_stride,
25                                   uint8_t *dst,
26                                   int32_t dst_stride,
27                                   const int16_t *filter_x0,
28                                   int32_t h) {
29  int32_t y;
30  uint8_t *cm = vp9_ff_cropTbl;
31  int32_t vector1b, vector2b, vector3b, vector4b;
32  int32_t Temp1, Temp2, Temp3, Temp4;
33  uint32_t vector4a = 64;
34  uint32_t tp1, tp2;
35  uint32_t p1, p2, p3, p4;
36  uint32_t n1, n2, n3, n4;
37  uint32_t tn1, tn2;
38
39  vector1b = ((const int32_t *)filter_x0)[0];
40  vector2b = ((const int32_t *)filter_x0)[1];
41  vector3b = ((const int32_t *)filter_x0)[2];
42  vector4b = ((const int32_t *)filter_x0)[3];
43
44  for (y = h; y--;) {
45    /* prefetch data to cache memory */
46    vp9_prefetch_load(src + src_stride);
47    vp9_prefetch_load(src + src_stride + 32);
48    vp9_prefetch_store(dst + dst_stride);
49
50    __asm__ __volatile__ (
51        "ulw              %[tp1],      0(%[src])                      \n\t"
52        "ulw              %[tp2],      4(%[src])                      \n\t"
53
54        /* even 1. pixel */
55        "mtlo             %[vector4a], $ac3                           \n\t"
56        "mthi             $zero,       $ac3                           \n\t"
57        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
58        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
59        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
60        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
61        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
62        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
63        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
64        "ulw              %[tn2],      8(%[src])                      \n\t"
65        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
66        "extp             %[Temp1],    $ac3,           31             \n\t"
67
68        /* even 2. pixel */
69        "mtlo             %[vector4a], $ac2                           \n\t"
70        "mthi             $zero,       $ac2                           \n\t"
71        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
72        "balign           %[tn1],      %[tn2],         3              \n\t"
73        "balign           %[tn2],      %[tp2],         3              \n\t"
74        "balign           %[tp2],      %[tp1],         3              \n\t"
75        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
76        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
77        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
78        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
79        "extp             %[Temp3],    $ac2,           31             \n\t"
80
81        /* odd 1. pixel */
82        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
83        "mtlo             %[vector4a], $ac3                           \n\t"
84        "mthi             $zero,       $ac3                           \n\t"
85        "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
86        "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
87        "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
88        "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
89        "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
90        "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
91        "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
92        "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
93        "extp             %[Temp2],    $ac3,           31             \n\t"
94
95        /* odd 2. pixel */
96        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
97        "mtlo             %[vector4a], $ac2                           \n\t"
98        "mthi             $zero,       $ac2                           \n\t"
99        "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
100        "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
101        "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
102        "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
103        "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
104        "extp             %[Temp4],    $ac2,           31             \n\t"
105
106        /* clamp */
107        "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
108        "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
109
110        /* store bytes */
111        "sb               %[tp1],      0(%[dst])                      \n\t"
112        "sb               %[tn1],      1(%[dst])                      \n\t"
113        "sb               %[tp2],      2(%[dst])                      \n\t"
114        "sb               %[n2],       3(%[dst])                      \n\t"
115
116        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
117          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
118          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
119          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
120          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
121          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
122        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
123          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
124          [vector4a] "r" (vector4a),
125          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
126    );
127
128    /* Next row... */
129    src += src_stride;
130    dst += dst_stride;
131  }
132}
133
134static void convolve_horiz_8_dspr2(const uint8_t *src,
135                                   int32_t src_stride,
136                                   uint8_t *dst,
137                                   int32_t dst_stride,
138                                   const int16_t *filter_x0,
139                                   int32_t h) {
140  int32_t y;
141  uint8_t *cm = vp9_ff_cropTbl;
142  uint32_t vector4a = 64;
143  int32_t vector1b, vector2b, vector3b, vector4b;
144  int32_t Temp1, Temp2, Temp3;
145  uint32_t tp1, tp2;
146  uint32_t p1, p2, p3, p4, n1;
147  uint32_t tn1, tn2, tn3;
148  uint32_t st0, st1;
149
150  vector1b = ((const int32_t *)filter_x0)[0];
151  vector2b = ((const int32_t *)filter_x0)[1];
152  vector3b = ((const int32_t *)filter_x0)[2];
153  vector4b = ((const int32_t *)filter_x0)[3];
154
155  for (y = h; y--;) {
156    /* prefetch data to cache memory */
157    vp9_prefetch_load(src + src_stride);
158    vp9_prefetch_load(src + src_stride + 32);
159    vp9_prefetch_store(dst + dst_stride);
160
161    __asm__ __volatile__ (
162        "ulw              %[tp1],      0(%[src])                      \n\t"
163        "ulw              %[tp2],      4(%[src])                      \n\t"
164
165        /* even 1. pixel */
166        "mtlo             %[vector4a], $ac3                           \n\t"
167        "mthi             $zero,       $ac3                           \n\t"
168        "mtlo             %[vector4a], $ac2                           \n\t"
169        "mthi             $zero,       $ac2                           \n\t"
170        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
171        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
172        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
173        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
174        "ulw              %[tn2],      8(%[src])                      \n\t"
175        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
176        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
177        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
178        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
179        "extp             %[Temp1],    $ac3,           31             \n\t"
180
181        /* even 2. pixel */
182        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
183        "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
184        "ulw              %[tn1],      12(%[src])                     \n\t"
185        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
186        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
187        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
188        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
189        "extp             %[Temp3],    $ac2,           31             \n\t"
190
191        /* even 3. pixel */
192        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
193        "mtlo             %[vector4a], $ac1                           \n\t"
194        "mthi             $zero,       $ac1                           \n\t"
195        "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
196        "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
197        "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
198        "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
199        "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
200        "extp             %[Temp1],    $ac1,           31             \n\t"
201
202        /* even 4. pixel */
203        "mtlo             %[vector4a], $ac2                           \n\t"
204        "mthi             $zero,       $ac2                           \n\t"
205        "mtlo             %[vector4a], $ac3                           \n\t"
206        "mthi             $zero,       $ac3                           \n\t"
207        "sb               %[st0],      0(%[dst])                      \n\t"
208        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
209
210        "balign           %[tn3],      %[tn1],         3              \n\t"
211        "balign           %[tn1],      %[tn2],         3              \n\t"
212        "balign           %[tn2],      %[tp2],         3              \n\t"
213        "balign           %[tp2],      %[tp1],         3              \n\t"
214
215        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
216        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
217        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
218        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
219        "extp             %[Temp3],    $ac2,           31             \n\t"
220
221        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
222
223        /* odd 1. pixel */
224        "mtlo             %[vector4a], $ac1                           \n\t"
225        "mthi             $zero,       $ac1                           \n\t"
226        "sb               %[st1],      2(%[dst])                      \n\t"
227        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
228        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
229        "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
230        "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
231        "sb               %[st0],      4(%[dst])                      \n\t"
232        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
233        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
234        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
235        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
236        "extp             %[Temp2],    $ac3,           31             \n\t"
237
238        /* odd 2. pixel */
239        "mtlo             %[vector4a], $ac3                           \n\t"
240        "mthi             $zero,       $ac3                           \n\t"
241        "mtlo             %[vector4a], $ac2                           \n\t"
242        "mthi             $zero,       $ac2                           \n\t"
243        "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
244        "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
245        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
246        "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
247        "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
248        "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
249        "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
250        "extp             %[Temp3],    $ac1,           31             \n\t"
251
252        /* odd 3. pixel */
253        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
254        "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
255        "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
256        "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
257        "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
258        "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
259        "extp             %[Temp2],    $ac3,           31             \n\t"
260
261        /* odd 4. pixel */
262        "sb               %[st1],      1(%[dst])                      \n\t"
263        "sb               %[st0],      6(%[dst])                      \n\t"
264        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
265        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
266        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
267        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
268        "extp             %[Temp1],    $ac2,           31             \n\t"
269
270        /* clamp */
271        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
272        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
273        "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
274
275        /* store bytes */
276        "sb               %[p4],       3(%[dst])                      \n\t"
277        "sb               %[p2],       5(%[dst])                      \n\t"
278        "sb               %[n1],       7(%[dst])                      \n\t"
279
280        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
281          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
282          [st0] "=&r" (st0), [st1] "=&r" (st1),
283          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
284          [n1] "=&r" (n1),
285          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
286        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
287          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
288          [vector4a] "r" (vector4a),
289          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
290    );
291
292    /* Next row... */
293    src += src_stride;
294    dst += dst_stride;
295  }
296}
297
298static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
299                                    int32_t src_stride,
300                                    uint8_t *dst_ptr,
301                                    int32_t dst_stride,
302                                    const int16_t *filter_x0,
303                                    int32_t h,
304                                    int32_t count) {
305  int32_t y, c;
306  const uint8_t *src;
307  uint8_t *dst;
308  uint8_t *cm = vp9_ff_cropTbl;
309  uint32_t vector_64 = 64;
310  int32_t filter12, filter34, filter56, filter78;
311  int32_t Temp1, Temp2, Temp3;
312  uint32_t qload1, qload2, qload3;
313  uint32_t p1, p2, p3, p4, p5;
314  uint32_t st1, st2, st3;
315
316  filter12 = ((const int32_t *)filter_x0)[0];
317  filter34 = ((const int32_t *)filter_x0)[1];
318  filter56 = ((const int32_t *)filter_x0)[2];
319  filter78 = ((const int32_t *)filter_x0)[3];
320
321  for (y = h; y--;) {
322    src = src_ptr;
323    dst = dst_ptr;
324
325    /* prefetch data to cache memory */
326    vp9_prefetch_load(src_ptr + src_stride);
327    vp9_prefetch_load(src_ptr + src_stride + 32);
328    vp9_prefetch_store(dst_ptr + dst_stride);
329
330    for (c = 0; c < count; c++) {
331      __asm__ __volatile__ (
332          "ulw              %[qload1],    0(%[src])                    \n\t"
333          "ulw              %[qload2],    4(%[src])                    \n\t"
334
335          /* even 1. pixel */
336          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
337          "mthi             $zero,        $ac1                         \n\t"
338          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
339          "mthi             $zero,        $ac2                         \n\t"
340          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
341          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
342          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
343          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
344          "ulw              %[qload3],    8(%[src])                    \n\t"
345          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
346          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
347          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
348          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
349          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
350
351          /* even 2. pixel */
352          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
353          "mthi             $zero,        $ac3                         \n\t"
354          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
355          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
356          "ulw              %[qload1],    12(%[src])                   \n\t"
357          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
358          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
359          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
360          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
361          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
362          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
363
364          /* even 3. pixel */
365          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
366          "mthi             $zero,        $ac1                         \n\t"
367          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
368          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
369          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
370          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
371          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
372          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
373          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
374          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
375
376          /* even 4. pixel */
377          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
378          "mthi             $zero,        $ac2                         \n\t"
379          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
380          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
381          "ulw              %[qload2],    16(%[src])                   \n\t"
382          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
383          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
384          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
385          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
386          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
387          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
388
389          /* even 5. pixel */
390          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
391          "mthi             $zero,        $ac3                         \n\t"
392          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
393          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
394          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
395          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
396          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
397          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
398          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
399          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
400
401          /* even 6. pixel */
402          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
403          "mthi             $zero,        $ac1                         \n\t"
404          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
405          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
406          "ulw              %[qload3],    20(%[src])                   \n\t"
407          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
408          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
409          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
410          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
411          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
412          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
413
414          /* even 7. pixel */
415          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
416          "mthi             $zero,        $ac2                         \n\t"
417          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
418          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
419          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
420          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
421          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
422          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
423          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
424          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
425
426          /* even 8. pixel */
427          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
428          "mthi             $zero,        $ac3                         \n\t"
429          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
430          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
431          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
432          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
433          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
434          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
435          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
436
437          /* ODD pixels */
438          "ulw              %[qload1],    1(%[src])                    \n\t"
439          "ulw              %[qload2],    5(%[src])                    \n\t"
440
441          /* odd 1. pixel */
442          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
443          "mthi             $zero,        $ac1                         \n\t"
444          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
445          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
446          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
447          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
448          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
449          "ulw              %[qload3],    9(%[src])                    \n\t"
450          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
451          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
452          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
453          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
454          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
455          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
456
457          /* odd 2. pixel */
458          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
459          "mthi             $zero,        $ac2                         \n\t"
460          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
461          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
462          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
463          "ulw              %[qload1],    13(%[src])                   \n\t"
464          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
465          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
466          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
467          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
468          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
469          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
470
471          /* odd 3. pixel */
472          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
473          "mthi             $zero,        $ac3                         \n\t"
474          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
475          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
476          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
477          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
478          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
479          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
480          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
481          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
482
483          /* odd 4. pixel */
484          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
485          "mthi             $zero,        $ac1                         \n\t"
486          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
487          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
488          "ulw              %[qload2],    17(%[src])                   \n\t"
489          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
490          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
491          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
492          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
493          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
494          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
495
496          /* odd 5. pixel */
497          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
498          "mthi             $zero,        $ac2                         \n\t"
499          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
500          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
501          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
502          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
503          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
504          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
505          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
506          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
507
508          /* odd 6. pixel */
509          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
510          "mthi             $zero,        $ac3                         \n\t"
511          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
512          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
513          "ulw              %[qload3],    21(%[src])                   \n\t"
514          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
515          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
516          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
517          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
518          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
519          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
520
521          /* odd 7. pixel */
522          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
523          "mthi             $zero,        $ac1                         \n\t"
524          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
525          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
526          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
527          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
528          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
529          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
530          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
531
532          /* odd 8. pixel */
533          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
534          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
535          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
536          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
537          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
538
539          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
540          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
541          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
542
543          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
544          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
545          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
546
547          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
548            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
549            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
550            [p5] "=&r" (p5),
551            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
552          : [filter12] "r" (filter12), [filter34] "r" (filter34),
553            [filter56] "r" (filter56), [filter78] "r" (filter78),
554            [vector_64] "r" (vector_64),
555            [cm] "r" (cm), [dst] "r" (dst),
556            [src] "r" (src)
557      );
558
559      src += 16;
560      dst += 16;
561    }
562
563    /* Next row... */
564    src_ptr += src_stride;
565    dst_ptr += dst_stride;
566  }
567}
568
569static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
570                                    int32_t src_stride,
571                                    uint8_t *dst_ptr,
572                                    int32_t dst_stride,
573                                    const int16_t *filter_x0,
574                                    int32_t h) {
575  int32_t y, c;
576  const uint8_t *src;
577  uint8_t *dst;
578  uint8_t *cm = vp9_ff_cropTbl;
579  uint32_t vector_64 = 64;
580  int32_t filter12, filter34, filter56, filter78;
581  int32_t Temp1, Temp2, Temp3;
582  uint32_t qload1, qload2, qload3;
583  uint32_t p1, p2, p3, p4, p5;
584  uint32_t st1, st2, st3;
585
586  filter12 = ((const int32_t *)filter_x0)[0];
587  filter34 = ((const int32_t *)filter_x0)[1];
588  filter56 = ((const int32_t *)filter_x0)[2];
589  filter78 = ((const int32_t *)filter_x0)[3];
590
591  for (y = h; y--;) {
592    src = src_ptr;
593    dst = dst_ptr;
594
595    /* prefetch data to cache memory */
596    vp9_prefetch_load(src_ptr + src_stride);
597    vp9_prefetch_load(src_ptr + src_stride + 32);
598    vp9_prefetch_load(src_ptr + src_stride + 64);
599    vp9_prefetch_store(dst_ptr + dst_stride);
600    vp9_prefetch_store(dst_ptr + dst_stride + 32);
601
602    for (c = 0; c < 4; c++) {
603      __asm__ __volatile__ (
604          "ulw              %[qload1],    0(%[src])                    \n\t"
605          "ulw              %[qload2],    4(%[src])                    \n\t"
606
607          /* even 1. pixel */
608          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
609          "mthi             $zero,        $ac1                         \n\t"
610          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
611          "mthi             $zero,        $ac2                         \n\t"
612          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
613          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
614          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
615          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
616          "ulw              %[qload3],    8(%[src])                    \n\t"
617          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
618          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
619          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
620          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
621          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
622
623          /* even 2. pixel */
624          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
625          "mthi             $zero,        $ac3                         \n\t"
626          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
627          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
628          "ulw              %[qload1],    12(%[src])                   \n\t"
629          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
630          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
631          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
632          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
633          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
634          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
635
636          /* even 3. pixel */
637          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
638          "mthi             $zero,        $ac1                         \n\t"
639          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
640          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
641          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
642          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
643          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
644          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
645          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
646          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
647
648          /* even 4. pixel */
649          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
650          "mthi             $zero,        $ac2                         \n\t"
651          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
652          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
653          "ulw              %[qload2],    16(%[src])                   \n\t"
654          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
655          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
656          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
657          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
658          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
659          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
660
661          /* even 5. pixel */
662          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
663          "mthi             $zero,        $ac3                         \n\t"
664          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
665          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
666          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
667          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
668          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
669          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
670          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
671          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
672
673          /* even 6. pixel */
674          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
675          "mthi             $zero,        $ac1                         \n\t"
676          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
677          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
678          "ulw              %[qload3],    20(%[src])                   \n\t"
679          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
680          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
681          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
682          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
683          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
684          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
685
686          /* even 7. pixel */
687          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
688          "mthi             $zero,        $ac2                         \n\t"
689          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
690          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
691          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
692          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
693          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
694          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
695          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
696          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
697
698          /* even 8. pixel */
699          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
700          "mthi             $zero,        $ac3                         \n\t"
701          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
702          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
703          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
704          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
705          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
706          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
707          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
708
709          /* ODD pixels */
710          "ulw              %[qload1],    1(%[src])                    \n\t"
711          "ulw              %[qload2],    5(%[src])                    \n\t"
712
713          /* odd 1. pixel */
714          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
715          "mthi             $zero,        $ac1                         \n\t"
716          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
717          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
718          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
719          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
720          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
721          "ulw              %[qload3],    9(%[src])                    \n\t"
722          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
723          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
724          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
725          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
726          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
727          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
728
729          /* odd 2. pixel */
730          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
731          "mthi             $zero,        $ac2                         \n\t"
732          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
733          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
734          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
735          "ulw              %[qload1],    13(%[src])                   \n\t"
736          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
737          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
738          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
739          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
740          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
741          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
742
743          /* odd 3. pixel */
744          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
745          "mthi             $zero,        $ac3                         \n\t"
746          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
747          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
748          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
749          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
750          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
751          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
752          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
753          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
754
755          /* odd 4. pixel */
756          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
757          "mthi             $zero,        $ac1                         \n\t"
758          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
759          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
760          "ulw              %[qload2],    17(%[src])                   \n\t"
761          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
762          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
763          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
764          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
765          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
766          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
767
768          /* odd 5. pixel */
769          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
770          "mthi             $zero,        $ac2                         \n\t"
771          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
772          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
773          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
774          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
775          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
776          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
777          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
778          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
779
780          /* odd 6. pixel */
781          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
782          "mthi             $zero,        $ac3                         \n\t"
783          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
784          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
785          "ulw              %[qload3],    21(%[src])                   \n\t"
786          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
787          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
788          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
789          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
790          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
791          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
792
793          /* odd 7. pixel */
794          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
795          "mthi             $zero,        $ac1                         \n\t"
796          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
797          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
798          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
799          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
800          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
801          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
802          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
803
804          /* odd 8. pixel */
805          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
806          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
807          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
808          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
809          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
810
811          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
812          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
813          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
814
815          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
816          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
817          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
818
819          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
820            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
821            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
822            [p5] "=&r" (p5),
823            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
824          : [filter12] "r" (filter12), [filter34] "r" (filter34),
825            [filter56] "r" (filter56), [filter78] "r" (filter78),
826            [vector_64] "r" (vector_64),
827            [cm] "r" (cm), [dst] "r" (dst),
828            [src] "r" (src)
829      );
830
831      src += 16;
832      dst += 16;
833    }
834
835    /* Next row... */
836    src_ptr += src_stride;
837    dst_ptr += dst_stride;
838  }
839}
840
841void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
842                               uint8_t *dst, ptrdiff_t dst_stride,
843                               const int16_t *filter_x, int x_step_q4,
844                               const int16_t *filter_y, int y_step_q4,
845                               int w, int h) {
846  if (((const int32_t *)filter_x)[1] == 0x800000) {
847    vp9_convolve_copy(src, src_stride,
848                      dst, dst_stride,
849                      filter_x, x_step_q4,
850                      filter_y, y_step_q4,
851                      w, h);
852  } else if (((const int32_t *)filter_x)[0] == 0) {
853    vp9_convolve2_horiz_dspr2(src, src_stride,
854                              dst, dst_stride,
855                              filter_x, x_step_q4,
856                              filter_y, y_step_q4,
857                              w, h);
858  } else {
859    if (16 == x_step_q4) {
860      uint32_t pos = 38;
861
862      vp9_prefetch_load((const uint8_t *)filter_x);
863      src -= 3;
864
865      /* bit positon for extract from acc */
866      __asm__ __volatile__ (
867        "wrdsp      %[pos],     1           \n\t"
868        :
869        : [pos] "r" (pos)
870      );
871
872      /* prefetch data to cache memory */
873      vp9_prefetch_load(src);
874      vp9_prefetch_load(src + 32);
875      vp9_prefetch_store(dst);
876
877      switch (w) {
878        case 4:
879          convolve_horiz_4_dspr2(src, (int32_t)src_stride,
880                                 dst, (int32_t)dst_stride,
881                                 filter_x, (int32_t)h);
882          break;
883        case 8:
884          convolve_horiz_8_dspr2(src, (int32_t)src_stride,
885                                 dst, (int32_t)dst_stride,
886                                 filter_x, (int32_t)h);
887          break;
888        case 16:
889          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
890                                  dst, (int32_t)dst_stride,
891                                  filter_x, (int32_t)h, 1);
892          break;
893        case 32:
894          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
895                                  dst, (int32_t)dst_stride,
896                                  filter_x, (int32_t)h, 2);
897          break;
898        case 64:
899          vp9_prefetch_load(src + 64);
900          vp9_prefetch_store(dst + 32);
901
902          convolve_horiz_64_dspr2(src, (int32_t)src_stride,
903                                  dst, (int32_t)dst_stride,
904                                  filter_x, (int32_t)h);
905          break;
906        default:
907          vp9_convolve8_horiz_c(src + 3, src_stride,
908                                dst, dst_stride,
909                                filter_x, x_step_q4,
910                                filter_y, y_step_q4,
911                                w, h);
912          break;
913      }
914    } else {
915      vp9_convolve8_horiz_c(src, src_stride,
916                            dst, dst_stride,
917                            filter_x, x_step_q4,
918                            filter_y, y_step_q4,
919                            w, h);
920    }
921  }
922}
923#endif
924