1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
21static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
22                                      uint8_t *dst, int32_t dst_stride,
23                                      const int16_t *filter_x0, int32_t h) {
24  int32_t y;
25  uint8_t *cm = vpx_ff_cropTbl;
26  int32_t Temp1, Temp2, Temp3, Temp4;
27  uint32_t vector4a = 64;
28  uint32_t tp1, tp2;
29  uint32_t p1, p2;
30  const int16_t *filter = &filter_x0[3];
31  uint32_t filter45;
32
33  filter45 = ((const int32_t *)filter)[0];
34
35  for (y = h; y--;) {
36    /* prefetch data to cache memory */
37    prefetch_load(src + src_stride);
38    prefetch_load(src + src_stride + 32);
39    prefetch_store(dst + dst_stride);
40
41    __asm__ __volatile__(
42        "ulw              %[tp1],      0(%[src])                      \n\t"
43        "ulw              %[tp2],      4(%[src])                      \n\t"
44
45        /* even 1. pixel */
46        "mtlo             %[vector4a], $ac3                           \n\t"
47        "mthi             $zero,       $ac3                           \n\t"
48        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
49        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
50        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
51        "extp             %[Temp1],    $ac3,           31             \n\t"
52
53        /* even 2. pixel */
54        "mtlo             %[vector4a], $ac2                           \n\t"
55        "mthi             $zero,       $ac2                           \n\t"
56        "balign           %[tp2],      %[tp1],         3              \n\t"
57        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
58        "extp             %[Temp3],    $ac2,           31             \n\t"
59
60        /* odd 1. pixel */
61        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
62        "mtlo             %[vector4a], $ac3                           \n\t"
63        "mthi             $zero,       $ac3                           \n\t"
64        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
65        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
66        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
67        "extp             %[Temp2],    $ac3,           31             \n\t"
68
69        /* odd 2. pixel */
70        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
71        "mtlo             %[vector4a], $ac2                           \n\t"
72        "mthi             $zero,       $ac2                           \n\t"
73        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
74        "extp             %[Temp4],    $ac2,           31             \n\t"
75
76        /* clamp */
77        "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
78        "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
79
80        /* store bytes */
81        "sb               %[tp1],      0(%[dst])                      \n\t"
82        "sb               %[p1],       1(%[dst])                      \n\t"
83        "sb               %[tp2],      2(%[dst])                      \n\t"
84        "sb               %[p2],       3(%[dst])                      \n\t"
85
86        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
87          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
88          [Temp4] "=&r"(Temp4)
89        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
90          [dst] "r"(dst), [src] "r"(src));
91
92    /* Next row... */
93    src += src_stride;
94    dst += dst_stride;
95  }
96}
97
98static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
99                                      uint8_t *dst, int32_t dst_stride,
100                                      const int16_t *filter_x0, int32_t h) {
101  int32_t y;
102  uint8_t *cm = vpx_ff_cropTbl;
103  uint32_t vector4a = 64;
104  int32_t Temp1, Temp2, Temp3;
105  uint32_t tp1, tp2, tp3;
106  uint32_t p1, p2, p3, p4;
107  uint32_t st0, st1;
108  const int16_t *filter = &filter_x0[3];
109  uint32_t filter45;
110
111  filter45 = ((const int32_t *)filter)[0];
112
113  for (y = h; y--;) {
114    /* prefetch data to cache memory */
115    prefetch_load(src + src_stride);
116    prefetch_load(src + src_stride + 32);
117    prefetch_store(dst + dst_stride);
118
119    __asm__ __volatile__(
120        "ulw              %[tp1],      0(%[src])                      \n\t"
121        "ulw              %[tp2],      4(%[src])                      \n\t"
122
123        /* even 1. pixel */
124        "mtlo             %[vector4a], $ac3                           \n\t"
125        "mthi             $zero,       $ac3                           \n\t"
126        "mtlo             %[vector4a], $ac2                           \n\t"
127        "mthi             $zero,       $ac2                           \n\t"
128        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
129        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
130        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
131        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
132        "ulw              %[tp3],      8(%[src])                      \n\t"
133        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
134        "extp             %[Temp1],    $ac3,           31             \n\t"
135
136        /* even 2. pixel */
137        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
138        "extp             %[Temp3],    $ac2,           31             \n\t"
139
140        /* even 3. pixel */
141        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
142        "mtlo             %[vector4a], $ac1                           \n\t"
143        "mthi             $zero,       $ac1                           \n\t"
144        "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
145        "extp             %[Temp1],    $ac1,           31             \n\t"
146
147        /* even 4. pixel */
148        "mtlo             %[vector4a], $ac2                           \n\t"
149        "mthi             $zero,       $ac2                           \n\t"
150        "mtlo             %[vector4a], $ac3                           \n\t"
151        "mthi             $zero,       $ac3                           \n\t"
152        "sb               %[st0],      0(%[dst])                      \n\t"
153        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
154
155        "balign           %[tp3],      %[tp2],         3              \n\t"
156        "balign           %[tp2],      %[tp1],         3              \n\t"
157
158        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
159        "extp             %[Temp3],    $ac2,           31             \n\t"
160
161        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
162
163        /* odd 1. pixel */
164        "mtlo             %[vector4a], $ac1                           \n\t"
165        "mthi             $zero,       $ac1                           \n\t"
166        "sb               %[st1],      2(%[dst])                      \n\t"
167        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
168        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
169        "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
170        "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
171        "sb               %[st0],      4(%[dst])                      \n\t"
172        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
173        "extp             %[Temp2],    $ac3,           31             \n\t"
174
175        /* odd 2. pixel */
176        "mtlo             %[vector4a], $ac3                           \n\t"
177        "mthi             $zero,       $ac3                           \n\t"
178        "mtlo             %[vector4a], $ac2                           \n\t"
179        "mthi             $zero,       $ac2                           \n\t"
180        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
181        "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
182        "extp             %[Temp3],    $ac1,           31             \n\t"
183
184        /* odd 3. pixel */
185        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
186        "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
187        "extp             %[Temp2],    $ac3,           31             \n\t"
188
189        /* odd 4. pixel */
190        "sb               %[st1],      1(%[dst])                      \n\t"
191        "sb               %[st0],      6(%[dst])                      \n\t"
192        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
193        "extp             %[Temp1],    $ac2,           31             \n\t"
194
195        /* clamp */
196        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
197        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
198        "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
199
200        /* store bytes */
201        "sb               %[p4],       3(%[dst])                      \n\t"
202        "sb               %[p2],       5(%[dst])                      \n\t"
203        "sb               %[p1],       7(%[dst])                      \n\t"
204
205        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
206          [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
207          [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
208          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
209        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
210          [dst] "r"(dst), [src] "r"(src));
211
212    /* Next row... */
213    src += src_stride;
214    dst += dst_stride;
215  }
216}
217
218static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
219                                       int32_t src_stride, uint8_t *dst_ptr,
220                                       int32_t dst_stride,
221                                       const int16_t *filter_x0, int32_t h,
222                                       int32_t count) {
223  int32_t y, c;
224  const uint8_t *src;
225  uint8_t *dst;
226  uint8_t *cm = vpx_ff_cropTbl;
227  uint32_t vector_64 = 64;
228  int32_t Temp1, Temp2, Temp3;
229  uint32_t qload1, qload2, qload3;
230  uint32_t p1, p2, p3, p4, p5;
231  uint32_t st1, st2, st3;
232  const int16_t *filter = &filter_x0[3];
233  uint32_t filter45;
234
235  filter45 = ((const int32_t *)filter)[0];
236
237  for (y = h; y--;) {
238    src = src_ptr;
239    dst = dst_ptr;
240
241    /* prefetch data to cache memory */
242    prefetch_load(src_ptr + src_stride);
243    prefetch_load(src_ptr + src_stride + 32);
244    prefetch_store(dst_ptr + dst_stride);
245
246    for (c = 0; c < count; c++) {
247      __asm__ __volatile__(
248          "ulw              %[qload1],    0(%[src])                    \n\t"
249          "ulw              %[qload2],    4(%[src])                    \n\t"
250
251          /* even 1. pixel */
252          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
253          "mthi             $zero,        $ac1                         \n\t"
254          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
255          "mthi             $zero,        $ac2                         \n\t"
256          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
257          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
258          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
259          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
260          "ulw              %[qload3],    8(%[src])                    \n\t"
261          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
262          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
263
264          /* even 2. pixel */
265          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
266          "mthi             $zero,        $ac3                         \n\t"
267          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
268          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
269          "ulw              %[qload1],    12(%[src])                   \n\t"
270          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
271          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
272          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
273
274          /* even 3. pixel */
275          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
276          "mthi             $zero,        $ac1                         \n\t"
277          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
278          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
279          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
280          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
281          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
282
283          /* even 4. pixel */
284          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
285          "mthi             $zero,        $ac2                         \n\t"
286          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
287          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
288          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
289          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
290          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
291
292          /* even 5. pixel */
293          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
294          "mthi             $zero,        $ac3                         \n\t"
295          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
296          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
297          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
298          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
299
300          /* even 6. pixel */
301          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
302          "mthi             $zero,        $ac1                         \n\t"
303          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
304          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
305          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
306          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
307
308          /* even 7. pixel */
309          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
310          "mthi             $zero,        $ac2                         \n\t"
311          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
312          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
313          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
314          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
315
316          /* even 8. pixel */
317          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
318          "mthi             $zero,        $ac3                         \n\t"
319          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
320          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
321          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
322          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
323
324          /* ODD pixels */
325          "ulw              %[qload1],    1(%[src])                    \n\t"
326          "ulw              %[qload2],    5(%[src])                    \n\t"
327
328          /* odd 1. pixel */
329          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
330          "mthi             $zero,        $ac1                         \n\t"
331          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
332          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
333          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
334          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
335          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
336          "ulw              %[qload3],    9(%[src])                    \n\t"
337          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
338          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
339          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
340
341          /* odd 2. pixel */
342          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
343          "mthi             $zero,        $ac2                         \n\t"
344          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
345          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
346          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
347          "ulw              %[qload1],    13(%[src])                   \n\t"
348          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
349          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
350          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
351
352          /* odd 3. pixel */
353          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
354          "mthi             $zero,        $ac3                         \n\t"
355          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
356          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
357          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
358          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
359          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
360
361          /* odd 4. pixel */
362          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
363          "mthi             $zero,        $ac1                         \n\t"
364          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
365          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
366          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
367          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
368          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
369
370          /* odd 5. pixel */
371          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
372          "mthi             $zero,        $ac2                         \n\t"
373          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
374          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
375          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
376          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
377
378          /* odd 6. pixel */
379          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
380          "mthi             $zero,        $ac3                         \n\t"
381          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
382          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
383          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
384          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
385
386          /* odd 7. pixel */
387          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
388          "mthi             $zero,        $ac1                         \n\t"
389          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
390          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
391          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
392
393          /* odd 8. pixel */
394          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
395          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
396
397          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
398          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
399          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
400
401          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
402          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
403          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
404
405          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
406            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
407            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
408            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
409            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
410          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
411            [dst] "r"(dst), [src] "r"(src));
412
413      src += 16;
414      dst += 16;
415    }
416
417    /* Next row... */
418    src_ptr += src_stride;
419    dst_ptr += dst_stride;
420  }
421}
422
423static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
424                                       int32_t src_stride, uint8_t *dst_ptr,
425                                       int32_t dst_stride,
426                                       const int16_t *filter_x0, int32_t h) {
427  int32_t y, c;
428  const uint8_t *src;
429  uint8_t *dst;
430  uint8_t *cm = vpx_ff_cropTbl;
431  uint32_t vector_64 = 64;
432  int32_t Temp1, Temp2, Temp3;
433  uint32_t qload1, qload2, qload3;
434  uint32_t p1, p2, p3, p4, p5;
435  uint32_t st1, st2, st3;
436  const int16_t *filter = &filter_x0[3];
437  uint32_t filter45;
438
439  filter45 = ((const int32_t *)filter)[0];
440
441  for (y = h; y--;) {
442    src = src_ptr;
443    dst = dst_ptr;
444
445    /* prefetch data to cache memory */
446    prefetch_load(src_ptr + src_stride);
447    prefetch_load(src_ptr + src_stride + 32);
448    prefetch_load(src_ptr + src_stride + 64);
449    prefetch_store(dst_ptr + dst_stride);
450    prefetch_store(dst_ptr + dst_stride + 32);
451
452    for (c = 0; c < 4; c++) {
453      __asm__ __volatile__(
454          "ulw              %[qload1],    0(%[src])                    \n\t"
455          "ulw              %[qload2],    4(%[src])                    \n\t"
456
457          /* even 1. pixel */
458          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
459          "mthi             $zero,        $ac1                         \n\t"
460          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
461          "mthi             $zero,        $ac2                         \n\t"
462          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
463          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
464          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
465          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
466          "ulw              %[qload3],    8(%[src])                    \n\t"
467          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
468          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
469
470          /* even 2. pixel */
471          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
472          "mthi             $zero,        $ac3                         \n\t"
473          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
474          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
475          "ulw              %[qload1],    12(%[src])                   \n\t"
476          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
477          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
478          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
479
480          /* even 3. pixel */
481          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
482          "mthi             $zero,        $ac1                         \n\t"
483          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
484          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
485          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
486          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
487          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
488
489          /* even 4. pixel */
490          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
491          "mthi             $zero,        $ac2                         \n\t"
492          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
493          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
494          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
495          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
496          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
497
498          /* even 5. pixel */
499          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
500          "mthi             $zero,        $ac3                         \n\t"
501          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
502          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
503          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
504          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
505
506          /* even 6. pixel */
507          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
508          "mthi             $zero,        $ac1                         \n\t"
509          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
510          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
511          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
512          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
513
514          /* even 7. pixel */
515          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
516          "mthi             $zero,        $ac2                         \n\t"
517          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
518          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
519          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
520          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
521
522          /* even 8. pixel */
523          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
524          "mthi             $zero,        $ac3                         \n\t"
525          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
526          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
527          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
528          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
529
530          /* ODD pixels */
531          "ulw              %[qload1],    1(%[src])                    \n\t"
532          "ulw              %[qload2],    5(%[src])                    \n\t"
533
534          /* odd 1. pixel */
535          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
536          "mthi             $zero,        $ac1                         \n\t"
537          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
538          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
539          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
540          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
541          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
542          "ulw              %[qload3],    9(%[src])                    \n\t"
543          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
544          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
545          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
546
547          /* odd 2. pixel */
548          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
549          "mthi             $zero,        $ac2                         \n\t"
550          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
551          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
552          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
553          "ulw              %[qload1],    13(%[src])                   \n\t"
554          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
555          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
556          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
557
558          /* odd 3. pixel */
559          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
560          "mthi             $zero,        $ac3                         \n\t"
561          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
562          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
563          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
564          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
565          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
566
567          /* odd 4. pixel */
568          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
569          "mthi             $zero,        $ac1                         \n\t"
570          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
571          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
572          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
573          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
574          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
575
576          /* odd 5. pixel */
577          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
578          "mthi             $zero,        $ac2                         \n\t"
579          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
580          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
581          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
582          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
583
584          /* odd 6. pixel */
585          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
586          "mthi             $zero,        $ac3                         \n\t"
587          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
588          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
589          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
590          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
591
592          /* odd 7. pixel */
593          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
594          "mthi             $zero,        $ac1                         \n\t"
595          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
596          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
597          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
598
599          /* odd 8. pixel */
600          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
601          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
602
603          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
604          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
605          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
606
607          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
608          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
609          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
610
611          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
612            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
613            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
614            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
615            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
616          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
617            [dst] "r"(dst), [src] "r"(src));
618
619      src += 16;
620      dst += 16;
621    }
622
623    /* Next row... */
624    src_ptr += src_stride;
625    dst_ptr += dst_stride;
626  }
627}
628
629void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
630                               uint8_t *dst, ptrdiff_t dst_stride,
631                               const InterpKernel *filter, int x0_q4,
632                               int32_t x_step_q4, int y0_q4, int y_step_q4,
633                               int w, int h) {
634  const int16_t *const filter_x = filter[x0_q4];
635  uint32_t pos = 38;
636
637  assert(x_step_q4 == 16);
638
639  prefetch_load((const uint8_t *)filter_x);
640
641  /* bit positon for extract from acc */
642  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
643                       :
644                       : [pos] "r"(pos));
645
646  /* prefetch data to cache memory */
647  prefetch_load(src);
648  prefetch_load(src + 32);
649  prefetch_store(dst);
650
651  switch (w) {
652    case 4:
653      convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
654                                (int32_t)dst_stride, filter_x, (int32_t)h);
655      break;
656    case 8:
657      convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
658                                (int32_t)dst_stride, filter_x, (int32_t)h);
659      break;
660    case 16:
661      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
662                                 (int32_t)dst_stride, filter_x, (int32_t)h, 1);
663      break;
664    case 32:
665      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
666                                 (int32_t)dst_stride, filter_x, (int32_t)h, 2);
667      break;
668    case 64:
669      prefetch_load(src + 64);
670      prefetch_store(dst + 32);
671
672      convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
673                                 (int32_t)dst_stride, filter_x, (int32_t)h);
674      break;
675    default:
676      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
677                            x_step_q4, y0_q4, y_step_q4, w, h);
678      break;
679  }
680}
681#endif
682