1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/mips/convolve_common_dspr2.h"
16#include "vpx_dsp/vpx_convolve.h"
17#include "vpx_dsp/vpx_dsp_common.h"
18#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
21static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
22                                      int32_t src_stride,
23                                      uint8_t *dst,
24                                      int32_t dst_stride,
25                                      const int16_t *filter_x0,
26                                      int32_t h) {
27  int32_t y;
28  uint8_t *cm = vpx_ff_cropTbl;
29  int32_t Temp1, Temp2, Temp3, Temp4;
30  uint32_t vector4a = 64;
31  uint32_t tp1, tp2;
32  uint32_t p1, p2;
33  const int16_t *filter = &filter_x0[3];
34  uint32_t filter45;;
35
36  filter45 = ((const int32_t *)filter)[0];
37
38  for (y = h; y--;) {
39    /* prefetch data to cache memory */
40    prefetch_load(src + src_stride);
41    prefetch_load(src + src_stride + 32);
42    prefetch_store(dst + dst_stride);
43
44    __asm__ __volatile__ (
45        "ulw              %[tp1],      0(%[src])                      \n\t"
46        "ulw              %[tp2],      4(%[src])                      \n\t"
47
48        /* even 1. pixel */
49        "mtlo             %[vector4a], $ac3                           \n\t"
50        "mthi             $zero,       $ac3                           \n\t"
51        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
52        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
53        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
54        "extp             %[Temp1],    $ac3,           31             \n\t"
55
56        /* even 2. pixel */
57        "mtlo             %[vector4a], $ac2                           \n\t"
58        "mthi             $zero,       $ac2                           \n\t"
59        "balign           %[tp2],      %[tp1],         3              \n\t"
60        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
61        "extp             %[Temp3],    $ac2,           31             \n\t"
62
63        /* odd 1. pixel */
64        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
65        "mtlo             %[vector4a], $ac3                           \n\t"
66        "mthi             $zero,       $ac3                           \n\t"
67        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
68        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
69        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
70        "extp             %[Temp2],    $ac3,           31             \n\t"
71
72        /* odd 2. pixel */
73        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
74        "mtlo             %[vector4a], $ac2                           \n\t"
75        "mthi             $zero,       $ac2                           \n\t"
76        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
77        "extp             %[Temp4],    $ac2,           31             \n\t"
78
79        /* clamp */
80        "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
81        "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
82
83        /* store bytes */
84        "sb               %[tp1],      0(%[dst])                      \n\t"
85        "sb               %[p1],       1(%[dst])                      \n\t"
86        "sb               %[tp2],      2(%[dst])                      \n\t"
87        "sb               %[p2],       3(%[dst])                      \n\t"
88
89        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
90          [p1] "=&r" (p1), [p2] "=&r" (p2),
91          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
92          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
93        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
94          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
95    );
96
97    /* Next row... */
98    src += src_stride;
99    dst += dst_stride;
100  }
101}
102
103static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
104                                      int32_t src_stride,
105                                      uint8_t *dst,
106                                      int32_t dst_stride,
107                                      const int16_t *filter_x0,
108                                      int32_t h) {
109  int32_t y;
110  uint8_t *cm = vpx_ff_cropTbl;
111  uint32_t vector4a = 64;
112  int32_t Temp1, Temp2, Temp3;
113  uint32_t tp1, tp2, tp3;
114  uint32_t p1, p2, p3, p4;
115  uint32_t st0, st1;
116  const int16_t *filter = &filter_x0[3];
117  uint32_t filter45;;
118
119  filter45 = ((const int32_t *)filter)[0];
120
121  for (y = h; y--;) {
122    /* prefetch data to cache memory */
123    prefetch_load(src + src_stride);
124    prefetch_load(src + src_stride + 32);
125    prefetch_store(dst + dst_stride);
126
127    __asm__ __volatile__ (
128        "ulw              %[tp1],      0(%[src])                      \n\t"
129        "ulw              %[tp2],      4(%[src])                      \n\t"
130
131        /* even 1. pixel */
132        "mtlo             %[vector4a], $ac3                           \n\t"
133        "mthi             $zero,       $ac3                           \n\t"
134        "mtlo             %[vector4a], $ac2                           \n\t"
135        "mthi             $zero,       $ac2                           \n\t"
136        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
137        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
138        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
139        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
140        "ulw              %[tp3],      8(%[src])                      \n\t"
141        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
142        "extp             %[Temp1],    $ac3,           31             \n\t"
143
144        /* even 2. pixel */
145        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
146        "extp             %[Temp3],    $ac2,           31             \n\t"
147
148        /* even 3. pixel */
149        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
150        "mtlo             %[vector4a], $ac1                           \n\t"
151        "mthi             $zero,       $ac1                           \n\t"
152        "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
153        "extp             %[Temp1],    $ac1,           31             \n\t"
154
155        /* even 4. pixel */
156        "mtlo             %[vector4a], $ac2                           \n\t"
157        "mthi             $zero,       $ac2                           \n\t"
158        "mtlo             %[vector4a], $ac3                           \n\t"
159        "mthi             $zero,       $ac3                           \n\t"
160        "sb               %[st0],      0(%[dst])                      \n\t"
161        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
162
163        "balign           %[tp3],      %[tp2],         3              \n\t"
164        "balign           %[tp2],      %[tp1],         3              \n\t"
165
166        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
167        "extp             %[Temp3],    $ac2,           31             \n\t"
168
169        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
170
171        /* odd 1. pixel */
172        "mtlo             %[vector4a], $ac1                           \n\t"
173        "mthi             $zero,       $ac1                           \n\t"
174        "sb               %[st1],      2(%[dst])                      \n\t"
175        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
176        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
177        "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
178        "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
179        "sb               %[st0],      4(%[dst])                      \n\t"
180        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
181        "extp             %[Temp2],    $ac3,           31             \n\t"
182
183        /* odd 2. pixel */
184        "mtlo             %[vector4a], $ac3                           \n\t"
185        "mthi             $zero,       $ac3                           \n\t"
186        "mtlo             %[vector4a], $ac2                           \n\t"
187        "mthi             $zero,       $ac2                           \n\t"
188        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
189        "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
190        "extp             %[Temp3],    $ac1,           31             \n\t"
191
192        /* odd 3. pixel */
193        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
194        "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
195        "extp             %[Temp2],    $ac3,           31             \n\t"
196
197        /* odd 4. pixel */
198        "sb               %[st1],      1(%[dst])                      \n\t"
199        "sb               %[st0],      6(%[dst])                      \n\t"
200        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
201        "extp             %[Temp1],    $ac2,           31             \n\t"
202
203        /* clamp */
204        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
205        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
206        "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
207
208        /* store bytes */
209        "sb               %[p4],       3(%[dst])                      \n\t"
210        "sb               %[p2],       5(%[dst])                      \n\t"
211        "sb               %[p1],       7(%[dst])                      \n\t"
212
213        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
214          [st0] "=&r" (st0), [st1] "=&r" (st1),
215          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
216          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
217        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
218          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
219    );
220
221    /* Next row... */
222    src += src_stride;
223    dst += dst_stride;
224  }
225}
226
227static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
228                                       int32_t src_stride,
229                                       uint8_t *dst_ptr,
230                                       int32_t dst_stride,
231                                       const int16_t *filter_x0,
232                                       int32_t h,
233                                       int32_t count) {
234  int32_t y, c;
235  const uint8_t *src;
236  uint8_t *dst;
237  uint8_t *cm = vpx_ff_cropTbl;
238  uint32_t vector_64 = 64;
239  int32_t Temp1, Temp2, Temp3;
240  uint32_t qload1, qload2, qload3;
241  uint32_t p1, p2, p3, p4, p5;
242  uint32_t st1, st2, st3;
243  const int16_t *filter = &filter_x0[3];
244  uint32_t filter45;;
245
246  filter45 = ((const int32_t *)filter)[0];
247
248  for (y = h; y--;) {
249    src = src_ptr;
250    dst = dst_ptr;
251
252    /* prefetch data to cache memory */
253    prefetch_load(src_ptr + src_stride);
254    prefetch_load(src_ptr + src_stride + 32);
255    prefetch_store(dst_ptr + dst_stride);
256
257    for (c = 0; c < count; c++) {
258      __asm__ __volatile__ (
259          "ulw              %[qload1],    0(%[src])                    \n\t"
260          "ulw              %[qload2],    4(%[src])                    \n\t"
261
262          /* even 1. pixel */
263          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
264          "mthi             $zero,        $ac1                         \n\t"
265          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
266          "mthi             $zero,        $ac2                         \n\t"
267          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
268          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
269          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
270          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
271          "ulw              %[qload3],    8(%[src])                    \n\t"
272          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
273          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
274
275          /* even 2. pixel */
276          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
277          "mthi             $zero,        $ac3                         \n\t"
278          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
279          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
280          "ulw              %[qload1],    12(%[src])                   \n\t"
281          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
282          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
283          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
284
285          /* even 3. pixel */
286          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
287          "mthi             $zero,        $ac1                         \n\t"
288          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
289          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
290          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
291          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
292          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
293
294          /* even 4. pixel */
295          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
296          "mthi             $zero,        $ac2                         \n\t"
297          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
298          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
299          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
300          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
301          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
302
303          /* even 5. pixel */
304          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
305          "mthi             $zero,        $ac3                         \n\t"
306          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
307          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
308          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
309          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
310
311          /* even 6. pixel */
312          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
313          "mthi             $zero,        $ac1                         \n\t"
314          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
315          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
316          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
317          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
318
319          /* even 7. pixel */
320          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
321          "mthi             $zero,        $ac2                         \n\t"
322          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
323          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
324          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
325          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
326
327          /* even 8. pixel */
328          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
329          "mthi             $zero,        $ac3                         \n\t"
330          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
331          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
332          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
333          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
334
335          /* ODD pixels */
336          "ulw              %[qload1],    1(%[src])                    \n\t"
337          "ulw              %[qload2],    5(%[src])                    \n\t"
338
339          /* odd 1. pixel */
340          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
341          "mthi             $zero,        $ac1                         \n\t"
342          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
343          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
344          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
345          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
346          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
347          "ulw              %[qload3],    9(%[src])                    \n\t"
348          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
349          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
350          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
351
352          /* odd 2. pixel */
353          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
354          "mthi             $zero,        $ac2                         \n\t"
355          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
356          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
357          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
358          "ulw              %[qload1],    13(%[src])                   \n\t"
359          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
360          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
361          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
362
363          /* odd 3. pixel */
364          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
365          "mthi             $zero,        $ac3                         \n\t"
366          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
367          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
368          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
369          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
370          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
371
372          /* odd 4. pixel */
373          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
374          "mthi             $zero,        $ac1                         \n\t"
375          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
376          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
377          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
378          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
379          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
380
381          /* odd 5. pixel */
382          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
383          "mthi             $zero,        $ac2                         \n\t"
384          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
385          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
386          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
387          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
388
389          /* odd 6. pixel */
390          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
391          "mthi             $zero,        $ac3                         \n\t"
392          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
393          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
394          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
395          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
396
397          /* odd 7. pixel */
398          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
399          "mthi             $zero,        $ac1                         \n\t"
400          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
401          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
402          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
403
404          /* odd 8. pixel */
405          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
406          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
407
408          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
409          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
410          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
411
412          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
413          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
414          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
415
416          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
417            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
418            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
419            [p5] "=&r" (p5),
420            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
421          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
422            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
423      );
424
425      src += 16;
426      dst += 16;
427    }
428
429    /* Next row... */
430    src_ptr += src_stride;
431    dst_ptr += dst_stride;
432  }
433}
434
435static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
436                                       int32_t src_stride,
437                                       uint8_t *dst_ptr,
438                                       int32_t dst_stride,
439                                       const int16_t *filter_x0,
440                                       int32_t h) {
441  int32_t y, c;
442  const uint8_t *src;
443  uint8_t *dst;
444  uint8_t *cm = vpx_ff_cropTbl;
445  uint32_t vector_64 = 64;
446  int32_t Temp1, Temp2, Temp3;
447  uint32_t qload1, qload2, qload3;
448  uint32_t p1, p2, p3, p4, p5;
449  uint32_t st1, st2, st3;
450  const int16_t *filter = &filter_x0[3];
451  uint32_t filter45;;
452
453  filter45 = ((const int32_t *)filter)[0];
454
455  for (y = h; y--;) {
456    src = src_ptr;
457    dst = dst_ptr;
458
459    /* prefetch data to cache memory */
460    prefetch_load(src_ptr + src_stride);
461    prefetch_load(src_ptr + src_stride + 32);
462    prefetch_load(src_ptr + src_stride + 64);
463    prefetch_store(dst_ptr + dst_stride);
464    prefetch_store(dst_ptr + dst_stride + 32);
465
466    for (c = 0; c < 4; c++) {
467      __asm__ __volatile__ (
468          "ulw              %[qload1],    0(%[src])                    \n\t"
469          "ulw              %[qload2],    4(%[src])                    \n\t"
470
471          /* even 1. pixel */
472          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
473          "mthi             $zero,        $ac1                         \n\t"
474          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
475          "mthi             $zero,        $ac2                         \n\t"
476          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
477          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
478          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
479          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
480          "ulw              %[qload3],    8(%[src])                    \n\t"
481          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
482          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
483
484          /* even 2. pixel */
485          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
486          "mthi             $zero,        $ac3                         \n\t"
487          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
488          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
489          "ulw              %[qload1],    12(%[src])                   \n\t"
490          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
491          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
492          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
493
494          /* even 3. pixel */
495          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
496          "mthi             $zero,        $ac1                         \n\t"
497          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
498          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
499          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
500          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
501          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
502
503          /* even 4. pixel */
504          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
505          "mthi             $zero,        $ac2                         \n\t"
506          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
507          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
508          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
509          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
510          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
511
512          /* even 5. pixel */
513          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
514          "mthi             $zero,        $ac3                         \n\t"
515          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
516          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
517          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
518          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
519
520          /* even 6. pixel */
521          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
522          "mthi             $zero,        $ac1                         \n\t"
523          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
524          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
525          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
526          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
527
528          /* even 7. pixel */
529          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
530          "mthi             $zero,        $ac2                         \n\t"
531          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
532          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
533          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
534          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
535
536          /* even 8. pixel */
537          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
538          "mthi             $zero,        $ac3                         \n\t"
539          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
540          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
541          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
542          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
543
544          /* ODD pixels */
545          "ulw              %[qload1],    1(%[src])                    \n\t"
546          "ulw              %[qload2],    5(%[src])                    \n\t"
547
548          /* odd 1. pixel */
549          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
550          "mthi             $zero,        $ac1                         \n\t"
551          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
552          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
553          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
554          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
555          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
556          "ulw              %[qload3],    9(%[src])                    \n\t"
557          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
558          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
559          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
560
561          /* odd 2. pixel */
562          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
563          "mthi             $zero,        $ac2                         \n\t"
564          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
565          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
566          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
567          "ulw              %[qload1],    13(%[src])                   \n\t"
568          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
569          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
570          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
571
572          /* odd 3. pixel */
573          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
574          "mthi             $zero,        $ac3                         \n\t"
575          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
576          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
577          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
578          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
579          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
580
581          /* odd 4. pixel */
582          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
583          "mthi             $zero,        $ac1                         \n\t"
584          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
585          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
586          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
587          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
588          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
589
590          /* odd 5. pixel */
591          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
592          "mthi             $zero,        $ac2                         \n\t"
593          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
594          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
595          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
596          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
597
598          /* odd 6. pixel */
599          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
600          "mthi             $zero,        $ac3                         \n\t"
601          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
602          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
603          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
604          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
605
606          /* odd 7. pixel */
607          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
608          "mthi             $zero,        $ac1                         \n\t"
609          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
610          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
611          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
612
613          /* odd 8. pixel */
614          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
615          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
616
617          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
618          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
619          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
620
621          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
622          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
623          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
624
625          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
626            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
627            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
628            [p5] "=&r" (p5),
629            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
630          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
631            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
632      );
633
634      src += 16;
635      dst += 16;
636    }
637
638    /* Next row... */
639    src_ptr += src_stride;
640    dst_ptr += dst_stride;
641  }
642}
643
/*
 * DSPr2 entry point for the horizontal "convolve2" path.
 *
 * Selects a width-specific helper for w = 4, 8, 16, 32 and 64; any other
 * width is forwarded unchanged to vpx_convolve8_horiz_c().
 *
 *   src, src_stride  source pixels and row pitch
 *   dst, dst_stride  destination pixels and row pitch
 *   filter_x         horizontal filter taps, passed through to the helpers
 *   x_step_q4        asserted to be 16; otherwise only used by the C fallback
 *   filter_y,
 *   y_step_q4        only forwarded to the C fallback
 *   w, h             block width and height
 */
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  uint32_t extract_pos = 38;
  /* The width-specific helpers take 32-bit strides and height. */
  const int32_t src_pitch = (int32_t)src_stride;
  const int32_t dst_pitch = (int32_t)dst_stride;
  const int32_t rows = (int32_t)h;

  assert(x_step_q4 == 16);

  prefetch_load((const uint8_t *)filter_x);

  /* Set the bit position used when extracting from the accumulators. */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  /* Warm the cache for the first source and destination rows. */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_dspr2(src, src_pitch, dst, dst_pitch,
                                filter_x, rows);
      return;

    case 8:
      convolve_bi_horiz_8_dspr2(src, src_pitch, dst, dst_pitch,
                                filter_x, rows);
      return;

    case 16:
    case 32:
      /* The 16-wide helper is given the number of 16-pixel passes per row:
       * 1 for w == 16, 2 for w == 32. */
      convolve_bi_horiz_16_dspr2(src, src_pitch, dst, dst_pitch,
                                 filter_x, rows, w / 16);
      return;

    case 64:
      /* Extra prefetches are issued for the wider row. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_horiz_64_dspr2(src, src_pitch, dst, dst_pitch,
                                 filter_x, rows);
      return;

    default:
      /* No specialised path for this width: use the generic C version. */
      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
      return;
  }
}
705#endif
706