1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
21static void convolve_bi_horiz_4_transposed_dspr2(
22    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
23    const int16_t *filter_x0, int32_t h) {
24  int32_t y;
25  uint8_t *cm = vpx_ff_cropTbl;
26  uint8_t *dst_ptr;
27  int32_t Temp1, Temp2;
28  uint32_t vector4a = 64;
29  uint32_t tp1, tp2;
30  uint32_t p1, p2;
31  const int16_t *filter = &filter_x0[3];
32  uint32_t filter45;
33
34  filter45 = ((const int32_t *)filter)[0];
35
36  for (y = h; y--;) {
37    dst_ptr = dst;
38    /* prefetch data to cache memory */
39    prefetch_load(src + src_stride);
40    prefetch_load(src + src_stride + 32);
41
42    __asm__ __volatile__(
43        "ulw              %[tp1],         0(%[src])                      \n\t"
44        "ulw              %[tp2],         4(%[src])                      \n\t"
45
46        /* even 1. pixel */
47        "mtlo             %[vector4a],    $ac3                           \n\t"
48        "mthi             $zero,          $ac3                           \n\t"
49        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
50        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
51        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
52        "extp             %[Temp1],       $ac3,           31             \n\t"
53
54        /* even 2. pixel */
55        "mtlo             %[vector4a],    $ac2                           \n\t"
56        "mthi             $zero,          $ac2                           \n\t"
57        "balign           %[tp2],         %[tp1],         3              \n\t"
58        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
59        "extp             %[Temp2],       $ac2,           31             \n\t"
60
61        /* odd 1. pixel */
62        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
63        "mtlo             %[vector4a],    $ac3                           \n\t"
64        "mthi             $zero,          $ac3                           \n\t"
65        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
66        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
67        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
68        "extp             %[Temp1],       $ac3,           31             \n\t"
69
70        /* odd 2. pixel */
71        "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
72        "mtlo             %[vector4a],    $ac2                           \n\t"
73        "mthi             $zero,          $ac2                           \n\t"
74        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
75        "extp             %[Temp2],       $ac2,           31             \n\t"
76
77        /* clamp */
78        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
79        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
80
81        /* store bytes */
82        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
83        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
84
85        "sb               %[p1],          0(%[dst_ptr])                  \n\t"
86        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
87
88        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
89        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
90
91        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
92        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
93
94        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
95          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
96        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
97          [src] "r"(src), [dst_stride] "r"(dst_stride));
98
99    /* Next row... */
100    src += src_stride;
101    dst += 1;
102  }
103}
104
105static void convolve_bi_horiz_8_transposed_dspr2(
106    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
107    const int16_t *filter_x0, int32_t h) {
108  int32_t y;
109  uint8_t *cm = vpx_ff_cropTbl;
110  uint8_t *dst_ptr;
111  uint32_t vector4a = 64;
112  int32_t Temp1, Temp2, Temp3;
113  uint32_t tp1, tp2, tp3;
114  uint32_t p1, p2, p3, p4;
115  uint8_t *odd_dst;
116  uint32_t dst_pitch_2 = (dst_stride << 1);
117  const int16_t *filter = &filter_x0[3];
118  uint32_t filter45;
119
120  filter45 = ((const int32_t *)filter)[0];
121
122  for (y = h; y--;) {
123    /* prefetch data to cache memory */
124    prefetch_load(src + src_stride);
125    prefetch_load(src + src_stride + 32);
126
127    dst_ptr = dst;
128    odd_dst = (dst_ptr + dst_stride);
129
130    __asm__ __volatile__(
131        "ulw              %[tp1],         0(%[src])                       \n\t"
132        "ulw              %[tp2],         4(%[src])                       \n\t"
133
134        /* even 1. pixel */
135        "mtlo             %[vector4a],    $ac3                            \n\t"
136        "mthi             $zero,          $ac3                            \n\t"
137        "mtlo             %[vector4a],    $ac2                            \n\t"
138        "mthi             $zero,          $ac2                            \n\t"
139        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
140        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
141        "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
142        "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
143        "ulw              %[tp3],         8(%[src])                       \n\t"
144        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
145        "extp             %[Temp1],       $ac3,           31              \n\t"
146
147        /* even 2. pixel */
148        "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
149        "extp             %[Temp3],       $ac2,           31              \n\t"
150
151        /* even 3. pixel */
152        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
153        "mtlo             %[vector4a],    $ac1                            \n\t"
154        "mthi             $zero,          $ac1                            \n\t"
155        "balign           %[tp3],         %[tp2],         3              \n\t"
156        "balign           %[tp2],         %[tp1],         3              \n\t"
157        "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
158        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
159        "extp             %[p3],          $ac1,           31              \n\t"
160
161        /* even 4. pixel */
162        "mtlo             %[vector4a],    $ac2                            \n\t"
163        "mthi             $zero,          $ac2                            \n\t"
164        "mtlo             %[vector4a],    $ac3                            \n\t"
165        "mthi             $zero,          $ac3                            \n\t"
166        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
167        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
168        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
169        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
170
171        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
172        "extp             %[Temp3],       $ac2,           31              \n\t"
173
174        "lbux             %[Temp1],         %[p3](%[cm])                    "
175        "\n\t"
176
177        /* odd 1. pixel */
178        "mtlo             %[vector4a],    $ac1                            \n\t"
179        "mthi             $zero,          $ac1                            \n\t"
180        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
181        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
182        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
183        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
184        "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
185        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
186
187        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
188        "extp             %[Temp2],       $ac3,           31              \n\t"
189
190        /* odd 2. pixel */
191        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
192        "mtlo             %[vector4a],    $ac3                            \n\t"
193        "mthi             $zero,          $ac3                            \n\t"
194        "mtlo             %[vector4a],    $ac2                            \n\t"
195        "mthi             $zero,          $ac2                            \n\t"
196        "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
197        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
198        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
199        "extp             %[Temp3],       $ac1,           31              \n\t"
200
201        /* odd 3. pixel */
202        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
203        "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
204        "extp             %[Temp2],       $ac3,           31              \n\t"
205
206        /* odd 4. pixel */
207        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
208        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
209        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
210        "extp             %[Temp1],       $ac2,           31              \n\t"
211
212        /* clamp */
213        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
214        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
215        "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
216
217        /* store bytes */
218        "sb               %[p4],          0(%[odd_dst])                   \n\t"
219        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
220
221        "sb               %[p2],          0(%[odd_dst])                   \n\t"
222        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
223
224        "sb               %[p1],          0(%[odd_dst])                   \n\t"
225
226        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
227          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
228          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
229          [odd_dst] "+r"(odd_dst)
230        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
231          [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
232
233    /* Next row... */
234    src += src_stride;
235    dst += 1;
236  }
237}
238
239static void convolve_bi_horiz_16_transposed_dspr2(
240    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
241    int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
242  int32_t c, y;
243  const uint8_t *src;
244  uint8_t *dst;
245  uint8_t *cm = vpx_ff_cropTbl;
246  uint32_t vector_64 = 64;
247  int32_t Temp1, Temp2, Temp3;
248  uint32_t qload1, qload2;
249  uint32_t p1, p2, p3, p4, p5;
250  uint32_t st1, st2, st3;
251  uint32_t dst_pitch_2 = (dst_stride << 1);
252  uint8_t *odd_dst;
253  const int16_t *filter = &filter_x0[3];
254  uint32_t filter45;
255
256  filter45 = ((const int32_t *)filter)[0];
257
258  for (y = h; y--;) {
259    /* prefetch data to cache memory */
260    prefetch_load(src_ptr + src_stride);
261    prefetch_load(src_ptr + src_stride + 32);
262
263    src = src_ptr;
264    dst = dst_ptr;
265
266    odd_dst = (dst + dst_stride);
267
268    for (c = 0; c < count; c++) {
269      __asm__ __volatile__(
270          "ulw              %[qload1],        0(%[src])                       "
271          "\n\t"
272          "ulw              %[qload2],        4(%[src])                       "
273          "\n\t"
274
275          /* even 1. pixel */
276          "mtlo             %[vector_64],     $ac1                            "
277          "\n\t" /* even 1 */
278          "mthi             $zero,            $ac1                            "
279          "\n\t"
280          "mtlo             %[vector_64],     $ac2                            "
281          "\n\t" /* even 2 */
282          "mthi             $zero,            $ac2                            "
283          "\n\t"
284          "preceu.ph.qbr    %[p1],            %[qload1]                       "
285          "\n\t"
286          "preceu.ph.qbl    %[p2],            %[qload1]                       "
287          "\n\t"
288          "preceu.ph.qbr    %[p3],            %[qload2]                       "
289          "\n\t"
290          "preceu.ph.qbl    %[p4],            %[qload2]                       "
291          "\n\t"
292          "ulw              %[qload1],        8(%[src])                       "
293          "\n\t"
294          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
295          "\n\t" /* even 1 */
296          "extp             %[Temp1],         $ac1,           31              "
297          "\n\t" /* even 1 */
298
299          /* even 2. pixel */
300          "mtlo             %[vector_64],     $ac3                            "
301          "\n\t" /* even 3 */
302          "mthi             $zero,            $ac3                            "
303          "\n\t"
304          "preceu.ph.qbr    %[p1],            %[qload1]                       "
305          "\n\t"
306          "preceu.ph.qbl    %[p5],            %[qload1]                       "
307          "\n\t"
308          "ulw              %[qload2],        12(%[src])                      "
309          "\n\t"
310          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     "
311          "\n\t" /* even 1 */
312          "lbux             %[st1],           %[Temp1](%[cm])                 "
313          "\n\t" /* even 1 */
314          "extp             %[Temp2],         $ac2,           31              "
315          "\n\t" /* even 1 */
316
317          /* even 3. pixel */
318          "mtlo             %[vector_64],     $ac1                            "
319          "\n\t" /* even 4 */
320          "mthi             $zero,            $ac1                            "
321          "\n\t"
322          "preceu.ph.qbr    %[p2],            %[qload2]                       "
323          "\n\t"
324          "sb               %[st1],           0(%[dst])                       "
325          "\n\t" /* even 1 */
326          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
327          "          \n\t"
328          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     "
329          "\n\t" /* even 3 */
330          "extp             %[Temp3],         $ac3,           31              "
331          "\n\t" /* even 3 */
332          "lbux             %[st2],           %[Temp2](%[cm])                 "
333          "\n\t" /* even 1 */
334
335          /* even 4. pixel */
336          "mtlo             %[vector_64],     $ac2                            "
337          "\n\t" /* even 5 */
338          "mthi             $zero,            $ac2                            "
339          "\n\t"
340          "preceu.ph.qbl    %[p3],            %[qload2]                       "
341          "\n\t"
342          "sb               %[st2],           0(%[dst])                       "
343          "\n\t" /* even 2 */
344          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
345          "\n\t"
346          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     "
347          "\n\t" /* even 4 */
348          "extp             %[Temp1],         $ac1,           31              "
349          "\n\t" /* even 4 */
350          "lbux             %[st3],           %[Temp3](%[cm])                 "
351          "\n\t" /* even 3 */
352
353          /* even 5. pixel */
354          "mtlo             %[vector_64],     $ac3                            "
355          "\n\t" /* even 6 */
356          "mthi             $zero,            $ac3                            "
357          "\n\t"
358          "sb               %[st3],           0(%[dst])                       "
359          "\n\t" /* even 3 */
360          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
361          "\n\t"
362          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     "
363          "\n\t" /* even 5 */
364          "extp             %[Temp2],         $ac2,           31              "
365          "\n\t" /* even 5 */
366          "lbux             %[st1],           %[Temp1](%[cm])                 "
367          "\n\t" /* even 4 */
368
369          /* even 6. pixel */
370          "mtlo             %[vector_64],     $ac1                            "
371          "\n\t" /* even 7 */
372          "mthi             $zero,            $ac1                            "
373          "\n\t"
374          "sb               %[st1],           0(%[dst])                       "
375          "\n\t" /* even 4 */
376          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
377          "\n\t"
378          "ulw              %[qload1],        20(%[src])                      "
379          "\n\t"
380          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     "
381          "\n\t" /* even 6 */
382          "extp             %[Temp3],         $ac3,           31              "
383          "\n\t" /* even 6 */
384          "lbux             %[st2],           %[Temp2](%[cm])                 "
385          "\n\t" /* even 5 */
386
387          /* even 7. pixel */
388          "mtlo             %[vector_64],     $ac2                            "
389          "\n\t" /* even 8 */
390          "mthi             $zero,            $ac2                            "
391          "\n\t"
392          "preceu.ph.qbr    %[p5],            %[qload1]                       "
393          "\n\t"
394          "sb               %[st2],           0(%[dst])                       "
395          "\n\t" /* even 5 */
396          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
397          "\n\t"
398          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
399          "\n\t" /* even 7 */
400          "extp             %[Temp1],         $ac1,           31              "
401          "\n\t" /* even 7 */
402          "lbux             %[st3],           %[Temp3](%[cm])                 "
403          "\n\t" /* even 6 */
404
405          /* even 8. pixel */
406          "mtlo             %[vector_64],     $ac3                            "
407          "\n\t" /* odd 1 */
408          "mthi             $zero,            $ac3                            "
409          "\n\t"
410          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
411          "\n\t" /* even 8 */
412          "sb               %[st3],           0(%[dst])                       "
413          "\n\t" /* even 6 */
414          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
415          "\n\t"
416          "extp             %[Temp2],         $ac2,           31              "
417          "\n\t" /* even 8 */
418          "lbux             %[st1],           %[Temp1](%[cm])                 "
419          "\n\t" /* even 7 */
420
421          /* ODD pixels */
422          "ulw              %[qload1],        1(%[src])                       "
423          "\n\t"
424          "ulw              %[qload2],        5(%[src])                       "
425          "\n\t"
426
427          /* odd 1. pixel */
428          "mtlo             %[vector_64],     $ac1                            "
429          "\n\t" /* odd 2 */
430          "mthi             $zero,            $ac1                            "
431          "\n\t"
432          "preceu.ph.qbr    %[p1],            %[qload1]                       "
433          "\n\t"
434          "preceu.ph.qbl    %[p2],            %[qload1]                       "
435          "\n\t"
436          "preceu.ph.qbr    %[p3],            %[qload2]                       "
437          "\n\t"
438          "preceu.ph.qbl    %[p4],            %[qload2]                       "
439          "\n\t"
440          "sb               %[st1],           0(%[dst])                       "
441          "\n\t" /* even 7 */
442          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
443          "\n\t"
444          "ulw              %[qload2],        9(%[src])                       "
445          "\n\t"
446          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     "
447          "\n\t" /* odd 1 */
448          "extp             %[Temp3],         $ac3,           31              "
449          "\n\t" /* odd 1 */
450          "lbux             %[st2],           %[Temp2](%[cm])                 "
451          "\n\t" /* even 8 */
452
453          /* odd 2. pixel */
454          "mtlo             %[vector_64],     $ac2                            "
455          "\n\t" /* odd 3 */
456          "mthi             $zero,            $ac2                            "
457          "\n\t"
458          "preceu.ph.qbr    %[p1],            %[qload2]                       "
459          "\n\t"
460          "preceu.ph.qbl    %[p5],            %[qload2]                       "
461          "\n\t"
462          "sb               %[st2],           0(%[dst])                       "
463          "\n\t" /* even 8 */
464          "ulw              %[qload1],        13(%[src])                      "
465          "\n\t"
466          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
467          "\n\t" /* odd 2 */
468          "extp             %[Temp1],         $ac1,           31              "
469          "\n\t" /* odd 2 */
470          "lbux             %[st3],           %[Temp3](%[cm])                 "
471          "\n\t" /* odd 1 */
472
473          /* odd 3. pixel */
474          "mtlo             %[vector_64],     $ac3                            "
475          "\n\t" /* odd 4 */
476          "mthi             $zero,            $ac3                            "
477          "\n\t"
478          "preceu.ph.qbr    %[p2],            %[qload1]                       "
479          "\n\t"
480          "sb               %[st3],           0(%[odd_dst])                   "
481          "\n\t" /* odd 1 */
482          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
483          "\n\t"
484          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
485          "\n\t" /* odd 3 */
486          "extp             %[Temp2],         $ac2,           31              "
487          "\n\t" /* odd 3 */
488          "lbux             %[st1],           %[Temp1](%[cm])                 "
489          "\n\t" /* odd 2 */
490
491          /* odd 4. pixel */
492          "mtlo             %[vector_64],     $ac1                            "
493          "\n\t" /* odd 5 */
494          "mthi             $zero,            $ac1                            "
495          "\n\t"
496          "preceu.ph.qbl    %[p3],            %[qload1]                       "
497          "\n\t"
498          "sb               %[st1],           0(%[odd_dst])                   "
499          "\n\t" /* odd 2 */
500          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
501          "\n\t"
502          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     "
503          "\n\t" /* odd 4 */
504          "extp             %[Temp3],         $ac3,           31              "
505          "\n\t" /* odd 4 */
506          "lbux             %[st2],           %[Temp2](%[cm])                 "
507          "\n\t" /* odd 3 */
508
509          /* odd 5. pixel */
510          "mtlo             %[vector_64],     $ac2                            "
511          "\n\t" /* odd 6 */
512          "mthi             $zero,            $ac2                            "
513          "\n\t"
514          "sb               %[st2],           0(%[odd_dst])                   "
515          "\n\t" /* odd 3 */
516          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
517          "\n\t"
518          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
519          "\n\t" /* odd 5 */
520          "extp             %[Temp1],         $ac1,           31              "
521          "\n\t" /* odd 5 */
522          "lbux             %[st3],           %[Temp3](%[cm])                 "
523          "\n\t" /* odd 4 */
524
525          /* odd 6. pixel */
526          "mtlo             %[vector_64],     $ac3                            "
527          "\n\t" /* odd 7 */
528          "mthi             $zero,            $ac3                            "
529          "\n\t"
530          "sb               %[st3],           0(%[odd_dst])                   "
531          "\n\t" /* odd 4 */
532          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
533          "\n\t"
534          "ulw              %[qload1],        21(%[src])                      "
535          "\n\t"
536          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     "
537          "\n\t" /* odd 6 */
538          "extp             %[Temp2],         $ac2,           31              "
539          "\n\t" /* odd 6 */
540          "lbux             %[st1],           %[Temp1](%[cm])                 "
541          "\n\t" /* odd 5 */
542
543          /* odd 7. pixel */
544          "mtlo             %[vector_64],     $ac1                            "
545          "\n\t" /* odd 8 */
546          "mthi             $zero,            $ac1                            "
547          "\n\t"
548          "preceu.ph.qbr    %[p5],            %[qload1]                       "
549          "\n\t"
550          "sb               %[st1],           0(%[odd_dst])                   "
551          "\n\t" /* odd 5 */
552          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
553          "\n\t"
554          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     "
555          "\n\t" /* odd 7 */
556          "extp             %[Temp3],         $ac3,           31              "
557          "\n\t" /* odd 7 */
558
559          /* odd 8. pixel */
560          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     "
561          "\n\t" /* odd 8 */
562          "extp             %[Temp1],         $ac1,           31              "
563          "\n\t" /* odd 8 */
564
565          "lbux             %[st2],           %[Temp2](%[cm])                 "
566          "\n\t" /* odd 6 */
567          "lbux             %[st3],           %[Temp3](%[cm])                 "
568          "\n\t" /* odd 7 */
569          "lbux             %[st1],           %[Temp1](%[cm])                 "
570          "\n\t" /* odd 8 */
571
572          "sb               %[st2],           0(%[odd_dst])                   "
573          "\n\t" /* odd 6 */
574          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
575          "\n\t"
576
577          "sb               %[st3],           0(%[odd_dst])                   "
578          "\n\t" /* odd 7 */
579          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
580          "\n\t"
581
582          "sb               %[st1],           0(%[odd_dst])                   "
583          "\n\t" /* odd 8 */
584
585          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
586            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
587            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
588            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
589            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
590          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
591            [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
592
593      src += 16;
594      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
595      odd_dst = (dst + dst_stride);
596    }
597
598    /* Next row... */
599    src_ptr += src_stride;
600    dst_ptr += 1;
601  }
602}
603
604static void convolve_bi_horiz_64_transposed_dspr2(
605    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
606    int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
607  int32_t c, y;
608  const uint8_t *src;
609  uint8_t *dst;
610  uint8_t *cm = vpx_ff_cropTbl;
611  uint32_t vector_64 = 64;
612  int32_t Temp1, Temp2, Temp3;
613  uint32_t qload1, qload2;
614  uint32_t p1, p2, p3, p4, p5;
615  uint32_t st1, st2, st3;
616  uint32_t dst_pitch_2 = (dst_stride << 1);
617  uint8_t *odd_dst;
618  const int16_t *filter = &filter_x0[3];
619  uint32_t filter45;
620
621  filter45 = ((const int32_t *)filter)[0];
622
623  for (y = h; y--;) {
624    /* prefetch data to cache memory */
625    prefetch_load(src_ptr + src_stride);
626    prefetch_load(src_ptr + src_stride + 32);
627    prefetch_load(src_ptr + src_stride + 64);
628
629    src = src_ptr;
630    dst = dst_ptr;
631
632    odd_dst = (dst + dst_stride);
633
634    for (c = 0; c < 4; c++) {
635      __asm__ __volatile__(
636          "ulw              %[qload1],        0(%[src])                       "
637          "\n\t"
638          "ulw              %[qload2],        4(%[src])                       "
639          "\n\t"
640
641          /* even 1. pixel */
642          "mtlo             %[vector_64],     $ac1                            "
643          "\n\t" /* even 1 */
644          "mthi             $zero,            $ac1                            "
645          "\n\t"
646          "mtlo             %[vector_64],     $ac2                            "
647          "\n\t" /* even 2 */
648          "mthi             $zero,            $ac2                            "
649          "\n\t"
650          "preceu.ph.qbr    %[p1],            %[qload1]                       "
651          "\n\t"
652          "preceu.ph.qbl    %[p2],            %[qload1]                       "
653          "\n\t"
654          "preceu.ph.qbr    %[p3],            %[qload2]                       "
655          "\n\t"
656          "preceu.ph.qbl    %[p4],            %[qload2]                       "
657          "\n\t"
658          "ulw              %[qload1],        8(%[src])                       "
659          "\n\t"
660          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
661          "\n\t" /* even 1 */
662          "extp             %[Temp1],         $ac1,           31              "
663          "\n\t" /* even 1 */
664
665          /* even 2. pixel */
666          "mtlo             %[vector_64],     $ac3                            "
667          "\n\t" /* even 3 */
668          "mthi             $zero,            $ac3                            "
669          "\n\t"
670          "preceu.ph.qbr    %[p1],            %[qload1]                       "
671          "\n\t"
672          "preceu.ph.qbl    %[p5],            %[qload1]                       "
673          "\n\t"
674          "ulw              %[qload2],        12(%[src])                      "
675          "\n\t"
676          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     "
677          "\n\t" /* even 1 */
678          "lbux             %[st1],           %[Temp1](%[cm])                 "
679          "\n\t" /* even 1 */
680          "extp             %[Temp2],         $ac2,           31              "
681          "\n\t" /* even 1 */
682
683          /* even 3. pixel */
684          "mtlo             %[vector_64],     $ac1                            "
685          "\n\t" /* even 4 */
686          "mthi             $zero,            $ac1                            "
687          "\n\t"
688          "preceu.ph.qbr    %[p2],            %[qload2]                       "
689          "\n\t"
690          "sb               %[st1],           0(%[dst])                       "
691          "\n\t" /* even 1 */
692          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
693          "          \n\t"
694          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     "
695          "\n\t" /* even 3 */
696          "extp             %[Temp3],         $ac3,           31              "
697          "\n\t" /* even 3 */
698          "lbux             %[st2],           %[Temp2](%[cm])                 "
699          "\n\t" /* even 1 */
700
701          /* even 4. pixel */
702          "mtlo             %[vector_64],     $ac2                            "
703          "\n\t" /* even 5 */
704          "mthi             $zero,            $ac2                            "
705          "\n\t"
706          "preceu.ph.qbl    %[p3],            %[qload2]                       "
707          "\n\t"
708          "sb               %[st2],           0(%[dst])                       "
709          "\n\t" /* even 2 */
710          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
711          "\n\t"
712          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     "
713          "\n\t" /* even 4 */
714          "extp             %[Temp1],         $ac1,           31              "
715          "\n\t" /* even 4 */
716          "lbux             %[st3],           %[Temp3](%[cm])                 "
717          "\n\t" /* even 3 */
718
719          /* even 5. pixel */
720          "mtlo             %[vector_64],     $ac3                            "
721          "\n\t" /* even 6 */
722          "mthi             $zero,            $ac3                            "
723          "\n\t"
724          "sb               %[st3],           0(%[dst])                       "
725          "\n\t" /* even 3 */
726          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
727          "\n\t"
728          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     "
729          "\n\t" /* even 5 */
730          "extp             %[Temp2],         $ac2,           31              "
731          "\n\t" /* even 5 */
732          "lbux             %[st1],           %[Temp1](%[cm])                 "
733          "\n\t" /* even 4 */
734
735          /* even 6. pixel */
736          "mtlo             %[vector_64],     $ac1                            "
737          "\n\t" /* even 7 */
738          "mthi             $zero,            $ac1                            "
739          "\n\t"
740          "sb               %[st1],           0(%[dst])                       "
741          "\n\t" /* even 4 */
742          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
743          "\n\t"
744          "ulw              %[qload1],        20(%[src])                      "
745          "\n\t"
746          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     "
747          "\n\t" /* even 6 */
748          "extp             %[Temp3],         $ac3,           31              "
749          "\n\t" /* even 6 */
750          "lbux             %[st2],           %[Temp2](%[cm])                 "
751          "\n\t" /* even 5 */
752
753          /* even 7. pixel */
754          "mtlo             %[vector_64],     $ac2                            "
755          "\n\t" /* even 8 */
756          "mthi             $zero,            $ac2                            "
757          "\n\t"
758          "preceu.ph.qbr    %[p5],            %[qload1]                       "
759          "\n\t"
760          "sb               %[st2],           0(%[dst])                       "
761          "\n\t" /* even 5 */
762          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
763          "\n\t"
764          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
765          "\n\t" /* even 7 */
766          "extp             %[Temp1],         $ac1,           31              "
767          "\n\t" /* even 7 */
768          "lbux             %[st3],           %[Temp3](%[cm])                 "
769          "\n\t" /* even 6 */
770
771          /* even 8. pixel */
772          "mtlo             %[vector_64],     $ac3                            "
773          "\n\t" /* odd 1 */
774          "mthi             $zero,            $ac3                            "
775          "\n\t"
776          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
777          "\n\t" /* even 8 */
778          "sb               %[st3],           0(%[dst])                       "
779          "\n\t" /* even 6 */
780          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
781          "\n\t"
782          "extp             %[Temp2],         $ac2,           31              "
783          "\n\t" /* even 8 */
784          "lbux             %[st1],           %[Temp1](%[cm])                 "
785          "\n\t" /* even 7 */
786
787          /* ODD pixels */
788          "ulw              %[qload1],        1(%[src])                       "
789          "\n\t"
790          "ulw              %[qload2],        5(%[src])                       "
791          "\n\t"
792
793          /* odd 1. pixel */
794          "mtlo             %[vector_64],     $ac1                            "
795          "\n\t" /* odd 2 */
796          "mthi             $zero,            $ac1                            "
797          "\n\t"
798          "preceu.ph.qbr    %[p1],            %[qload1]                       "
799          "\n\t"
800          "preceu.ph.qbl    %[p2],            %[qload1]                       "
801          "\n\t"
802          "preceu.ph.qbr    %[p3],            %[qload2]                       "
803          "\n\t"
804          "preceu.ph.qbl    %[p4],            %[qload2]                       "
805          "\n\t"
806          "sb               %[st1],           0(%[dst])                       "
807          "\n\t" /* even 7 */
808          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
809          "\n\t"
810          "ulw              %[qload2],        9(%[src])                       "
811          "\n\t"
812          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     "
813          "\n\t" /* odd 1 */
814          "extp             %[Temp3],         $ac3,           31              "
815          "\n\t" /* odd 1 */
816          "lbux             %[st2],           %[Temp2](%[cm])                 "
817          "\n\t" /* even 8 */
818
819          /* odd 2. pixel */
820          "mtlo             %[vector_64],     $ac2                            "
821          "\n\t" /* odd 3 */
822          "mthi             $zero,            $ac2                            "
823          "\n\t"
824          "preceu.ph.qbr    %[p1],            %[qload2]                       "
825          "\n\t"
826          "preceu.ph.qbl    %[p5],            %[qload2]                       "
827          "\n\t"
828          "sb               %[st2],           0(%[dst])                       "
829          "\n\t" /* even 8 */
830          "ulw              %[qload1],        13(%[src])                      "
831          "\n\t"
832          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
833          "\n\t" /* odd 2 */
834          "extp             %[Temp1],         $ac1,           31              "
835          "\n\t" /* odd 2 */
836          "lbux             %[st3],           %[Temp3](%[cm])                 "
837          "\n\t" /* odd 1 */
838
839          /* odd 3. pixel */
840          "mtlo             %[vector_64],     $ac3                            "
841          "\n\t" /* odd 4 */
842          "mthi             $zero,            $ac3                            "
843          "\n\t"
844          "preceu.ph.qbr    %[p2],            %[qload1]                       "
845          "\n\t"
846          "sb               %[st3],           0(%[odd_dst])                   "
847          "\n\t" /* odd 1 */
848          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
849          "\n\t"
850          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
851          "\n\t" /* odd 3 */
852          "extp             %[Temp2],         $ac2,           31              "
853          "\n\t" /* odd 3 */
854          "lbux             %[st1],           %[Temp1](%[cm])                 "
855          "\n\t" /* odd 2 */
856
857          /* odd 4. pixel */
858          "mtlo             %[vector_64],     $ac1                            "
859          "\n\t" /* odd 5 */
860          "mthi             $zero,            $ac1                            "
861          "\n\t"
862          "preceu.ph.qbl    %[p3],            %[qload1]                       "
863          "\n\t"
864          "sb               %[st1],           0(%[odd_dst])                   "
865          "\n\t" /* odd 2 */
866          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
867          "\n\t"
868          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     "
869          "\n\t" /* odd 4 */
870          "extp             %[Temp3],         $ac3,           31              "
871          "\n\t" /* odd 4 */
872          "lbux             %[st2],           %[Temp2](%[cm])                 "
873          "\n\t" /* odd 3 */
874
875          /* odd 5. pixel */
876          "mtlo             %[vector_64],     $ac2                            "
877          "\n\t" /* odd 6 */
878          "mthi             $zero,            $ac2                            "
879          "\n\t"
880          "sb               %[st2],           0(%[odd_dst])                   "
881          "\n\t" /* odd 3 */
882          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
883          "\n\t"
884          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
885          "\n\t" /* odd 5 */
886          "extp             %[Temp1],         $ac1,           31              "
887          "\n\t" /* odd 5 */
888          "lbux             %[st3],           %[Temp3](%[cm])                 "
889          "\n\t" /* odd 4 */
890
891          /* odd 6. pixel */
892          "mtlo             %[vector_64],     $ac3                            "
893          "\n\t" /* odd 7 */
894          "mthi             $zero,            $ac3                            "
895          "\n\t"
896          "sb               %[st3],           0(%[odd_dst])                   "
897          "\n\t" /* odd 4 */
898          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
899          "\n\t"
900          "ulw              %[qload1],        21(%[src])                      "
901          "\n\t"
902          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     "
903          "\n\t" /* odd 6 */
904          "extp             %[Temp2],         $ac2,           31              "
905          "\n\t" /* odd 6 */
906          "lbux             %[st1],           %[Temp1](%[cm])                 "
907          "\n\t" /* odd 5 */
908
909          /* odd 7. pixel */
910          "mtlo             %[vector_64],     $ac1                            "
911          "\n\t" /* odd 8 */
912          "mthi             $zero,            $ac1                            "
913          "\n\t"
914          "preceu.ph.qbr    %[p5],            %[qload1]                       "
915          "\n\t"
916          "sb               %[st1],           0(%[odd_dst])                   "
917          "\n\t" /* odd 5 */
918          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
919          "\n\t"
920          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     "
921          "\n\t" /* odd 7 */
922          "extp             %[Temp3],         $ac3,           31              "
923          "\n\t" /* odd 7 */
924
925          /* odd 8. pixel */
926          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     "
927          "\n\t" /* odd 8 */
928          "extp             %[Temp1],         $ac1,           31              "
929          "\n\t" /* odd 8 */
930
931          "lbux             %[st2],           %[Temp2](%[cm])                 "
932          "\n\t" /* odd 6 */
933          "lbux             %[st3],           %[Temp3](%[cm])                 "
934          "\n\t" /* odd 7 */
935          "lbux             %[st1],           %[Temp1](%[cm])                 "
936          "\n\t" /* odd 8 */
937
938          "sb               %[st2],           0(%[odd_dst])                   "
939          "\n\t" /* odd 6 */
940          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
941          "\n\t"
942
943          "sb               %[st3],           0(%[odd_dst])                   "
944          "\n\t" /* odd 7 */
945          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
946          "\n\t"
947
948          "sb               %[st1],           0(%[odd_dst])                   "
949          "\n\t" /* odd 8 */
950
951          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
952            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
953            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
954            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
955            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
956          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
957            [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
958
959      src += 16;
960      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
961      odd_dst = (dst + dst_stride);
962    }
963
964    /* Next row... */
965    src_ptr += src_stride;
966    dst_ptr += 1;
967  }
968}
969
970void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
971                                  uint8_t *dst, ptrdiff_t dst_stride,
972                                  const int16_t *filter, int w, int h) {
973  int x, y;
974
975  for (y = 0; y < h; ++y) {
976    for (x = 0; x < w; ++x) {
977      int sum = 0;
978
979      sum += src[x] * filter[3];
980      sum += src[x + 1] * filter[4];
981
982      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
983    }
984
985    src += src_stride;
986    dst += 1;
987  }
988}
989
/*
 * Two-tap horizontal convolution with transposed output, DSPr2 entry point.
 *
 * Programs the DSP control register for the accumulator extracts used by the
 * assembly kernels, warms the cache for the first source row, and then
 * dispatches on the block width:
 *   w == 4        -> convolve_bi_horiz_4_transposed_dspr2
 *   w == 8        -> convolve_bi_horiz_8_transposed_dspr2
 *   w == 16, 32   -> convolve_bi_horiz_16_transposed_dspr2, with w / 16 as
 *                    the last argument (presumably the number of 16-pixel
 *                    groups per row -- confirm against that kernel)
 *   w == 64       -> convolve_bi_horiz_64_transposed_dspr2
 *   anything else -> the plain-C convolve_bi_horiz_transposed fallback
 *
 * src/src_stride  source pixels and the distance between source rows
 * dst/dst_stride  destination pixels and the distance between output rows
 * filter          filter kernel; the visible kernels read taps 3 and 4 only
 * w, h            width and height of the source block
 */
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter, int w,
                         int h) {
  uint32_t pos = 38;

  /* bit position for extract from acc (written to the DSPControl register;
   * the "extp" instructions in the kernels depend on it) */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
                                           filter, h);
      break;
    case 8:
      convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
                                           filter, h);
      break;
    case 16:
    case 32:
      convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
                                            filter, h, (w / 16));
      break;
    case 64:
      /* src and src + 32 were already prefetched above; a 64-wide row reaches
       * up to src + 64, so fetch that instead of repeating src + 32.  This
       * matches the 0 / +32 / +64 pattern the 64-wide kernel uses for each
       * following row.  Prefetch is only a hint, so output is unaffected. */
      prefetch_load(src + 64);
      convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
                                            filter, h);
      break;
    default:
      convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
                                   h);
      break;
  }
}
1029#endif
1030