1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/mips/convolve_common_dspr2.h"
16#include "vpx_dsp/vpx_dsp_common.h"
17#include "vpx_dsp/vpx_filter.h"
18#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
21static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
22                                                 int32_t src_stride,
23                                                 uint8_t *dst,
24                                                 int32_t dst_stride,
25                                                 const int16_t *filter_x0,
26                                                 int32_t h) {
27  int32_t       y;
28  uint8_t       *cm = vpx_ff_cropTbl;
29  uint8_t       *dst_ptr;
30  int32_t       Temp1, Temp2;
31  uint32_t      vector4a = 64;
32  uint32_t      tp1, tp2;
33  uint32_t      p1, p2;
34  const int16_t *filter = &filter_x0[3];
35  uint32_t      filter45;
36
37  filter45 = ((const int32_t *)filter)[0];
38
39  for (y = h; y--;) {
40    dst_ptr = dst;
41    /* prefetch data to cache memory */
42    prefetch_load(src + src_stride);
43    prefetch_load(src + src_stride + 32);
44
45    __asm__ __volatile__ (
46        "ulw              %[tp1],         0(%[src])                      \n\t"
47        "ulw              %[tp2],         4(%[src])                      \n\t"
48
49        /* even 1. pixel */
50        "mtlo             %[vector4a],    $ac3                           \n\t"
51        "mthi             $zero,          $ac3                           \n\t"
52        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
53        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
54        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
55        "extp             %[Temp1],       $ac3,           31             \n\t"
56
57        /* even 2. pixel */
58        "mtlo             %[vector4a],    $ac2                           \n\t"
59        "mthi             $zero,          $ac2                           \n\t"
60        "balign           %[tp2],         %[tp1],         3              \n\t"
61        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
62        "extp             %[Temp2],       $ac2,           31             \n\t"
63
64        /* odd 1. pixel */
65        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
66        "mtlo             %[vector4a],    $ac3                           \n\t"
67        "mthi             $zero,          $ac3                           \n\t"
68        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
69        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
70        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
71        "extp             %[Temp1],       $ac3,           31             \n\t"
72
73        /* odd 2. pixel */
74        "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
75        "mtlo             %[vector4a],    $ac2                           \n\t"
76        "mthi             $zero,          $ac2                           \n\t"
77        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
78        "extp             %[Temp2],       $ac2,           31             \n\t"
79
80        /* clamp */
81        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
82        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
83
84        /* store bytes */
85        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
86        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
87
88        "sb               %[p1],          0(%[dst_ptr])                  \n\t"
89        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
90
91        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
92        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
93
94        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
95        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
96
97        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
98          [p1] "=&r" (p1), [p2] "=&r" (p2),
99          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
100          [dst_ptr] "+r" (dst_ptr)
101        : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
102          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
103    );
104
105    /* Next row... */
106    src += src_stride;
107    dst += 1;
108  }
109}
110
111static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
112                                                 int32_t src_stride,
113                                                 uint8_t *dst,
114                                                 int32_t dst_stride,
115                                                 const int16_t *filter_x0,
116                                                 int32_t h) {
117  int32_t y;
118  uint8_t *cm = vpx_ff_cropTbl;
119  uint8_t *dst_ptr;
120  uint32_t vector4a = 64;
121  int32_t Temp1, Temp2, Temp3;
122  uint32_t tp1, tp2, tp3;
123  uint32_t p1, p2, p3, p4;
124  uint8_t *odd_dst;
125  uint32_t dst_pitch_2 = (dst_stride << 1);
126  const int16_t *filter = &filter_x0[3];
127  uint32_t      filter45;
128
129  filter45 = ((const int32_t *)filter)[0];
130
131  for (y = h; y--;) {
132    /* prefetch data to cache memory */
133    prefetch_load(src + src_stride);
134    prefetch_load(src + src_stride + 32);
135
136    dst_ptr = dst;
137    odd_dst = (dst_ptr + dst_stride);
138
139    __asm__ __volatile__ (
140        "ulw              %[tp1],         0(%[src])                       \n\t"
141        "ulw              %[tp2],         4(%[src])                       \n\t"
142
143        /* even 1. pixel */
144        "mtlo             %[vector4a],    $ac3                            \n\t"
145        "mthi             $zero,          $ac3                            \n\t"
146        "mtlo             %[vector4a],    $ac2                            \n\t"
147        "mthi             $zero,          $ac2                            \n\t"
148        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
149        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
150        "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
151        "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
152        "ulw              %[tp3],         8(%[src])                       \n\t"
153        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
154        "extp             %[Temp1],       $ac3,           31              \n\t"
155
156        /* even 2. pixel */
157        "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
158        "extp             %[Temp3],       $ac2,           31              \n\t"
159
160        /* even 3. pixel */
161        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
162        "mtlo             %[vector4a],    $ac1                            \n\t"
163        "mthi             $zero,          $ac1                            \n\t"
164        "balign           %[tp3],         %[tp2],         3              \n\t"
165        "balign           %[tp2],         %[tp1],         3              \n\t"
166        "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
167        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
168        "extp             %[p3],          $ac1,           31              \n\t"
169
170        /* even 4. pixel */
171        "mtlo             %[vector4a],    $ac2                            \n\t"
172        "mthi             $zero,          $ac2                            \n\t"
173        "mtlo             %[vector4a],    $ac3                            \n\t"
174        "mthi             $zero,          $ac3                            \n\t"
175        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
176        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
177        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
178        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
179
180        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
181        "extp             %[Temp3],       $ac2,           31              \n\t"
182
183        "lbux             %[Temp1],         %[p3](%[cm])                    \n\t"
184
185        /* odd 1. pixel */
186        "mtlo             %[vector4a],    $ac1                            \n\t"
187        "mthi             $zero,          $ac1                            \n\t"
188        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
189        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
190        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
191        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
192        "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
193        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
194
195        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
196        "extp             %[Temp2],       $ac3,           31              \n\t"
197
198        /* odd 2. pixel */
199        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
200        "mtlo             %[vector4a],    $ac3                            \n\t"
201        "mthi             $zero,          $ac3                            \n\t"
202        "mtlo             %[vector4a],    $ac2                            \n\t"
203        "mthi             $zero,          $ac2                            \n\t"
204        "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
205        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
206        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
207        "extp             %[Temp3],       $ac1,           31              \n\t"
208
209        /* odd 3. pixel */
210        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
211        "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
212        "extp             %[Temp2],       $ac3,           31              \n\t"
213
214        /* odd 4. pixel */
215        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
216        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
217        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
218        "extp             %[Temp1],       $ac2,           31              \n\t"
219
220        /* clamp */
221        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
222        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
223        "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
224
225        /* store bytes */
226        "sb               %[p4],          0(%[odd_dst])                   \n\t"
227        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
228
229        "sb               %[p2],          0(%[odd_dst])                   \n\t"
230        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
231
232        "sb               %[p1],          0(%[odd_dst])                   \n\t"
233
234        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
235          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
236          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
237          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
238        : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
239          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
240    );
241
242    /* Next row... */
243    src += src_stride;
244    dst += 1;
245  }
246}
247
248static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
249                                                  int32_t src_stride,
250                                                  uint8_t *dst_ptr,
251                                                  int32_t dst_stride,
252                                                  const int16_t *filter_x0,
253                                                  int32_t h,
254                                                  int32_t count) {
255  int32_t       c, y;
256  const uint8_t *src;
257  uint8_t       *dst;
258  uint8_t       *cm = vpx_ff_cropTbl;
259  uint32_t      vector_64 = 64;
260  int32_t       Temp1, Temp2, Temp3;
261  uint32_t      qload1, qload2;
262  uint32_t      p1, p2, p3, p4, p5;
263  uint32_t      st1, st2, st3;
264  uint32_t      dst_pitch_2 = (dst_stride << 1);
265  uint8_t       *odd_dst;
266  const int16_t *filter = &filter_x0[3];
267  uint32_t      filter45;
268
269  filter45 = ((const int32_t *)filter)[0];
270
271  for (y = h; y--;) {
272    /* prefetch data to cache memory */
273    prefetch_load(src_ptr + src_stride);
274    prefetch_load(src_ptr + src_stride + 32);
275
276    src = src_ptr;
277    dst = dst_ptr;
278
279    odd_dst = (dst + dst_stride);
280
281    for (c = 0; c < count; c++) {
282      __asm__ __volatile__ (
283          "ulw              %[qload1],        0(%[src])                       \n\t"
284          "ulw              %[qload2],        4(%[src])                       \n\t"
285
286          /* even 1. pixel */
287          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
288          "mthi             $zero,            $ac1                            \n\t"
289          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
290          "mthi             $zero,            $ac2                            \n\t"
291          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
292          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
293          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
294          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
295          "ulw              %[qload1],        8(%[src])                       \n\t"
296          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
297          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
298
299          /* even 2. pixel */
300          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
301          "mthi             $zero,            $ac3                            \n\t"
302          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
303          "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
304          "ulw              %[qload2],        12(%[src])                      \n\t"
305          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
306          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
307          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
308
309          /* even 3. pixel */
310          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
311          "mthi             $zero,            $ac1                            \n\t"
312          "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
313          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
314          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
315          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
316          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
317          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
318
319          /* even 4. pixel */
320          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
321          "mthi             $zero,            $ac2                            \n\t"
322          "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
323          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
324          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
325          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
326          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
327          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
328
329          /* even 5. pixel */
330          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
331          "mthi             $zero,            $ac3                            \n\t"
332          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
333          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
334          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
335          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
336          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
337
338          /* even 6. pixel */
339          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
340          "mthi             $zero,            $ac1                            \n\t"
341          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
342          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
343          "ulw              %[qload1],        20(%[src])                      \n\t"
344          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
345          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
346          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
347
348          /* even 7. pixel */
349          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
350          "mthi             $zero,            $ac2                            \n\t"
351          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
352          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
353          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
354          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
355          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
356          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
357
358          /* even 8. pixel */
359          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
360          "mthi             $zero,            $ac3                            \n\t"
361          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
362          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
363          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
364          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
365          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
366
367          /* ODD pixels */
368          "ulw              %[qload1],        1(%[src])                       \n\t"
369          "ulw              %[qload2],        5(%[src])                       \n\t"
370
371          /* odd 1. pixel */
372          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
373          "mthi             $zero,            $ac1                            \n\t"
374          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
375          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
376          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
377          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
378          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
379          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
380          "ulw              %[qload2],        9(%[src])                       \n\t"
381          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
382          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
383          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
384
385          /* odd 2. pixel */
386          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
387          "mthi             $zero,            $ac2                            \n\t"
388          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
389          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
390          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
391          "ulw              %[qload1],        13(%[src])                      \n\t"
392          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
393          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
394          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
395
396          /* odd 3. pixel */
397          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
398          "mthi             $zero,            $ac3                            \n\t"
399          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
400          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
401          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
402          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
403          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
404          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
405
406          /* odd 4. pixel */
407          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
408          "mthi             $zero,            $ac1                            \n\t"
409          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
410          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
411          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
412          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
413          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
414          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
415
416          /* odd 5. pixel */
417          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
418          "mthi             $zero,            $ac2                            \n\t"
419          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
420          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
421          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
422          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
423          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
424
425          /* odd 6. pixel */
426          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
427          "mthi             $zero,            $ac3                            \n\t"
428          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
429          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
430          "ulw              %[qload1],        21(%[src])                      \n\t"
431          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
432          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
433          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
434
435          /* odd 7. pixel */
436          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
437          "mthi             $zero,            $ac1                            \n\t"
438          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
439          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
440          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
441          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
442          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
443
444          /* odd 8. pixel */
445          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
446          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
447
448          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
449          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
450          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
451
452          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
453          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
454
455          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
456          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
457
458          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
459
460          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
461            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
462            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
463            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
464            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
465          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
466            [cm] "r" (cm),
467            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
468      );
469
470      src += 16;
471      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
472      odd_dst = (dst + dst_stride);
473    }
474
475    /* Next row... */
476    src_ptr += src_stride;
477    dst_ptr += 1;
478  }
479}
480
481static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
482                                                  int32_t src_stride,
483                                                  uint8_t *dst_ptr,
484                                                  int32_t dst_stride,
485                                                  const int16_t *filter_x0,
486                                                  int32_t h) {
487  int32_t       c, y;
488  const uint8_t *src;
489  uint8_t       *dst;
490  uint8_t       *cm = vpx_ff_cropTbl;
491  uint32_t      vector_64 = 64;
492  int32_t       Temp1, Temp2, Temp3;
493  uint32_t      qload1, qload2;
494  uint32_t      p1, p2, p3, p4, p5;
495  uint32_t      st1, st2, st3;
496  uint32_t      dst_pitch_2 = (dst_stride << 1);
497  uint8_t       *odd_dst;
498  const int16_t *filter = &filter_x0[3];
499  uint32_t      filter45;
500
501  filter45 = ((const int32_t *)filter)[0];
502
503  for (y = h; y--;) {
504    /* prefetch data to cache memory */
505    prefetch_load(src_ptr + src_stride);
506    prefetch_load(src_ptr + src_stride + 32);
507    prefetch_load(src_ptr + src_stride + 64);
508
509    src = src_ptr;
510    dst = dst_ptr;
511
512    odd_dst = (dst + dst_stride);
513
514    for (c = 0; c < 4; c++) {
515      __asm__ __volatile__ (
516          "ulw              %[qload1],        0(%[src])                       \n\t"
517          "ulw              %[qload2],        4(%[src])                       \n\t"
518
519          /* even 1. pixel */
520          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
521          "mthi             $zero,            $ac1                            \n\t"
522          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
523          "mthi             $zero,            $ac2                            \n\t"
524          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
525          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
526          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
527          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
528          "ulw              %[qload1],        8(%[src])                       \n\t"
529          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
530          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
531
532          /* even 2. pixel */
533          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
534          "mthi             $zero,            $ac3                            \n\t"
535          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
536          "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
537          "ulw              %[qload2],        12(%[src])                      \n\t"
538          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
539          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
540          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
541
542          /* even 3. pixel */
543          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
544          "mthi             $zero,            $ac1                            \n\t"
545          "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
546          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
547          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
548          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
549          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
550          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
551
552          /* even 4. pixel */
553          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
554          "mthi             $zero,            $ac2                            \n\t"
555          "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
556          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
557          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
558          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
559          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
560          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
561
562          /* even 5. pixel */
563          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
564          "mthi             $zero,            $ac3                            \n\t"
565          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
566          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
567          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
568          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
569          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
570
571          /* even 6. pixel */
572          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
573          "mthi             $zero,            $ac1                            \n\t"
574          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
575          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
576          "ulw              %[qload1],        20(%[src])                      \n\t"
577          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
578          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
579          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
580
581          /* even 7. pixel */
582          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
583          "mthi             $zero,            $ac2                            \n\t"
584          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
585          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
586          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
587          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
588          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
589          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
590
591          /* even 8. pixel */
592          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
593          "mthi             $zero,            $ac3                            \n\t"
594          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
595          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
596          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
597          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
598          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
599
600          /* ODD pixels */
601          "ulw              %[qload1],        1(%[src])                       \n\t"
602          "ulw              %[qload2],        5(%[src])                       \n\t"
603
604          /* odd 1. pixel */
605          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
606          "mthi             $zero,            $ac1                            \n\t"
607          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
608          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
609          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
610          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
611          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
612          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
613          "ulw              %[qload2],        9(%[src])                       \n\t"
614          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
615          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
616          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
617
618          /* odd 2. pixel */
619          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
620          "mthi             $zero,            $ac2                            \n\t"
621          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
622          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
623          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
624          "ulw              %[qload1],        13(%[src])                      \n\t"
625          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
626          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
627          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
628
629          /* odd 3. pixel */
630          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
631          "mthi             $zero,            $ac3                            \n\t"
632          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
633          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
634          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
635          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
636          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
637          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
638
639          /* odd 4. pixel */
640          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
641          "mthi             $zero,            $ac1                            \n\t"
642          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
643          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
644          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
645          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
646          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
647          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
648
649          /* odd 5. pixel */
650          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
651          "mthi             $zero,            $ac2                            \n\t"
652          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
653          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
654          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
655          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
656          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
657
658          /* odd 6. pixel */
659          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
660          "mthi             $zero,            $ac3                            \n\t"
661          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
662          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
663          "ulw              %[qload1],        21(%[src])                      \n\t"
664          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
665          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
666          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
667
668          /* odd 7. pixel */
669          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
670          "mthi             $zero,            $ac1                            \n\t"
671          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
672          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
673          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
674          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
675          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
676
677          /* odd 8. pixel */
678          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
679          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
680
681          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
682          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
683          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
684
685          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
686          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
687
688          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
689          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
690
691          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
692
693          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
694            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
695            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
696            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
697            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
698          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
699            [cm] "r" (cm),
700            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
701      );
702
703      src += 16;
704      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
705      odd_dst = (dst + dst_stride);
706    }
707
708    /* Next row... */
709    src_ptr += src_stride;
710    dst_ptr += 1;
711  }
712}
713
714void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
715                                  uint8_t *dst, ptrdiff_t dst_stride,
716                                  const int16_t *filter, int w, int h) {
717  int x, y;
718
719  for (y = 0; y < h; ++y) {
720    for (x = 0; x < w; ++x) {
721      int sum = 0;
722
723      sum += src[x] * filter[3];
724      sum += src[x + 1] * filter[4];
725
726      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
727    }
728
729    src += src_stride;
730    dst += 1;
731  }
732}
733
/*
 * Entry point for the 2-tap (bilinear) horizontal convolve with transposed
 * output. Programs the DSP control register for accumulator extraction,
 * warms the cache for the first source row and then hands the block to the
 * width-specific DSPr2 kernel; any width without a dedicated kernel goes
 * through the plain C implementation.
 *
 *   src, src_stride - source pixels and the distance between source rows
 *   dst, dst_stride - destination and the distance between destination rows
 *   filter          - filter kernel forwarded unchanged to the kernels
 *   w, h            - block width and height in pixels
 */
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter,
                         int w, int h) {
  /* Bit position used when extracting results from the accumulators. */
  uint32_t extract_pos = 38;

  /* Write the extract position into the DSP control register
   * (presumably mask 1 selects only the `pos` field; confirm against the
   * MIPS DSP ASE manual). */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  /* Pull the beginning of the first source row into the cache. */
  prefetch_load(src);
  prefetch_load(src + 32);

  if (w == 4) {
    convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
                                         dst, dst_stride,
                                         filter, h);
  } else if (w == 8) {
    convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
                                         dst, dst_stride,
                                         filter, h);
  } else if (w == 16 || w == 32) {
    /* One kernel serves both widths; the last argument is the number of
     * 16-pixel groups per row. */
    convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
                                          dst, dst_stride,
                                          filter, h,
                                          (w / 16));
  } else if (w == 64) {
    /* NOTE(review): this repeats the prefetch of src + 32 issued above;
     * kept as-is to preserve the existing behaviour. */
    prefetch_load(src + 32);
    convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                          dst, dst_stride,
                                          filter, h);
  } else {
    /* No dedicated assembly kernel for this width. */
    convolve_bi_horiz_transposed(src, src_stride,
                                 dst, dst_stride,
                                 filter, w, h);
  }
}
782#endif
783