1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_config.h"
15#include "./vp9_rtcd.h"
16#include "vp9/common/vp9_common.h"
17#include "vpx/vpx_integer.h"
18#include "vpx_ports/mem.h"
19#include "vp9/common/vp9_filter.h"
20#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21
22#if HAVE_DSPR2
23static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
24                                                 int32_t src_stride,
25                                                 uint8_t *dst,
26                                                 int32_t dst_stride,
27                                                 const int16_t *filter_x0,
28                                                 int32_t h) {
29  int32_t       y;
30  uint8_t       *cm = vp9_ff_cropTbl;
31  uint8_t       *dst_ptr;
32  int32_t       Temp1, Temp2;
33  uint32_t      vector4a = 64;
34  uint32_t      tp1, tp2;
35  uint32_t      p1, p2;
36  const int16_t *filter = &filter_x0[3];
37  uint32_t      filter45;
38
39  filter45 = ((const int32_t *)filter)[0];
40
41  for (y = h; y--;) {
42    dst_ptr = dst;
43    /* prefetch data to cache memory */
44    vp9_prefetch_load(src + src_stride);
45    vp9_prefetch_load(src + src_stride + 32);
46
47    __asm__ __volatile__ (
48        "ulw              %[tp1],         0(%[src])                      \n\t"
49        "ulw              %[tp2],         4(%[src])                      \n\t"
50
51        /* even 1. pixel */
52        "mtlo             %[vector4a],    $ac3                           \n\t"
53        "mthi             $zero,          $ac3                           \n\t"
54        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
55        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
56        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
57        "extp             %[Temp1],       $ac3,           31             \n\t"
58
59        /* even 2. pixel */
60        "mtlo             %[vector4a],    $ac2                           \n\t"
61        "mthi             $zero,          $ac2                           \n\t"
62        "balign           %[tp2],         %[tp1],         3              \n\t"
63        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
64        "extp             %[Temp2],       $ac2,           31             \n\t"
65
66        /* odd 1. pixel */
67        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
68        "mtlo             %[vector4a],    $ac3                           \n\t"
69        "mthi             $zero,          $ac3                           \n\t"
70        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
71        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
72        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
73        "extp             %[Temp1],       $ac3,           31             \n\t"
74
75        /* odd 2. pixel */
76        "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
77        "mtlo             %[vector4a],    $ac2                           \n\t"
78        "mthi             $zero,          $ac2                           \n\t"
79        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
80        "extp             %[Temp2],       $ac2,           31             \n\t"
81
82        /* clamp */
83        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
84        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
85
86        /* store bytes */
87        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
88        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
89
90        "sb               %[p1],          0(%[dst_ptr])                  \n\t"
91        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
92
93        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
94        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
95
96        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
97        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
98
99        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
100          [p1] "=&r" (p1), [p2] "=&r" (p2),
101          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
102          [dst_ptr] "+r" (dst_ptr)
103        : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
104          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
105    );
106
107    /* Next row... */
108    src += src_stride;
109    dst += 1;
110  }
111}
112
113static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
114                                                 int32_t src_stride,
115                                                 uint8_t *dst,
116                                                 int32_t dst_stride,
117                                                 const int16_t *filter_x0,
118                                                 int32_t h) {
119  int32_t y;
120  uint8_t *cm = vp9_ff_cropTbl;
121  uint8_t *dst_ptr;
122  uint32_t vector4a = 64;
123  int32_t Temp1, Temp2, Temp3;
124  uint32_t tp1, tp2, tp3;
125  uint32_t p1, p2, p3, p4;
126  uint8_t *odd_dst;
127  uint32_t dst_pitch_2 = (dst_stride << 1);
128  const int16_t *filter = &filter_x0[3];
129  uint32_t      filter45;
130
131  filter45 = ((const int32_t *)filter)[0];
132
133  for (y = h; y--;) {
134    /* prefetch data to cache memory */
135    vp9_prefetch_load(src + src_stride);
136    vp9_prefetch_load(src + src_stride + 32);
137
138    dst_ptr = dst;
139    odd_dst = (dst_ptr + dst_stride);
140
141    __asm__ __volatile__ (
142        "ulw              %[tp1],         0(%[src])                       \n\t"
143        "ulw              %[tp2],         4(%[src])                       \n\t"
144
145        /* even 1. pixel */
146        "mtlo             %[vector4a],    $ac3                            \n\t"
147        "mthi             $zero,          $ac3                            \n\t"
148        "mtlo             %[vector4a],    $ac2                            \n\t"
149        "mthi             $zero,          $ac2                            \n\t"
150        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
151        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
152        "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
153        "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
154        "ulw              %[tp3],         8(%[src])                       \n\t"
155        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
156        "extp             %[Temp1],       $ac3,           31              \n\t"
157
158        /* even 2. pixel */
159        "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
160        "extp             %[Temp3],       $ac2,           31              \n\t"
161
162        /* even 3. pixel */
163        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
164        "mtlo             %[vector4a],    $ac1                            \n\t"
165        "mthi             $zero,          $ac1                            \n\t"
166        "balign           %[tp3],         %[tp2],         3              \n\t"
167        "balign           %[tp2],         %[tp1],         3              \n\t"
168        "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
169        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
170        "extp             %[p3],          $ac1,           31              \n\t"
171
172        /* even 4. pixel */
173        "mtlo             %[vector4a],    $ac2                            \n\t"
174        "mthi             $zero,          $ac2                            \n\t"
175        "mtlo             %[vector4a],    $ac3                            \n\t"
176        "mthi             $zero,          $ac3                            \n\t"
177        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
178        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
179        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
180        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
181
182        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
183        "extp             %[Temp3],       $ac2,           31              \n\t"
184
185        "lbux             %[Temp1],         %[p3](%[cm])                    \n\t"
186
187        /* odd 1. pixel */
188        "mtlo             %[vector4a],    $ac1                            \n\t"
189        "mthi             $zero,          $ac1                            \n\t"
190        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
191        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
192        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
193        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
194        "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
195        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
196
197        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
198        "extp             %[Temp2],       $ac3,           31              \n\t"
199
200        /* odd 2. pixel */
201        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
202        "mtlo             %[vector4a],    $ac3                            \n\t"
203        "mthi             $zero,          $ac3                            \n\t"
204        "mtlo             %[vector4a],    $ac2                            \n\t"
205        "mthi             $zero,          $ac2                            \n\t"
206        "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
207        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
208        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
209        "extp             %[Temp3],       $ac1,           31              \n\t"
210
211        /* odd 3. pixel */
212        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
213        "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
214        "extp             %[Temp2],       $ac3,           31              \n\t"
215
216        /* odd 4. pixel */
217        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
218        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
219        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
220        "extp             %[Temp1],       $ac2,           31              \n\t"
221
222        /* clamp */
223        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
224        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
225        "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
226
227        /* store bytes */
228        "sb               %[p4],          0(%[odd_dst])                   \n\t"
229        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
230
231        "sb               %[p2],          0(%[odd_dst])                   \n\t"
232        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
233
234        "sb               %[p1],          0(%[odd_dst])                   \n\t"
235
236        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
237          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
238          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
239          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
240        : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
241          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
242    );
243
244    /* Next row... */
245    src += src_stride;
246    dst += 1;
247  }
248}
249
250static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
251                                                  int32_t src_stride,
252                                                  uint8_t *dst_ptr,
253                                                  int32_t dst_stride,
254                                                  const int16_t *filter_x0,
255                                                  int32_t h,
256                                                  int32_t count) {
257  int32_t       c, y;
258  const uint8_t *src;
259  uint8_t       *dst;
260  uint8_t       *cm = vp9_ff_cropTbl;
261  uint32_t      vector_64 = 64;
262  int32_t       Temp1, Temp2, Temp3;
263  uint32_t      qload1, qload2;
264  uint32_t      p1, p2, p3, p4, p5;
265  uint32_t      st1, st2, st3;
266  uint32_t      dst_pitch_2 = (dst_stride << 1);
267  uint8_t       *odd_dst;
268  const int16_t *filter = &filter_x0[3];
269  uint32_t      filter45;
270
271  filter45 = ((const int32_t *)filter)[0];
272
273  for (y = h; y--;) {
274    /* prefetch data to cache memory */
275    vp9_prefetch_load(src_ptr + src_stride);
276    vp9_prefetch_load(src_ptr + src_stride + 32);
277
278    src = src_ptr;
279    dst = dst_ptr;
280
281    odd_dst = (dst + dst_stride);
282
283    for (c = 0; c < count; c++) {
284      __asm__ __volatile__ (
285          "ulw              %[qload1],        0(%[src])                       \n\t"
286          "ulw              %[qload2],        4(%[src])                       \n\t"
287
288          /* even 1. pixel */
289          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
290          "mthi             $zero,            $ac1                            \n\t"
291          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
292          "mthi             $zero,            $ac2                            \n\t"
293          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
294          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
295          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
296          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
297          "ulw              %[qload1],        8(%[src])                       \n\t"
298          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
299          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
300
301          /* even 2. pixel */
302          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
303          "mthi             $zero,            $ac3                            \n\t"
304          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
305          "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
306          "ulw              %[qload2],        12(%[src])                      \n\t"
307          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
308          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
309          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
310
311          /* even 3. pixel */
312          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
313          "mthi             $zero,            $ac1                            \n\t"
314          "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
315          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
316          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
317          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
318          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
319          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
320
321          /* even 4. pixel */
322          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
323          "mthi             $zero,            $ac2                            \n\t"
324          "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
325          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
326          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
327          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
328          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
329          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
330
331          /* even 5. pixel */
332          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
333          "mthi             $zero,            $ac3                            \n\t"
334          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
335          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
336          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
337          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
338          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
339
340          /* even 6. pixel */
341          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
342          "mthi             $zero,            $ac1                            \n\t"
343          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
344          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
345          "ulw              %[qload1],        20(%[src])                      \n\t"
346          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
347          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
348          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
349
350          /* even 7. pixel */
351          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
352          "mthi             $zero,            $ac2                            \n\t"
353          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
354          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
355          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
356          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
357          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
358          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
359
360          /* even 8. pixel */
361          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
362          "mthi             $zero,            $ac3                            \n\t"
363          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
364          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
365          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
366          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
367          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
368
369          /* ODD pixels */
370          "ulw              %[qload1],        1(%[src])                       \n\t"
371          "ulw              %[qload2],        5(%[src])                       \n\t"
372
373          /* odd 1. pixel */
374          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
375          "mthi             $zero,            $ac1                            \n\t"
376          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
377          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
378          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
379          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
380          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
381          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
382          "ulw              %[qload2],        9(%[src])                       \n\t"
383          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
384          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
385          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
386
387          /* odd 2. pixel */
388          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
389          "mthi             $zero,            $ac2                            \n\t"
390          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
391          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
392          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
393          "ulw              %[qload1],        13(%[src])                      \n\t"
394          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
395          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
396          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
397
398          /* odd 3. pixel */
399          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
400          "mthi             $zero,            $ac3                            \n\t"
401          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
402          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
403          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
404          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
405          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
406          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
407
408          /* odd 4. pixel */
409          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
410          "mthi             $zero,            $ac1                            \n\t"
411          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
412          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
413          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
414          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
415          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
416          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
417
418          /* odd 5. pixel */
419          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
420          "mthi             $zero,            $ac2                            \n\t"
421          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
422          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
423          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
424          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
425          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
426
427          /* odd 6. pixel */
428          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
429          "mthi             $zero,            $ac3                            \n\t"
430          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
431          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
432          "ulw              %[qload1],        21(%[src])                      \n\t"
433          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
434          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
435          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
436
437          /* odd 7. pixel */
438          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
439          "mthi             $zero,            $ac1                            \n\t"
440          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
441          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
442          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
443          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
444          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
445
446          /* odd 8. pixel */
447          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
448          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
449
450          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
451          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
452          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
453
454          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
455          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
456
457          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
458          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
459
460          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
461
462          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
463            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
464            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
465            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
466            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
467          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
468            [cm] "r" (cm),
469            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
470      );
471
472      src += 16;
473      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
474      odd_dst = (dst + dst_stride);
475    }
476
477    /* Next row... */
478    src_ptr += src_stride;
479    dst_ptr += 1;
480  }
481}
482
483static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
484                                                  int32_t src_stride,
485                                                  uint8_t *dst_ptr,
486                                                  int32_t dst_stride,
487                                                  const int16_t *filter_x0,
488                                                  int32_t h) {
489  int32_t       c, y;
490  const uint8_t *src;
491  uint8_t       *dst;
492  uint8_t       *cm = vp9_ff_cropTbl;
493  uint32_t      vector_64 = 64;
494  int32_t       Temp1, Temp2, Temp3;
495  uint32_t      qload1, qload2;
496  uint32_t      p1, p2, p3, p4, p5;
497  uint32_t      st1, st2, st3;
498  uint32_t      dst_pitch_2 = (dst_stride << 1);
499  uint8_t       *odd_dst;
500  const int16_t *filter = &filter_x0[3];
501  uint32_t      filter45;
502
503  filter45 = ((const int32_t *)filter)[0];
504
505  for (y = h; y--;) {
506    /* prefetch data to cache memory */
507    vp9_prefetch_load(src_ptr + src_stride);
508    vp9_prefetch_load(src_ptr + src_stride + 32);
509    vp9_prefetch_load(src_ptr + src_stride + 64);
510
511    src = src_ptr;
512    dst = dst_ptr;
513
514    odd_dst = (dst + dst_stride);
515
516    for (c = 0; c < 4; c++) {
517      __asm__ __volatile__ (
518          "ulw              %[qload1],        0(%[src])                       \n\t"
519          "ulw              %[qload2],        4(%[src])                       \n\t"
520
521          /* even 1. pixel */
522          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
523          "mthi             $zero,            $ac1                            \n\t"
524          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
525          "mthi             $zero,            $ac2                            \n\t"
526          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
527          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
528          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
529          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
530          "ulw              %[qload1],        8(%[src])                       \n\t"
531          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
532          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
533
534          /* even 2. pixel */
535          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
536          "mthi             $zero,            $ac3                            \n\t"
537          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
538          "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
539          "ulw              %[qload2],        12(%[src])                      \n\t"
540          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
541          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
542          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
543
544          /* even 3. pixel */
545          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
546          "mthi             $zero,            $ac1                            \n\t"
547          "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
548          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
549          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
550          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
551          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
552          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
553
554          /* even 4. pixel */
555          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
556          "mthi             $zero,            $ac2                            \n\t"
557          "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
558          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
559          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
560          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
561          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
562          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
563
564          /* even 5. pixel */
565          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
566          "mthi             $zero,            $ac3                            \n\t"
567          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
568          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
569          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
570          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
571          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
572
573          /* even 6. pixel */
574          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
575          "mthi             $zero,            $ac1                            \n\t"
576          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
577          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
578          "ulw              %[qload1],        20(%[src])                      \n\t"
579          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
580          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
581          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
582
583          /* even 7. pixel */
584          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
585          "mthi             $zero,            $ac2                            \n\t"
586          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
587          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
588          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
589          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
590          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
591          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
592
593          /* even 8. pixel */
594          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
595          "mthi             $zero,            $ac3                            \n\t"
596          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
597          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
598          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
599          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
600          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
601
602          /* ODD pixels */
603          "ulw              %[qload1],        1(%[src])                       \n\t"
604          "ulw              %[qload2],        5(%[src])                       \n\t"
605
606          /* odd 1. pixel */
607          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
608          "mthi             $zero,            $ac1                            \n\t"
609          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
610          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
611          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
612          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
613          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
614          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
615          "ulw              %[qload2],        9(%[src])                       \n\t"
616          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
617          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
618          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
619
620          /* odd 2. pixel */
621          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
622          "mthi             $zero,            $ac2                            \n\t"
623          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
624          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
625          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
626          "ulw              %[qload1],        13(%[src])                      \n\t"
627          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
628          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
629          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
630
631          /* odd 3. pixel */
632          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
633          "mthi             $zero,            $ac3                            \n\t"
634          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
635          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
636          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
637          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
638          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
639          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
640
641          /* odd 4. pixel */
642          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
643          "mthi             $zero,            $ac1                            \n\t"
644          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
645          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
646          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
647          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
648          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
649          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
650
651          /* odd 5. pixel */
652          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
653          "mthi             $zero,            $ac2                            \n\t"
654          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
655          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
656          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
657          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
658          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
659
660          /* odd 6. pixel */
661          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
662          "mthi             $zero,            $ac3                            \n\t"
663          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
664          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
665          "ulw              %[qload1],        21(%[src])                      \n\t"
666          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
667          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
668          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
669
670          /* odd 7. pixel */
671          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
672          "mthi             $zero,            $ac1                            \n\t"
673          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
674          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
675          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
676          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
677          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
678
679          /* odd 8. pixel */
680          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
681          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
682
683          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
684          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
685          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
686
687          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
688          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
689
690          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
691          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
692
693          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
694
695          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
696            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
697            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
698            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
699            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
700          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
701            [cm] "r" (cm),
702            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
703      );
704
705      src += 16;
706      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
707      odd_dst = (dst + dst_stride);
708    }
709
710    /* Next row... */
711    src_ptr += src_stride;
712    dst_ptr += 1;
713  }
714}
715
716void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
717                                  uint8_t *dst, ptrdiff_t dst_stride,
718                                  const int16_t *filter, int w, int h) {
719  int x, y;
720
721  for (y = 0; y < h; ++y) {
722    for (x = 0; x < w; ++x) {
723      int sum = 0;
724
725      sum += src[x] * filter[3];
726      sum += src[x + 1] * filter[4];
727
728      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
729    }
730
731    src += src_stride;
732    dst += 1;
733  }
734}
735
/* Entry point for the 2-tap horizontal convolution with transposed output.
 *
 * Programs the DSP control register, issues prefetches for the start of the
 * source, and then dispatches on the block width w:
 *   4, 8      -> dedicated 4- and 8-wide DSPr2 kernels
 *   16, 32    -> 16-wide DSPr2 kernel, run w / 16 times per row
 *   64        -> 64-wide DSPr2 kernel
 *   otherwise -> plain C fallback convolve_bi_horiz_transposed()
 *
 * src/src_stride describe the input, dst/dst_stride the (transposed) output,
 * filter is the tap array handed on unchanged to the selected kernel, and h
 * is the number of source rows to process.
 */
void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter,
                         int w, int h) {
  const uint32_t extract_pos = 38;

  /* bit position for extract from acc
   * NOTE(review): presumably consumed by the extp instructions in the
   * width-specific kernels -- confirm against the MIPS DSP ASE manual. */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);

  if (w == 4) {
    convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
                                         dst, dst_stride,
                                         filter, h);
  } else if (w == 8) {
    convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
                                         dst, dst_stride,
                                         filter, h);
  } else if (w == 16 || w == 32) {
    /* The 16-wide kernel takes the number of 16-pixel groups per row. */
    convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
                                          dst, dst_stride,
                                          filter, h,
                                          (w / 16));
  } else if (w == 64) {
    /* Same address as the prefetch above; kept as in the original code. */
    vp9_prefetch_load(src + 32);
    convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                          dst, dst_stride,
                                          filter, h);
  } else {
    /* Any other width goes through the portable C implementation. */
    convolve_bi_horiz_transposed(src, src_stride,
                                 dst, dst_stride,
                                 filter, w, h);
  }
}
784#endif
785