1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
#include <assert.h>
#include <stdio.h>
#include <string.h>
13
14#include "./vpx_config.h"
15#include "./vp9_rtcd.h"
16#include "vp9/common/vp9_common.h"
17#include "vpx/vpx_integer.h"
18#include "vpx_ports/mem.h"
19#include "vp9/common/vp9_convolve.h"
20#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21
22#if HAVE_DSPR2
23static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
24                                          int32_t src_stride,
25                                          uint8_t *dst,
26                                          int32_t dst_stride,
27                                          const int16_t *filter_x0,
28                                          int32_t h) {
29  int32_t y;
30  uint8_t *cm = vp9_ff_cropTbl;
31  int32_t  Temp1, Temp2, Temp3, Temp4;
32  uint32_t vector4a = 64;
33  uint32_t tp1, tp2;
34  uint32_t p1, p2, p3;
35  uint32_t tn1, tn2;
36  const int16_t *filter = &filter_x0[3];
37  uint32_t      filter45;
38
39  filter45 = ((const int32_t *)filter)[0];
40
41  for (y = h; y--;) {
42    /* prefetch data to cache memory */
43    vp9_prefetch_load(src + src_stride);
44    vp9_prefetch_load(src + src_stride + 32);
45    vp9_prefetch_store(dst + dst_stride);
46
47    __asm__ __volatile__ (
48        "ulw              %[tp1],         0(%[src])                      \n\t"
49        "ulw              %[tp2],         4(%[src])                      \n\t"
50
51        /* even 1. pixel */
52        "mtlo             %[vector4a],    $ac3                           \n\t"
53        "mthi             $zero,          $ac3                           \n\t"
54        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
55        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
56        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
57        "extp             %[Temp1],       $ac3,           31             \n\t"
58
59        /* even 2. pixel */
60        "mtlo             %[vector4a],    $ac2                           \n\t"
61        "mthi             $zero,          $ac2                           \n\t"
62        "balign           %[tp2],         %[tp1],         3              \n\t"
63        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
64        "extp             %[Temp3],       $ac2,           31             \n\t"
65
66        "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */
67
68        /* odd 1. pixel */
69        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
70        "mtlo             %[vector4a],    $ac3                           \n\t"
71        "mthi             $zero,          $ac3                           \n\t"
72        "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
73        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
74        "preceu.ph.qbl    %[p3],          %[tp2]                         \n\t"
75        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
76        "extp             %[Temp2],       $ac3,           31             \n\t"
77
78        "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */
79
80        /* odd 2. pixel */
81        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
82        "mtlo             %[vector4a],    $ac2                           \n\t"
83        "mthi             $zero,          $ac2                           \n\t"
84        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
85        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
86        "dpa.w.ph         $ac2,           %[p3],          %[filter45]    \n\t"
87        "extp             %[Temp4],       $ac2,           31             \n\t"
88
89        "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
90        "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */
91
92        /* clamp */
93        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
94        "lbux             %[p3],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
95        "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */
96
97        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
98        "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */
99
100        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t"  /* average odd 2 */
101        "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */
102
103        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
104          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
105          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
106          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
107          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
108        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
109          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
110    );
111
112    /* Next row... */
113    src += src_stride;
114    dst += dst_stride;
115  }
116}
117
118static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
119                                         int32_t src_stride,
120                                         uint8_t *dst,
121                                         int32_t dst_stride,
122                                         const int16_t *filter_x0,
123                                         int32_t h) {
124  int32_t y;
125  uint8_t *cm = vp9_ff_cropTbl;
126  uint32_t vector4a = 64;
127  int32_t Temp1, Temp2, Temp3;
128  uint32_t tp1, tp2, tp3, tp4;
129  uint32_t p1, p2, p3, p4, n1;
130  uint32_t st0, st1;
131  const int16_t *filter = &filter_x0[3];
132  uint32_t filter45;;
133
134  filter45 = ((const int32_t *)filter)[0];
135
136  for (y = h; y--;) {
137    /* prefetch data to cache memory */
138    vp9_prefetch_load(src + src_stride);
139    vp9_prefetch_load(src + src_stride + 32);
140    vp9_prefetch_store(dst + dst_stride);
141
142    __asm__ __volatile__ (
143        "ulw              %[tp1],         0(%[src])                      \n\t"
144        "ulw              %[tp2],         4(%[src])                      \n\t"
145
146        /* even 1. pixel */
147        "mtlo             %[vector4a],    $ac3                           \n\t"
148        "mthi             $zero,          $ac3                           \n\t"
149        "mtlo             %[vector4a],    $ac2                           \n\t"
150        "mthi             $zero,          $ac2                           \n\t"
151        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
152        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
153        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
154        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
155        "ulw              %[tp3],         8(%[src])                      \n\t"
156        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
157        "extp             %[Temp1],       $ac3,           31             \n\t"
158        "lbu              %[Temp2],       0(%[dst])                      \n\t"
159        "lbu              %[tp4],         2(%[dst])                      \n\t"
160
161        /* even 2. pixel */
162        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
163        "extp             %[Temp3],       $ac2,           31             \n\t"
164
165        /* even 3. pixel */
166        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
167        "mtlo             %[vector4a],    $ac1                           \n\t"
168        "mthi             $zero,          $ac1                           \n\t"
169        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
170        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
171        "extp             %[Temp1],       $ac1,           31             \n\t"
172
173        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
174        "addqh_r.w        %[tp4],         %[tp4],         %[st1]         \n\t"
175        "sb               %[Temp2],       0(%[dst])                      \n\t"
176        "sb               %[tp4],         2(%[dst])                      \n\t"
177
178        /* even 4. pixel */
179        "mtlo             %[vector4a],    $ac2                           \n\t"
180        "mthi             $zero,          $ac2                           \n\t"
181        "mtlo             %[vector4a],    $ac3                           \n\t"
182        "mthi             $zero,          $ac3                           \n\t"
183
184        "balign           %[tp3],         %[tp2],         3              \n\t"
185        "balign           %[tp2],         %[tp1],         3              \n\t"
186
187        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
188        "lbu              %[Temp2],       4(%[dst])                      \n\t"
189        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
190
191        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
192        "extp             %[Temp3],       $ac2,           31             \n\t"
193
194        /* odd 1. pixel */
195        "mtlo             %[vector4a],    $ac1                           \n\t"
196        "mthi             $zero,          $ac1                           \n\t"
197        "sb               %[Temp2],       4(%[dst])                      \n\t"
198        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
199        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
200        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
201        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
202        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
203        "extp             %[Temp2],       $ac3,           31             \n\t"
204
205        "lbu              %[tp1],         6(%[dst])                      \n\t"
206
207        /* odd 2. pixel */
208        "mtlo             %[vector4a],    $ac3                           \n\t"
209        "mthi             $zero,          $ac3                           \n\t"
210        "mtlo             %[vector4a],    $ac2                           \n\t"
211        "mthi             $zero,          $ac2                           \n\t"
212        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
213        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
214        "extp             %[Temp3],       $ac1,           31             \n\t"
215
216        "lbu              %[tp2],         1(%[dst])                      \n\t"
217        "lbu              %[tp3],         3(%[dst])                      \n\t"
218        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
219
220        /* odd 3. pixel */
221        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
222        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
223        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
224        "extp             %[Temp2],       $ac3,           31             \n\t"
225
226        "lbu              %[tp4],         5(%[dst])                      \n\t"
227
228        /* odd 4. pixel */
229        "sb               %[tp2],         1(%[dst])                      \n\t"
230        "sb               %[tp1],         6(%[dst])                      \n\t"
231        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
232        "extp             %[Temp1],       $ac2,           31             \n\t"
233
234        "lbu              %[tp1],         7(%[dst])                      \n\t"
235
236        /* clamp */
237        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
238        "addqh_r.w        %[tp3],         %[tp3],         %[p4]          \n\t"
239
240        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
241        "addqh_r.w        %[tp4],         %[tp4],         %[p2]          \n\t"
242
243        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
244        "addqh_r.w        %[tp1],         %[tp1],         %[p1]          \n\t"
245
246        /* store bytes */
247        "sb               %[tp3],         3(%[dst])                      \n\t"
248        "sb               %[tp4],         5(%[dst])                      \n\t"
249        "sb               %[tp1],         7(%[dst])                      \n\t"
250
251        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
252          [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
253          [st0] "=&r" (st0), [st1] "=&r" (st1),
254          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
255          [n1] "=&r" (n1),
256          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
257        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
258          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
259    );
260
261    /* Next row... */
262    src += src_stride;
263    dst += dst_stride;
264  }
265}
266
267static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
268                                          int32_t src_stride,
269                                          uint8_t *dst_ptr,
270                                          int32_t dst_stride,
271                                          const int16_t *filter_x0,
272                                          int32_t h,
273                                          int32_t count) {
274  int32_t y, c;
275  const uint8_t *src;
276  uint8_t *dst;
277  uint8_t *cm = vp9_ff_cropTbl;
278  uint32_t vector_64 = 64;
279  int32_t Temp1, Temp2, Temp3;
280  uint32_t qload1, qload2, qload3;
281  uint32_t p1, p2, p3, p4, p5;
282  uint32_t st1, st2, st3;
283  const int16_t *filter = &filter_x0[3];
284  uint32_t filter45;;
285
286  filter45 = ((const int32_t *)filter)[0];
287
288  for (y = h; y--;) {
289    src = src_ptr;
290    dst = dst_ptr;
291
292    /* prefetch data to cache memory */
293    vp9_prefetch_load(src_ptr + src_stride);
294    vp9_prefetch_load(src_ptr + src_stride + 32);
295    vp9_prefetch_store(dst_ptr + dst_stride);
296
297    for (c = 0; c < count; c++) {
298      __asm__ __volatile__ (
299          "ulw              %[qload1],    0(%[src])                    \n\t"
300          "ulw              %[qload2],    4(%[src])                    \n\t"
301
302          /* even 1. pixel */
303          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
304          "mthi             $zero,        $ac1                         \n\t"
305          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
306          "mthi             $zero,        $ac2                         \n\t"
307          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
308          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
309          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
310          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
311          "ulw              %[qload3],    8(%[src])                    \n\t"
312          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
313          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
314          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
315
316          /* even 2. pixel */
317          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
318          "mthi             $zero,        $ac3                         \n\t"
319          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
320          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
321          "ulw              %[qload1],    12(%[src])                   \n\t"
322          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
323          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
324          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
325
326          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
327
328          /* even 3. pixel */
329          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
330          "mthi             $zero,        $ac1                         \n\t"
331          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
332          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
333          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
334          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
335          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
336          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
337
338          /* even 4. pixel */
339          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
340          "mthi             $zero,        $ac2                         \n\t"
341          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
342          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
343          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
344          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
345          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
346          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
347          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
348          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
349
350          /* even 5. pixel */
351          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
352          "mthi             $zero,        $ac3                         \n\t"
353          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
354          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
355          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
356          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
357          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
358
359          /* even 6. pixel */
360          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
361          "mthi             $zero,        $ac1                         \n\t"
362          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
363          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
364          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
365          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
366          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
367          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
368
369          /* even 7. pixel */
370          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
371          "mthi             $zero,        $ac2                         \n\t"
372          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
373          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
374          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
375          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
376          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
377          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
378
379          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
380
381          /* even 8. pixel */
382          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
383          "mthi             $zero,        $ac3                         \n\t"
384          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
385          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
386          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
387          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
388          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
389
390          /* ODD pixels */
391          "ulw              %[qload1],    1(%[src])                   \n\t"
392          "ulw              %[qload2],    5(%[src])                    \n\t"
393
394          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
395
396          /* odd 1. pixel */
397          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
398          "mthi             $zero,        $ac1                         \n\t"
399          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
400          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
401          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
402          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
403          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
404          "ulw              %[qload3],    9(%[src])                    \n\t"
405          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
406          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
407          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
408          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
409
410          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
411
412          /* odd 2. pixel */
413          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
414          "mthi             $zero,        $ac2                         \n\t"
415          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
416          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
417          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
418          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
419          "ulw              %[qload1],    13(%[src])                   \n\t"
420          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
421          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
422          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
423          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
424
425          /* odd 3. pixel */
426          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
427          "mthi             $zero,        $ac3                         \n\t"
428          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
429          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
430          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
431          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
432          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
433          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
434
435          /* odd 4. pixel */
436          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
437          "mthi             $zero,        $ac1                         \n\t"
438          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
439          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
440          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
441          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
442          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
443          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
444          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
445
446          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
447
448          /* odd 5. pixel */
449          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
450          "mthi             $zero,        $ac2                         \n\t"
451          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
452          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
453          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
454          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
455          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
456
457          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
458
459          /* odd 6. pixel */
460          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
461          "mthi             $zero,        $ac3                         \n\t"
462          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
463          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
464          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
465          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
466          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
467
468          /* odd 7. pixel */
469          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
470          "mthi             $zero,        $ac1                         \n\t"
471          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
472          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
473          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
474          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
475          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
476
477          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
478
479          /* odd 8. pixel */
480          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
481          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
482
483          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
484
485          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
486          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
487
488          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
489          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
490
491          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
492          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
493
494          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
495          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
496          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
497
498          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
499            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
500            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
501            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
502            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
503          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
504            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
505      );
506
507      src += 16;
508      dst += 16;
509    }
510
511    /* Next row... */
512    src_ptr += src_stride;
513    dst_ptr += dst_stride;
514  }
515}
516
517static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
518                                          int32_t src_stride,
519                                          uint8_t *dst_ptr,
520                                          int32_t dst_stride,
521                                          const int16_t *filter_x0,
522                                          int32_t h) {
523  int32_t y, c;
524  const uint8_t *src;
525  uint8_t *dst;
526  uint8_t *cm = vp9_ff_cropTbl;
527  uint32_t vector_64 = 64;
528  int32_t Temp1, Temp2, Temp3;
529  uint32_t qload1, qload2, qload3;
530  uint32_t p1, p2, p3, p4, p5;
531  uint32_t st1, st2, st3;
532  const int16_t *filter = &filter_x0[3];
533  uint32_t filter45;;
534
535  filter45 = ((const int32_t *)filter)[0];
536
537  for (y = h; y--;) {
538    src = src_ptr;
539    dst = dst_ptr;
540
541    /* prefetch data to cache memory */
542    vp9_prefetch_load(src_ptr + src_stride);
543    vp9_prefetch_load(src_ptr + src_stride + 32);
544    vp9_prefetch_load(src_ptr + src_stride + 64);
545    vp9_prefetch_store(dst_ptr + dst_stride);
546    vp9_prefetch_store(dst_ptr + dst_stride + 32);
547
548    for (c = 0; c < 4; c++) {
549      __asm__ __volatile__ (
550          "ulw              %[qload1],    0(%[src])                    \n\t"
551          "ulw              %[qload2],    4(%[src])                    \n\t"
552
553          /* even 1. pixel */
554          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
555          "mthi             $zero,        $ac1                         \n\t"
556          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
557          "mthi             $zero,        $ac2                         \n\t"
558          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
559          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
560          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
561          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
562          "ulw              %[qload3],    8(%[src])                    \n\t"
563          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
564          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
565          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
566
567          /* even 2. pixel */
568          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
569          "mthi             $zero,        $ac3                         \n\t"
570          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
571          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
572          "ulw              %[qload1],    12(%[src])                   \n\t"
573          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
574          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
575          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
576
577          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
578
579          /* even 3. pixel */
580          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
581          "mthi             $zero,        $ac1                         \n\t"
582          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
583          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
584          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
585          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
586          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
587          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
588
589          /* even 4. pixel */
590          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
591          "mthi             $zero,        $ac2                         \n\t"
592          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
593          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
594          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
595          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
596          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
597          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
598          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
599          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
600
601          /* even 5. pixel */
602          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
603          "mthi             $zero,        $ac3                         \n\t"
604          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
605          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
606          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
607          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
608          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
609
610          /* even 6. pixel */
611          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
612          "mthi             $zero,        $ac1                         \n\t"
613          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
614          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
615          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
616          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
617          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
618          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
619
620          /* even 7. pixel */
621          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
622          "mthi             $zero,        $ac2                         \n\t"
623          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
624          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
625          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
626          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
627          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
628          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
629
630          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
631
632          /* even 8. pixel */
633          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
634          "mthi             $zero,        $ac3                         \n\t"
635          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
636          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
637          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
638          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
639          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
640
641          /* ODD pixels */
642          "ulw              %[qload1],    1(%[src])                   \n\t"
643          "ulw              %[qload2],    5(%[src])                    \n\t"
644
645          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
646
647          /* odd 1. pixel */
648          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
649          "mthi             $zero,        $ac1                         \n\t"
650          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
651          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
652          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
653          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
654          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
655          "ulw              %[qload3],    9(%[src])                    \n\t"
656          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
657          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
658          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
659          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
660
661          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
662
663          /* odd 2. pixel */
664          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
665          "mthi             $zero,        $ac2                         \n\t"
666          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
667          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
668          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
669          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
670          "ulw              %[qload1],    13(%[src])                   \n\t"
671          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
672          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
673          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
674          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
675
676          /* odd 3. pixel */
677          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
678          "mthi             $zero,        $ac3                         \n\t"
679          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
680          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
681          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
682          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
683          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
684          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
685
686          /* odd 4. pixel */
687          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
688          "mthi             $zero,        $ac1                         \n\t"
689          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
690          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
691          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
692          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
693          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
694          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
695          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
696
697          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
698
699          /* odd 5. pixel */
700          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
701          "mthi             $zero,        $ac2                         \n\t"
702          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
703          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
704          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
705          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
706          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
707
708          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
709
710          /* odd 6. pixel */
711          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
712          "mthi             $zero,        $ac3                         \n\t"
713          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
714          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
715          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
716          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
717          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
718
719          /* odd 7. pixel */
720          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
721          "mthi             $zero,        $ac1                         \n\t"
722          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
723          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
724          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
725          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
726          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
727
728          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
729
730          /* odd 8. pixel */
731          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
732          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
733
734          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
735
736          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
737          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
738
739          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
740          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
741
742          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
743          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
744
745          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
746          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
747          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
748
749          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
750            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
751            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
752            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
753            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
754          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
755            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
756      );
757
758      src += 16;
759      dst += 16;
760    }
761
762    /* Next row... */
763    src_ptr += src_stride;
764    dst_ptr += dst_stride;
765  }
766}
767
/* Entry point for the horizontal 2-tap averaging convolution (DSPr2 build).
 *
 * Unscaled requests (x_step_q4 == 16) with a block width of 4, 8, 16, 32 or
 * 64 are routed to the width-specific DSPr2 kernels in this file.  Every
 * other combination is handed to vp9_convolve8_avg_horiz_c().  filter_y and
 * y_step_q4 are not used by the DSPr2 kernels; they are only forwarded to
 * the C fallback.
 */
void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  /* value written to the DSP control register for the extp instructions */
  uint32_t extract_pos = 38;

  /* Scaled convolution is not handled by the DSPr2 kernels. */
  if (x_step_q4 != 16) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
    return;
  }

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  /* warm the cache with the first row of source and destination */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride,
                                    filter_x, h);
      break;

    case 8:
      convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride,
                                    filter_x, h);
      break;

    case 16:
    case 32:
      /* last argument: 1 for a 16-wide block, 2 for a 32-wide block */
      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h, w / 16);
      break;

    case 64:
      /* a 64-wide row spans further than the prefetches issued above */
      vp9_prefetch_load(src + 64);
      vp9_prefetch_store(dst + 32);

      convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h);
      break;

    default:
      /* unsupported width: use the generic C implementation */
      vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                                filter_x, x_step_q4, filter_y, y_step_q4,
                                w, h);
      break;
  }
}
833#endif
834