1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/mips/convolve_common_dspr2.h"
16#include "vpx_dsp/vpx_convolve.h"
17#include "vpx_dsp/vpx_dsp_common.h"
18#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
21static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
22                                          int32_t src_stride, uint8_t *dst,
23                                          int32_t dst_stride,
24                                          const int16_t *filter_x0, int32_t h) {
25  int32_t y;
26  uint8_t *cm = vpx_ff_cropTbl;
27  int32_t Temp1, Temp2, Temp3, Temp4;
28  uint32_t vector4a = 64;
29  uint32_t tp1, tp2;
30  uint32_t p1, p2, p3;
31  uint32_t tn1, tn2;
32  const int16_t *filter = &filter_x0[3];
33  uint32_t filter45;
34
35  filter45 = ((const int32_t *)filter)[0];
36
37  for (y = h; y--;) {
38    /* prefetch data to cache memory */
39    prefetch_load(src + src_stride);
40    prefetch_load(src + src_stride + 32);
41    prefetch_store(dst + dst_stride);
42
43    __asm__ __volatile__(
44        "ulw              %[tp1],         0(%[src])                      \n\t"
45        "ulw              %[tp2],         4(%[src])                      \n\t"
46
47        /* even 1. pixel */
48        "mtlo             %[vector4a],    $ac3                           \n\t"
49        "mthi             $zero,          $ac3                           \n\t"
50        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
51        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
52        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
53        "extp             %[Temp1],       $ac3,           31             \n\t"
54
55        /* even 2. pixel */
56        "mtlo             %[vector4a],    $ac2                           \n\t"
57        "mthi             $zero,          $ac2                           \n\t"
58        "balign           %[tp2],         %[tp1],         3              \n\t"
59        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
60        "extp             %[Temp3],       $ac2,           31             \n\t"
61
62        "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */
63
64        /* odd 1. pixel */
65        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
66        "mtlo             %[vector4a],    $ac3                           \n\t"
67        "mthi             $zero,          $ac3                           \n\t"
68        "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
69        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
70        "preceu.ph.qbl    %[p3],          %[tp2]                         \n\t"
71        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
72        "extp             %[Temp2],       $ac3,           31             \n\t"
73
74        "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */
75
76        /* odd 2. pixel */
77        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
78        "mtlo             %[vector4a],    $ac2                           \n\t"
79        "mthi             $zero,          $ac2                           \n\t"
80        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
81        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
82        "dpa.w.ph         $ac2,           %[p3],          %[filter45]    \n\t"
83        "extp             %[Temp4],       $ac2,           31             \n\t"
84
85        "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
86        "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */
87
88        /* clamp */
89        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
90        "lbux             %[p3],          %[Temp4](%[cm])                \n\t" /* odd 2 */
91        "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */
92
93        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
94        "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */
95
96        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t" /* average odd 2 */
97        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */
98
99        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
100          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
101          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
102          [Temp4] "=&r"(Temp4)
103        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
104          [dst] "r"(dst), [src] "r"(src));
105
106    /* Next row... */
107    src += src_stride;
108    dst += dst_stride;
109  }
110}
111
112static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
113                                          int32_t src_stride, uint8_t *dst,
114                                          int32_t dst_stride,
115                                          const int16_t *filter_x0, int32_t h) {
116  int32_t y;
117  uint8_t *cm = vpx_ff_cropTbl;
118  uint32_t vector4a = 64;
119  int32_t Temp1, Temp2, Temp3;
120  uint32_t tp1, tp2, tp3, tp4;
121  uint32_t p1, p2, p3, p4, n1;
122  uint32_t st0, st1;
123  const int16_t *filter = &filter_x0[3];
124  uint32_t filter45;
125
126  filter45 = ((const int32_t *)filter)[0];
127
128  for (y = h; y--;) {
129    /* prefetch data to cache memory */
130    prefetch_load(src + src_stride);
131    prefetch_load(src + src_stride + 32);
132    prefetch_store(dst + dst_stride);
133
134    __asm__ __volatile__(
135        "ulw              %[tp1],         0(%[src])                      \n\t"
136        "ulw              %[tp2],         4(%[src])                      \n\t"
137
138        /* even 1. pixel */
139        "mtlo             %[vector4a],    $ac3                           \n\t"
140        "mthi             $zero,          $ac3                           \n\t"
141        "mtlo             %[vector4a],    $ac2                           \n\t"
142        "mthi             $zero,          $ac2                           \n\t"
143        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
144        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
145        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
146        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
147        "ulw              %[tp3],         8(%[src])                      \n\t"
148        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
149        "extp             %[Temp1],       $ac3,           31             \n\t"
150        "lbu              %[Temp2],       0(%[dst])                      \n\t"
151        "lbu              %[tp4],         2(%[dst])                      \n\t"
152
153        /* even 2. pixel */
154        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
155        "extp             %[Temp3],       $ac2,           31             \n\t"
156
157        /* even 3. pixel */
158        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
159        "mtlo             %[vector4a],    $ac1                           \n\t"
160        "mthi             $zero,          $ac1                           \n\t"
161        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
162        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
163        "extp             %[Temp1],       $ac1,           31             \n\t"
164
165        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
166        "addqh_r.w        %[tp4],         %[tp4],         %[st1]         \n\t"
167        "sb               %[Temp2],       0(%[dst])                      \n\t"
168        "sb               %[tp4],         2(%[dst])                      \n\t"
169
170        /* even 4. pixel */
171        "mtlo             %[vector4a],    $ac2                           \n\t"
172        "mthi             $zero,          $ac2                           \n\t"
173        "mtlo             %[vector4a],    $ac3                           \n\t"
174        "mthi             $zero,          $ac3                           \n\t"
175
176        "balign           %[tp3],         %[tp2],         3              \n\t"
177        "balign           %[tp2],         %[tp1],         3              \n\t"
178
179        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
180        "lbu              %[Temp2],       4(%[dst])                      \n\t"
181        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
182
183        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
184        "extp             %[Temp3],       $ac2,           31             \n\t"
185
186        /* odd 1. pixel */
187        "mtlo             %[vector4a],    $ac1                           \n\t"
188        "mthi             $zero,          $ac1                           \n\t"
189        "sb               %[Temp2],       4(%[dst])                      \n\t"
190        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
191        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
192        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
193        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
194        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
195        "extp             %[Temp2],       $ac3,           31             \n\t"
196
197        "lbu              %[tp1],         6(%[dst])                      \n\t"
198
199        /* odd 2. pixel */
200        "mtlo             %[vector4a],    $ac3                           \n\t"
201        "mthi             $zero,          $ac3                           \n\t"
202        "mtlo             %[vector4a],    $ac2                           \n\t"
203        "mthi             $zero,          $ac2                           \n\t"
204        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
205        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
206        "extp             %[Temp3],       $ac1,           31             \n\t"
207
208        "lbu              %[tp2],         1(%[dst])                      \n\t"
209        "lbu              %[tp3],         3(%[dst])                      \n\t"
210        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
211
212        /* odd 3. pixel */
213        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
214        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
215        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
216        "extp             %[Temp2],       $ac3,           31             \n\t"
217
218        "lbu              %[tp4],         5(%[dst])                      \n\t"
219
220        /* odd 4. pixel */
221        "sb               %[tp2],         1(%[dst])                      \n\t"
222        "sb               %[tp1],         6(%[dst])                      \n\t"
223        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
224        "extp             %[Temp1],       $ac2,           31             \n\t"
225
226        "lbu              %[tp1],         7(%[dst])                      \n\t"
227
228        /* clamp */
229        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
230        "addqh_r.w        %[tp3],         %[tp3],         %[p4]          \n\t"
231
232        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
233        "addqh_r.w        %[tp4],         %[tp4],         %[p2]          \n\t"
234
235        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
236        "addqh_r.w        %[tp1],         %[tp1],         %[p1]          \n\t"
237
238        /* store bytes */
239        "sb               %[tp3],         3(%[dst])                      \n\t"
240        "sb               %[tp4],         5(%[dst])                      \n\t"
241        "sb               %[tp1],         7(%[dst])                      \n\t"
242
243        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
244          [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
245          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
246          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
247        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
248          [dst] "r"(dst), [src] "r"(src));
249
250    /* Next row... */
251    src += src_stride;
252    dst += dst_stride;
253  }
254}
255
256static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
257                                           int32_t src_stride, uint8_t *dst_ptr,
258                                           int32_t dst_stride,
259                                           const int16_t *filter_x0, int32_t h,
260                                           int32_t count) {
261  int32_t y, c;
262  const uint8_t *src;
263  uint8_t *dst;
264  uint8_t *cm = vpx_ff_cropTbl;
265  uint32_t vector_64 = 64;
266  int32_t Temp1, Temp2, Temp3;
267  uint32_t qload1, qload2, qload3;
268  uint32_t p1, p2, p3, p4, p5;
269  uint32_t st1, st2, st3;
270  const int16_t *filter = &filter_x0[3];
271  uint32_t filter45;
272
273  filter45 = ((const int32_t *)filter)[0];
274
275  for (y = h; y--;) {
276    src = src_ptr;
277    dst = dst_ptr;
278
279    /* prefetch data to cache memory */
280    prefetch_load(src_ptr + src_stride);
281    prefetch_load(src_ptr + src_stride + 32);
282    prefetch_store(dst_ptr + dst_stride);
283
284    for (c = 0; c < count; c++) {
285      __asm__ __volatile__(
286          "ulw              %[qload1],    0(%[src])                    \n\t"
287          "ulw              %[qload2],    4(%[src])                    \n\t"
288
289          /* even 1. pixel */
290          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
291          "mthi             $zero,        $ac1                         \n\t"
292          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
293          "mthi             $zero,        $ac2                         \n\t"
294          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
295          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
296          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
297          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
298          "ulw              %[qload3],    8(%[src])                    \n\t"
299          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
300          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
301          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
302
303          /* even 2. pixel */
304          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
305          "mthi             $zero,        $ac3                         \n\t"
306          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
307          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
308          "ulw              %[qload1],    12(%[src])                   \n\t"
309          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
310          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
311          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
312
313          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
314
315          /* even 3. pixel */
316          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
317          "mthi             $zero,        $ac1                         \n\t"
318          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
319          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
320          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
321          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
322          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
323          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
324
325          /* even 4. pixel */
326          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
327          "mthi             $zero,        $ac2                         \n\t"
328          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
329          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
330          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
331          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
332          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
333          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
334          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
335          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
336
337          /* even 5. pixel */
338          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
339          "mthi             $zero,        $ac3                         \n\t"
340          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
341          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
342          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
343          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
344          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
345
346          /* even 6. pixel */
347          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
348          "mthi             $zero,        $ac1                         \n\t"
349          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
350          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
351          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
352          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
353          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
354          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
355
356          /* even 7. pixel */
357          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
358          "mthi             $zero,        $ac2                         \n\t"
359          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
360          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
361          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
362          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
363          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
364          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
365
366          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
367
368          /* even 8. pixel */
369          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
370          "mthi             $zero,        $ac3                         \n\t"
371          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
372          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
373          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
374          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
375          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
376
377          /* ODD pixels */
378          "ulw              %[qload1],    1(%[src])                   \n\t"
379          "ulw              %[qload2],    5(%[src])                    \n\t"
380
381          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
382
383          /* odd 1. pixel */
384          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
385          "mthi             $zero,        $ac1                         \n\t"
386          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
387          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
388          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
389          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
390          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
391          "ulw              %[qload3],    9(%[src])                    \n\t"
392          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
393          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
394          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
395          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
396
397          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
398
399          /* odd 2. pixel */
400          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
401          "mthi             $zero,        $ac2                         \n\t"
402          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
403          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
404          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
405          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
406          "ulw              %[qload1],    13(%[src])                   \n\t"
407          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
408          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
409          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
410          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
411
412          /* odd 3. pixel */
413          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
414          "mthi             $zero,        $ac3                         \n\t"
415          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
416          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
417          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
418          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
419          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
420          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
421
422          /* odd 4. pixel */
423          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
424          "mthi             $zero,        $ac1                         \n\t"
425          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
426          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
427          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
428          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
429          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
430          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
431          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
432
433          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
434
435          /* odd 5. pixel */
436          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
437          "mthi             $zero,        $ac2                         \n\t"
438          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
439          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
440          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
441          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
442          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
443
444          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
445
446          /* odd 6. pixel */
447          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
448          "mthi             $zero,        $ac3                         \n\t"
449          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
450          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
451          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
452          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
453          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
454
455          /* odd 7. pixel */
456          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
457          "mthi             $zero,        $ac1                         \n\t"
458          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
459          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
460          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
461          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
462          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
463
464          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
465
466          /* odd 8. pixel */
467          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
468          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
469
470          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
471
472          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
473          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
474
475          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
476          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
477
478          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
479          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
480
481          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
482          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
483          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
484
485          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
486            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
487            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
488            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
489            [Temp3] "=&r"(Temp3)
490          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
491            [dst] "r"(dst), [src] "r"(src));
492
493      src += 16;
494      dst += 16;
495    }
496
497    /* Next row... */
498    src_ptr += src_stride;
499    dst_ptr += dst_stride;
500  }
501}
502
503static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
504                                           int32_t src_stride, uint8_t *dst_ptr,
505                                           int32_t dst_stride,
506                                           const int16_t *filter_x0,
507                                           int32_t h) {
508  int32_t y, c;
509  const uint8_t *src;
510  uint8_t *dst;
511  uint8_t *cm = vpx_ff_cropTbl;
512  uint32_t vector_64 = 64;
513  int32_t Temp1, Temp2, Temp3;
514  uint32_t qload1, qload2, qload3;
515  uint32_t p1, p2, p3, p4, p5;
516  uint32_t st1, st2, st3;
517  const int16_t *filter = &filter_x0[3];
518  uint32_t filter45;
519
520  filter45 = ((const int32_t *)filter)[0];
521
522  for (y = h; y--;) {
523    src = src_ptr;
524    dst = dst_ptr;
525
526    /* prefetch data to cache memory */
527    prefetch_load(src_ptr + src_stride);
528    prefetch_load(src_ptr + src_stride + 32);
529    prefetch_load(src_ptr + src_stride + 64);
530    prefetch_store(dst_ptr + dst_stride);
531    prefetch_store(dst_ptr + dst_stride + 32);
532
533    for (c = 0; c < 4; c++) {
534      __asm__ __volatile__(
535          "ulw              %[qload1],    0(%[src])                    \n\t"
536          "ulw              %[qload2],    4(%[src])                    \n\t"
537
538          /* even 1. pixel */
539          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
540          "mthi             $zero,        $ac1                         \n\t"
541          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
542          "mthi             $zero,        $ac2                         \n\t"
543          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
544          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
545          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
546          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
547          "ulw              %[qload3],    8(%[src])                    \n\t"
548          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
549          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
550          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
551
552          /* even 2. pixel */
553          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
554          "mthi             $zero,        $ac3                         \n\t"
555          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
556          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
557          "ulw              %[qload1],    12(%[src])                   \n\t"
558          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
559          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
560          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
561
562          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
563
564          /* even 3. pixel */
565          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
566          "mthi             $zero,        $ac1                         \n\t"
567          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
568          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
569          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
570          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
571          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
572          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
573
574          /* even 4. pixel */
575          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
576          "mthi             $zero,        $ac2                         \n\t"
577          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
578          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
579          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
580          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
581          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
582          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
583          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
584          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
585
586          /* even 5. pixel */
587          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
588          "mthi             $zero,        $ac3                         \n\t"
589          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
590          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
591          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
592          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
593          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
594
595          /* even 6. pixel */
596          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
597          "mthi             $zero,        $ac1                         \n\t"
598          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
599          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
600          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
601          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
602          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
603          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
604
605          /* even 7. pixel */
606          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
607          "mthi             $zero,        $ac2                         \n\t"
608          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
609          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
610          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
611          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
612          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
613          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
614
615          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
616
617          /* even 8. pixel */
618          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
619          "mthi             $zero,        $ac3                         \n\t"
620          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
621          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
622          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
623          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
624          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
625
626          /* ODD pixels */
627          "ulw              %[qload1],    1(%[src])                   \n\t"
628          "ulw              %[qload2],    5(%[src])                    \n\t"
629
630          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
631
632          /* odd 1. pixel */
633          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
634          "mthi             $zero,        $ac1                         \n\t"
635          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
636          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
637          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
638          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
639          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
640          "ulw              %[qload3],    9(%[src])                    \n\t"
641          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
642          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
643          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
644          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
645
646          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
647
648          /* odd 2. pixel */
649          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
650          "mthi             $zero,        $ac2                         \n\t"
651          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
652          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
653          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
654          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
655          "ulw              %[qload1],    13(%[src])                   \n\t"
656          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
657          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
658          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
659          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
660
661          /* odd 3. pixel */
662          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
663          "mthi             $zero,        $ac3                         \n\t"
664          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
665          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
666          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
667          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
668          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
669          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
670
671          /* odd 4. pixel */
672          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
673          "mthi             $zero,        $ac1                         \n\t"
674          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
675          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
676          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
677          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
678          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
679          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
680          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
681
682          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
683
684          /* odd 5. pixel */
685          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
686          "mthi             $zero,        $ac2                         \n\t"
687          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
688          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
689          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
690          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
691          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
692
693          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
694
695          /* odd 6. pixel */
696          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
697          "mthi             $zero,        $ac3                         \n\t"
698          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
699          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
700          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
701          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
702          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
703
704          /* odd 7. pixel */
705          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
706          "mthi             $zero,        $ac1                         \n\t"
707          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
708          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
709          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
710          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
711          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
712
713          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
714
715          /* odd 8. pixel */
716          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
717          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
718
719          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
720
721          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
722          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
723
724          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
725          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
726
727          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
728          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
729
730          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
731          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
732          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
733
734          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
735            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
736            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
737            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
738            [Temp3] "=&r"(Temp3)
739          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
740            [dst] "r"(dst), [src] "r"(src));
741
742      src += 16;
743      dst += 16;
744    }
745
746    /* Next row... */
747    src_ptr += src_stride;
748    dst_ptr += dst_stride;
749  }
750}
751
752void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
753                                   uint8_t *dst, ptrdiff_t dst_stride,
754                                   const InterpKernel *filter, int x0_q4,
755                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
756                                   int w, int h) {
757  const int16_t *const filter_x = filter[x0_q4];
758  uint32_t pos = 38;
759
760  assert(x_step_q4 == 16);
761
762  /* bit positon for extract from acc */
763  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
764                       :
765                       : [pos] "r"(pos));
766
767  /* prefetch data to cache memory */
768  prefetch_load(src);
769  prefetch_load(src + 32);
770  prefetch_store(dst);
771
772  switch (w) {
773    case 4:
774      convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
775                                    h);
776      break;
777    case 8:
778      convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
779                                    h);
780      break;
781    case 16:
782      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
783                                     h, 1);
784      break;
785    case 32:
786      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
787                                     h, 2);
788      break;
789    case 64:
790      prefetch_load(src + 64);
791      prefetch_store(dst + 32);
792
793      convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
794                                     h);
795      break;
796    default:
797      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
798                                x_step_q4, y0_q4, y_step_q4, w, h);
799      break;
800  }
801}
802#endif
803