/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
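/* Horizontal 2-tap convolve-and-average for 4-pixel-wide blocks.  The two
 * taps, filter_x0[3] and filter_x0[4], are packed into the single word
 * filter45 so that dpa.w.ph applies both taps in one instruction; each
 * filtered sum is rounded, clamped through the crop table cm, and then
 * averaged with rounding (addqh_r.w) against the byte already in dst. */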
static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3;
  uint32_t tn1, tn2;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */

        /* odd 1. pixel */
        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p3],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */

        /* odd 2. pixel */
        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
        "dpa.w.ph         $ac2,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp4],       $ac2,           31             \n\t"

        "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
        "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */

        /* clamp */
        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
        "lbux             %[p3],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
        "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */

        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
        "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */

        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t"  /* average odd 2 */
        "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

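/* 2-tap convolve-and-average for 8-pixel-wide blocks.  The work for
 * successive output pixels is pipelined across the three DSP accumulators
 * so that dst loads, dot products, clamps and byte stores overlap. */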
static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3, tp4;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "ulw              %[tp3],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"
        "lbu              %[Temp2],       0(%[dst])                      \n\t"
        "lbu              %[tp4],         2(%[dst])                      \n\t"

        /* even 2. pixel */
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac1,           31             \n\t"

        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
        "addqh_r.w        %[tp4],         %[tp4],         %[st1]         \n\t"
        "sb               %[Temp2],       0(%[dst])                      \n\t"
        "sb               %[tp4],         2(%[dst])                      \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"

        "balign           %[tp3],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"

        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
        "lbu              %[Temp2],       4(%[dst])                      \n\t"
        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"

        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "sb               %[Temp2],       4(%[dst])                      \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        "lbu              %[tp1],         6(%[dst])                      \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac1,           31             \n\t"

        "lbu              %[tp2],         1(%[dst])                      \n\t"
        "lbu              %[tp3],         3(%[dst])                      \n\t"
        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        "lbu              %[tp4],         5(%[dst])                      \n\t"

        /* odd 4. pixel */
        "sb               %[tp2],         1(%[dst])                      \n\t"
        "sb               %[tp1],         6(%[dst])                      \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac2,           31             \n\t"

        "lbu              %[tp1],         7(%[dst])                      \n\t"

        /* clamp */
        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
        "addqh_r.w        %[tp3],         %[tp3],         %[p4]          \n\t"

        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
        "addqh_r.w        %[tp4],         %[tp4],         %[p2]          \n\t"

        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
        "addqh_r.w        %[tp1],         %[tp1],         %[p1]          \n\t"

        /* store bytes */
        "sb               %[tp3],         3(%[dst])                      \n\t"
        "sb               %[tp4],         5(%[dst])                      \n\t"
        "sb               %[tp1],         7(%[dst])                      \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

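/* 2-tap convolve-and-average for rows that are a multiple of 16 pixels wide.
 * count selects how many 16-pixel groups make up one row (1 for w == 16,
 * 2 for w == 32); src and dst advance by 16 bytes per group in the inner
 * loop and by one stride per row in the outer loop. */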
static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride,
                                           uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h,
                                           int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],    0(%[src])                    \n\t"
          "ulw              %[qload2],    4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
          "mthi             $zero,        $ac1                         \n\t"
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "ulw              %[qload3],    8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "ulw              %[qload1],    12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */

          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
          "mthi             $zero,        $ac1                         \n\t"
          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
          "mthi             $zero,        $ac2                         \n\t"
          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
          "mthi             $zero,        $ac3                         \n\t"
          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
          "mthi             $zero,        $ac1                         \n\t"
          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
          "mthi             $zero,        $ac2                         \n\t"
          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */

          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                         \n\t"
          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])                    \n\t"
          "ulw              %[qload2],    5(%[src])                    \n\t"

          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */

          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                         \n\t"
          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                         \n\t"
          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                         \n\t"
          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */

          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                         \n\t"
          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */

          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                         \n\t"
          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                         \n\t"
          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */

          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */

          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */

          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */

          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

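/* 2-tap convolve-and-average for 64-pixel-wide rows: four 16-pixel groups
 * per row, with an extra prefetch pair to cover the wider span.  The inner
 * assembly block matches the 16-wide version. */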
static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride,
                                           uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],    0(%[src])                    \n\t"
          "ulw              %[qload2],    4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
          "mthi             $zero,        $ac1                         \n\t"
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "ulw              %[qload3],    8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "ulw              %[qload1],    12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */

          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
          "mthi             $zero,        $ac1                         \n\t"
          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
          "mthi             $zero,        $ac2                         \n\t"
          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
          "mthi             $zero,        $ac3                         \n\t"
          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
          "mthi             $zero,        $ac1                         \n\t"
          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
          "mthi             $zero,        $ac2                         \n\t"
          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */

          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                         \n\t"
          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])                    \n\t"
          "ulw              %[qload2],    5(%[src])                    \n\t"

          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */

          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                         \n\t"
          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                         \n\t"
          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                         \n\t"
          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */

          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                         \n\t"
          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */

          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                         \n\t"
          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                         \n\t"
          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */

          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */

          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */

          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */

          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

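/* Public entry point for the 2-tap horizontal convolve-and-average.  Only
 * unscaled filtering (x_step_q4 == 16) is handled here; wrdsp sets the
 * accumulator extract position used by the extp instructions above, and any
 * width without a specialized path falls back to vpx_convolve8_avg_horiz_c. */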
void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  uint32_t pos = 38;

  assert(x_step_q4 == 16);

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_avg_horiz_4_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h);
      break;
    case 8:
      convolve_bi_avg_horiz_8_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h);
      break;
    case 16:
      convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h, 1);
      break;
    case 32:
      convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_avg_horiz_64_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h);
      break;
    default:
      vpx_convolve8_avg_horiz_c(src, src_stride,
                                dst, dst_stride,
                                filter_x, x_step_q4,
                                filter_y, y_step_q4,
                                w, h);
      break;
  }
}
#endif