/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_config.h"
15#include "./vp9_rtcd.h"
16#include "vp9/common/vp9_common.h"
17#include "vpx/vpx_integer.h"
18#include "vpx_ports/mem.h"
19#include "vp9/common/vp9_convolve.h"
20#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21
22#if HAVE_DSPR2
23static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
24                                       int32_t src_stride,
25                                       uint8_t *dst,
26                                       int32_t dst_stride,
27                                       const int16_t *filter_x0,
28                                       int32_t h) {
29  int32_t y;
30  uint8_t *cm = vp9_ff_cropTbl;
31  int32_t  vector1b, vector2b, vector3b, vector4b;
32  int32_t  Temp1, Temp2, Temp3, Temp4;
33  uint32_t vector4a = 64;
34  uint32_t tp1, tp2;
35  uint32_t p1, p2, p3, p4;
36  uint32_t n1, n2, n3, n4;
37  uint32_t tn1, tn2;
38
39  vector1b = ((const int32_t *)filter_x0)[0];
40  vector2b = ((const int32_t *)filter_x0)[1];
41  vector3b = ((const int32_t *)filter_x0)[2];
42  vector4b = ((const int32_t *)filter_x0)[3];
43
44  for (y = h; y--;) {
45    /* prefetch data to cache memory */
46    vp9_prefetch_load(src + src_stride);
47    vp9_prefetch_load(src + src_stride + 32);
48    vp9_prefetch_store(dst + dst_stride);
49
50    __asm__ __volatile__ (
51        "ulw              %[tp1],         0(%[src])                      \n\t"
52        "ulw              %[tp2],         4(%[src])                      \n\t"
53
54        /* even 1. pixel */
55        "mtlo             %[vector4a],    $ac3                           \n\t"
56        "mthi             $zero,          $ac3                           \n\t"
57        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
58        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
59        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
60        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
61        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
62        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
63        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
64        "ulw              %[tn2],         8(%[src])                      \n\t"
65        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
66        "extp             %[Temp1],       $ac3,           31             \n\t"
67
68        /* even 2. pixel */
69        "mtlo             %[vector4a],    $ac2                           \n\t"
70        "mthi             $zero,          $ac2                           \n\t"
71        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
72        "balign           %[tn1],         %[tn2],         3              \n\t"
73        "balign           %[tn2],         %[tp2],         3              \n\t"
74        "balign           %[tp2],         %[tp1],         3              \n\t"
75        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
76        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
77        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
78        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
79        "extp             %[Temp3],       $ac2,           31             \n\t"
80
81        "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */
82
83        /* odd 1. pixel */
84        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
85        "mtlo             %[vector4a],    $ac3                           \n\t"
86        "mthi             $zero,          $ac3                           \n\t"
87        "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
88        "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
89        "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
90        "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
91        "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
92        "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
93        "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
94        "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
95        "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
96        "extp             %[Temp2],       $ac3,           31             \n\t"
97
98        "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */
99
100        /* odd 2. pixel */
101        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
102        "mtlo             %[vector4a],    $ac2                           \n\t"
103        "mthi             $zero,          $ac2                           \n\t"
104        "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
105        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
106        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
107        "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
108        "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
109        "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
110        "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
111        "extp             %[Temp4],       $ac2,           31             \n\t"
112
113        "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
114        "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */
115
116        /* clamp */
117        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
118        "lbux             %[n2],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
119        "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */
120
121        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
122        "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */
123
124        "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t"  /* average odd 2 */
125        "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */
126
127        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
128          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
129          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
130          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
131          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
132          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
133        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
134          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
135          [vector4a] "r" (vector4a),
136          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
137    );
138
139    /* Next row... */
140    src += src_stride;
141    dst += dst_stride;
142  }
143}
144
145static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
146                                       int32_t src_stride,
147                                       uint8_t *dst,
148                                       int32_t dst_stride,
149                                       const int16_t *filter_x0,
150                                       int32_t h) {
151  int32_t y;
152  uint8_t *cm = vp9_ff_cropTbl;
153  uint32_t vector4a = 64;
154  int32_t vector1b, vector2b, vector3b, vector4b;
155  int32_t Temp1, Temp2, Temp3;
156  uint32_t tp1, tp2;
157  uint32_t p1, p2, p3, p4, n1;
158  uint32_t tn1, tn2, tn3;
159  uint32_t st0, st1;
160
161  vector1b = ((const int32_t *)filter_x0)[0];
162  vector2b = ((const int32_t *)filter_x0)[1];
163  vector3b = ((const int32_t *)filter_x0)[2];
164  vector4b = ((const int32_t *)filter_x0)[3];
165
166  for (y = h; y--;) {
167    /* prefetch data to cache memory */
168    vp9_prefetch_load(src + src_stride);
169    vp9_prefetch_load(src + src_stride + 32);
170    vp9_prefetch_store(dst + dst_stride);
171
172    __asm__ __volatile__ (
173        "ulw              %[tp1],         0(%[src])                      \n\t"
174        "ulw              %[tp2],         4(%[src])                      \n\t"
175
176        /* even 1. pixel */
177        "mtlo             %[vector4a],    $ac3                           \n\t"
178        "mthi             $zero,          $ac3                           \n\t"
179        "mtlo             %[vector4a],    $ac2                           \n\t"
180        "mthi             $zero,          $ac2                           \n\t"
181        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
182        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
183        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
184        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
185        "ulw              %[tn2],         8(%[src])                      \n\t"
186        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
187        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
188        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
189        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
190        "extp             %[Temp1],       $ac3,           31             \n\t"
191        "lbu              %[Temp2],       0(%[dst])                      \n\t"
192        "lbu              %[tn3],         2(%[dst])                      \n\t"
193
194        /* even 2. pixel */
195        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
196        "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
197        "ulw              %[tn1],         12(%[src])                     \n\t"
198        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
199        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
200        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
201        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
202        "extp             %[Temp3],       $ac2,           31             \n\t"
203
204        /* even 3. pixel */
205        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
206        "mtlo             %[vector4a],    $ac1                           \n\t"
207        "mthi             $zero,          $ac1                           \n\t"
208        "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
209        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
210        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
211        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
212        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
213        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
214        "extp             %[Temp1],       $ac1,           31             \n\t"
215
216        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
217        "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
218        "sb               %[Temp2],       0(%[dst])                      \n\t"
219        "sb               %[tn3],         2(%[dst])                      \n\t"
220
221        /* even 4. pixel */
222        "mtlo             %[vector4a],    $ac2                           \n\t"
223        "mthi             $zero,          $ac2                           \n\t"
224        "mtlo             %[vector4a],    $ac3                           \n\t"
225        "mthi             $zero,          $ac3                           \n\t"
226
227        "balign           %[tn3],         %[tn1],         3              \n\t"
228        "balign           %[tn1],         %[tn2],         3              \n\t"
229        "balign           %[tn2],         %[tp2],         3              \n\t"
230        "balign           %[tp2],         %[tp1],         3              \n\t"
231
232        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
233        "lbu              %[Temp2],       4(%[dst])                      \n\t"
234        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
235
236        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
237        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
238        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
239        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
240        "extp             %[Temp3],       $ac2,           31             \n\t"
241
242        /* odd 1. pixel */
243        "mtlo             %[vector4a],    $ac1                           \n\t"
244        "mthi             $zero,          $ac1                           \n\t"
245        "sb               %[Temp2],       4(%[dst])                      \n\t"
246        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
247        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
248        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
249        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
250        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
251        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
252        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
253        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
254        "extp             %[Temp2],       $ac3,           31             \n\t"
255
256        "lbu              %[tp1],         6(%[dst])                      \n\t"
257
258        /* odd 2. pixel */
259        "mtlo             %[vector4a],    $ac3                           \n\t"
260        "mthi             $zero,          $ac3                           \n\t"
261        "mtlo             %[vector4a],    $ac2                           \n\t"
262        "mthi             $zero,          $ac2                           \n\t"
263        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
264        "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
265        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
266        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
267        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
268        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
269        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
270        "extp             %[Temp3],       $ac1,           31             \n\t"
271
272        "lbu              %[tp2],         1(%[dst])                      \n\t"
273        "lbu              %[tn2],         3(%[dst])                      \n\t"
274        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
275
276        /* odd 3. pixel */
277        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
278        "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
279        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
280        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
281        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
282        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
283        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
284        "extp             %[Temp2],       $ac3,           31             \n\t"
285
286        "lbu              %[tn3],         5(%[dst])                      \n\t"
287
288        /* odd 4. pixel */
289        "sb               %[tp2],         1(%[dst])                      \n\t"
290        "sb               %[tp1],         6(%[dst])                      \n\t"
291        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
292        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
293        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
294        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
295        "extp             %[Temp1],       $ac2,           31             \n\t"
296
297        "lbu              %[tn1],         7(%[dst])                      \n\t"
298
299        /* clamp */
300        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
301        "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
302
303        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
304        "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
305
306        "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
307        "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
308
309        /* store bytes */
310        "sb               %[tn2],         3(%[dst])                      \n\t"
311        "sb               %[tn3],         5(%[dst])                      \n\t"
312        "sb               %[tn1],         7(%[dst])                      \n\t"
313
314        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
315          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
316          [st0] "=&r" (st0), [st1] "=&r" (st1),
317          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
318          [n1] "=&r" (n1),
319          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
320        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
321          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
322          [vector4a] "r" (vector4a),
323          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
324    );
325
326    /* Next row... */
327    src += src_stride;
328    dst += dst_stride;
329  }
330}
331
332static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
333                                        int32_t src_stride,
334                                        uint8_t *dst_ptr,
335                                        int32_t dst_stride,
336                                        const int16_t *filter_x0,
337                                        int32_t h,
338                                        int32_t count) {
339  int32_t y, c;
340  const uint8_t *src;
341  uint8_t *dst;
342  uint8_t *cm = vp9_ff_cropTbl;
343  uint32_t vector_64 = 64;
344  int32_t filter12, filter34, filter56, filter78;
345  int32_t Temp1, Temp2, Temp3;
346  uint32_t qload1, qload2, qload3;
347  uint32_t p1, p2, p3, p4, p5;
348  uint32_t st1, st2, st3;
349
350  filter12 = ((const int32_t *)filter_x0)[0];
351  filter34 = ((const int32_t *)filter_x0)[1];
352  filter56 = ((const int32_t *)filter_x0)[2];
353  filter78 = ((const int32_t *)filter_x0)[3];
354
355  for (y = h; y--;) {
356    src = src_ptr;
357    dst = dst_ptr;
358
359    /* prefetch data to cache memory */
360    vp9_prefetch_load(src_ptr + src_stride);
361    vp9_prefetch_load(src_ptr + src_stride + 32);
362    vp9_prefetch_store(dst_ptr + dst_stride);
363
364    for (c = 0; c < count; c++) {
365      __asm__ __volatile__ (
366          "ulw              %[qload1],    0(%[src])                    \n\t"
367          "ulw              %[qload2],    4(%[src])                    \n\t"
368
369          /* even 1. pixel */
370          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
371          "mthi             $zero,        $ac1                         \n\t"
372          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
373          "mthi             $zero,        $ac2                         \n\t"
374          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
375          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
376          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
377          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
378          "ulw              %[qload3],    8(%[src])                    \n\t"
379          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
380          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
381          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
382          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
383          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
384          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
385
386          /* even 2. pixel */
387          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
388          "mthi             $zero,        $ac3                         \n\t"
389          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
390          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
391          "ulw              %[qload1],    12(%[src])                   \n\t"
392          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
393          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
394          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
395          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
396          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
397          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
398
399          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
400
401          /* even 3. pixel */
402          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
403          "mthi             $zero,        $ac1                         \n\t"
404          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
405          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
406          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
407          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
408          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
409          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
410          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
411          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
412          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
413
414          /* even 4. pixel */
415          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
416          "mthi             $zero,        $ac2                         \n\t"
417          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
418          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
419          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
420          "ulw              %[qload2],    16(%[src])                   \n\t"
421          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
422          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
423          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
424          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
425          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
426          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
427          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
428          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
429
430          /* even 5. pixel */
431          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
432          "mthi             $zero,        $ac3                         \n\t"
433          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
434          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
435          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
436          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
437          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
438          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
439          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
440          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
441          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
442
443          /* even 6. pixel */
444          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
445          "mthi             $zero,        $ac1                         \n\t"
446          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
447          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
448          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
449          "ulw              %[qload3],    20(%[src])                   \n\t"
450          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
451          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
452          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
453          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
454          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
455          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
456          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
457
458          /* even 7. pixel */
459          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
460          "mthi             $zero,        $ac2                         \n\t"
461          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
462          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
463          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
464          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
465          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
466          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
467          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
468          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
469          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
470          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
471
472          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
473
474          /* even 8. pixel */
475          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
476          "mthi             $zero,        $ac3                         \n\t"
477          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
478          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
479          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
480          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
481          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
482          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
483          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
484          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
485
486          /* ODD pixels */
487          "ulw              %[qload1],    1(%[src])                   \n\t"
488          "ulw              %[qload2],    5(%[src])                    \n\t"
489
490          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
491
492          /* odd 1. pixel */
493          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
494          "mthi             $zero,        $ac1                         \n\t"
495          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
496          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
497          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
498          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
499          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
500          "ulw              %[qload3],    9(%[src])                    \n\t"
501          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
502          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
503          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
504          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
505          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
506          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
507          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
508
509          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
510
511          /* odd 2. pixel */
512          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
513          "mthi             $zero,        $ac2                         \n\t"
514          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
515          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
516          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
517          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
518          "ulw              %[qload1],    13(%[src])                   \n\t"
519          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
520          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
521          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
522          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
523          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
524          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
525          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
526
527          /* odd 3. pixel */
528          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
529          "mthi             $zero,        $ac3                         \n\t"
530          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
531          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
532          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
533          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
534          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
535          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
536          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
537          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
538          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
539
540          /* odd 4. pixel */
541          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
542          "mthi             $zero,        $ac1                         \n\t"
543          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
544          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
545          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
546          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
547          "ulw              %[qload2],    17(%[src])                   \n\t"
548          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
549          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
550          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
551          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
552          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
553          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
554
555          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
556
557          /* odd 5. pixel */
558          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
559          "mthi             $zero,        $ac2                         \n\t"
560          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
561          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
562          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
563          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
564          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
565          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
566          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
567          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
568          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
569
570          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
571
572          /* odd 6. pixel */
573          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
574          "mthi             $zero,        $ac3                         \n\t"
575          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
576          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
577          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
578          "ulw              %[qload3],    21(%[src])                   \n\t"
579          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
580          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
581          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
582          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
583          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
584          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
585
586          /* odd 7. pixel */
587          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
588          "mthi             $zero,        $ac1                         \n\t"
589          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
590          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
591          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
592          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
593          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
594          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
595          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
596          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
597          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
598
599          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
600
601          /* odd 8. pixel */
602          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
603          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
604          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
605          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
606          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
607
608          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
609
610          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
611          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
612
613          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
614          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
615
616          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
617          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
618
619          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
620          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
621          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
622
623          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
624            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
625            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
626            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
627            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
628          : [filter12] "r" (filter12), [filter34] "r" (filter34),
629            [filter56] "r" (filter56), [filter78] "r" (filter78),
630            [vector_64] "r" (vector_64),
631            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
632      );
633
634      src += 16;
635      dst += 16;
636    }
637
638    /* Next row... */
639    src_ptr += src_stride;
640    dst_ptr += dst_stride;
641  }
642}
643
644static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
645                                        int32_t src_stride,
646                                        uint8_t *dst_ptr,
647                                        int32_t dst_stride,
648                                        const int16_t *filter_x0,
649                                        int32_t h) {
650  int32_t y, c;
651  const uint8_t *src;
652  uint8_t *dst;
653  uint8_t *cm = vp9_ff_cropTbl;
654  uint32_t vector_64 = 64;
655  int32_t filter12, filter34, filter56, filter78;
656  int32_t Temp1, Temp2, Temp3;
657  uint32_t qload1, qload2, qload3;
658  uint32_t p1, p2, p3, p4, p5;
659  uint32_t st1, st2, st3;
660
661  filter12 = ((const int32_t *)filter_x0)[0];
662  filter34 = ((const int32_t *)filter_x0)[1];
663  filter56 = ((const int32_t *)filter_x0)[2];
664  filter78 = ((const int32_t *)filter_x0)[3];
665
666  for (y = h; y--;) {
667    src = src_ptr;
668    dst = dst_ptr;
669
670    /* prefetch data to cache memory */
671    vp9_prefetch_load(src_ptr + src_stride);
672    vp9_prefetch_load(src_ptr + src_stride + 32);
673    vp9_prefetch_load(src_ptr + src_stride + 64);
674    vp9_prefetch_store(dst_ptr + dst_stride);
675    vp9_prefetch_store(dst_ptr + dst_stride + 32);
676
677    for (c = 0; c < 4; c++) {
678      __asm__ __volatile__ (
679          "ulw              %[qload1],    0(%[src])                    \n\t"
680          "ulw              %[qload2],    4(%[src])                    \n\t"
681
682          /* even 1. pixel */
683          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
684          "mthi             $zero,        $ac1                         \n\t"
685          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
686          "mthi             $zero,        $ac2                         \n\t"
687          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
688          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
689          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
690          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
691          "ulw              %[qload3],    8(%[src])                    \n\t"
692          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
693          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
694          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
695          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
696          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
697          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
698
699          /* even 2. pixel */
700          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
701          "mthi             $zero,        $ac3                         \n\t"
702          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
703          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
704          "ulw              %[qload1],    12(%[src])                   \n\t"
705          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
706          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
707          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
708          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
709          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
710          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
711
712          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
713
714          /* even 3. pixel */
715          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
716          "mthi             $zero,        $ac1                         \n\t"
717          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
718          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
719          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
720          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
721          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
722          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
723          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
724          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
725          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
726
727          /* even 4. pixel */
728          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
729          "mthi             $zero,        $ac2                         \n\t"
730          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
731          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
732          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
733          "ulw              %[qload2],    16(%[src])                   \n\t"
734          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
735          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
736          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
737          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
738          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
739          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
740          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
741          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
742
743          /* even 5. pixel */
744          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
745          "mthi             $zero,        $ac3                         \n\t"
746          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
747          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
748          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
749          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
750          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
751          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
752          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
753          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
754          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
755
756          /* even 6. pixel */
757          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
758          "mthi             $zero,        $ac1                         \n\t"
759          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
760          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
761          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
762          "ulw              %[qload3],    20(%[src])                   \n\t"
763          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
764          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
765          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
766          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
767          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
768          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
769          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
770
771          /* even 7. pixel */
772          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
773          "mthi             $zero,        $ac2                         \n\t"
774          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
775          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
776          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
777          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
778          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
779          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
780          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
781          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
782          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
783          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
784
785          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
786
787          /* even 8. pixel */
788          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
789          "mthi             $zero,        $ac3                         \n\t"
790          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
791          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
792          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
793          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
794          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
795          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
796          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
797          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
798
799          /* ODD pixels */
800          "ulw              %[qload1],    1(%[src])                   \n\t"
801          "ulw              %[qload2],    5(%[src])                    \n\t"
802
803          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
804
805          /* odd 1. pixel */
806          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
807          "mthi             $zero,        $ac1                         \n\t"
808          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
809          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
810          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
811          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
812          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
813          "ulw              %[qload3],    9(%[src])                    \n\t"
814          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
815          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
816          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
817          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
818          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
819          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
820          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
821
822          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
823
824          /* odd 2. pixel */
825          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
826          "mthi             $zero,        $ac2                         \n\t"
827          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
828          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
829          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
830          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
831          "ulw              %[qload1],    13(%[src])                   \n\t"
832          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
833          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
834          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
835          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
836          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
837          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
838          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
839
840          /* odd 3. pixel */
841          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
842          "mthi             $zero,        $ac3                         \n\t"
843          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
844          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
845          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
846          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
847          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
848          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
849          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
850          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
851          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
852
853          /* odd 4. pixel */
854          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
855          "mthi             $zero,        $ac1                         \n\t"
856          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
857          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
858          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
859          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
860          "ulw              %[qload2],    17(%[src])                   \n\t"
861          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
862          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
863          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
864          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
865          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
866          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
867
868          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
869
870          /* odd 5. pixel */
871          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
872          "mthi             $zero,        $ac2                         \n\t"
873          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
874          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
875          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
876          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
877          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
878          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
879          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
880          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
881          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
882
883          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
884
885          /* odd 6. pixel */
886          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
887          "mthi             $zero,        $ac3                         \n\t"
888          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
889          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
890          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
891          "ulw              %[qload3],    21(%[src])                   \n\t"
892          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
893          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
894          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
895          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
896          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
897          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
898
899          /* odd 7. pixel */
900          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
901          "mthi             $zero,        $ac1                         \n\t"
902          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
903          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
904          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
905          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
906          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
907          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
908          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
909          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
910          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
911
912          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
913
914          /* odd 8. pixel */
915          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
916          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
917          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
918          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
919          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
920
921          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
922
923          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
924          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
925
926          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
927          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
928
929          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
930          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
931
932          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
933          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
934          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
935
936          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
937            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
938            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
939            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
940            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
941          : [filter12] "r" (filter12), [filter34] "r" (filter34),
942            [filter56] "r" (filter56), [filter78] "r" (filter78),
943            [vector_64] "r" (vector_64),
944            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
945      );
946
947      src += 16;
948      dst += 16;
949    }
950
951    /* Next row... */
952    src_ptr += src_stride;
953    dst_ptr += dst_stride;
954  }
955}
956
/*
 * Horizontal convolve-and-average entry point for the DSPr2 build.
 *
 * Picks an implementation from the contents of filter_x, the horizontal
 * step and the block width:
 *   - second 32-bit word of filter_x equal to 0x800000: forwarded to
 *     vp9_convolve_avg() (presumably the pass-through kernel, i.e. taps
 *     2/3 == 0/128 on a little-endian target -- confirm);
 *   - first 32-bit word of filter_x equal to 0 (taps 0 and 1 both zero):
 *     forwarded to vp9_convolve2_avg_horiz_dspr2();
 *   - x_step_q4 other than 16: forwarded to the C reference
 *     vp9_convolve8_avg_horiz_c();
 *   - otherwise a width-specific DSPr2 kernel is used for w in
 *     {4, 8, 16, 32, 64}; any other width goes to the C reference.
 *
 * filter_y and y_step_q4 are not used here except to be passed through
 * to the functions that take the full argument list.
 */
void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  /* The int16 taps viewed two at a time as 32-bit words. */
  const int32_t *const tap_words = (const int32_t *)filter_x;
  uint32_t extract_pos;

  if (tap_words[1] == 0x800000) {
    vp9_convolve_avg(src, src_stride, dst, dst_stride,
                     filter_x, x_step_q4, filter_y, y_step_q4,
                     w, h);
    return;
  }

  if (tap_words[0] == 0) {
    vp9_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride,
                                  filter_x, x_step_q4, filter_y, y_step_q4,
                                  w, h);
    return;
  }

  if (x_step_q4 != 16) {
    /* The kernels below are only selected for a step of exactly 16. */
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
    return;
  }

  extract_pos = 38;

  /* The DSPr2 kernels are handed a pointer three bytes before src. */
  src -= 3;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride,
                                 filter_x, h);
      break;

    case 8:
      convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride,
                                 filter_x, h);
      break;

    case 16:
      /* NOTE(review): last argument is presumably the number of
       * 16-pixel groups per row -- confirm against the kernel. */
      convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                  filter_x, h, 1);
      break;

    case 32:
      convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                  filter_x, h, 2);
      break;

    case 64:
      /* Wider rows: issue two additional prefetches before the kernel. */
      vp9_prefetch_load(src + 64);
      vp9_prefetch_store(dst + 32);

      convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride,
                                  filter_x, h);
      break;

    default:
      /* No DSPr2 kernel for this width; undo the -3 adjustment so the
       * C reference receives the caller's original src pointer. */
      vp9_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
                                filter_x, x_step_q4, filter_y, y_step_q4,
                                w, h);
      break;
  }
}
1038#endif
1039