convolve8_avg_horiz_dspr2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1/*
2 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <stdio.h>
13
14#include "./vpx_dsp_rtcd.h"
15#include "vpx_dsp/mips/convolve_common_dspr2.h"
16#include "vpx_dsp/vpx_convolve.h"
17#include "vpx_dsp/vpx_dsp_common.h"
18#include "vpx_ports/mem.h"
19
20#if HAVE_DSPR2
/*
 * Horizontal 8-tap convolution of a 4-pixel-wide block, averaged into dst.
 *
 * For each of the h rows this loads the twelve bytes src[0..11] (three
 * unaligned word loads), produces four filtered values, maps each through
 * the vpx_ff_cropTbl lookup table and stores the rounded average of that
 * value and the byte already present in dst[0..3].
 *
 * src        - first source byte loaded for the current row; advanced by
 *              src_stride after every row.
 * src_stride - distance in bytes between consecutive source rows.
 * dst        - destination row, both read and written; advanced by
 *              dst_stride after every row.
 * dst_stride - distance in bytes between consecutive destination rows.
 * filter_x0  - eight int16_t taps, read here as four 32-bit words (two taps
 *              per word).
 * h          - number of rows to process.
 *
 * NOTE(review): "extp ..., 31" takes its extract position from the
 * DSPControl pos field, which is not written in this function -- presumably
 * the caller programs it before calling; confirm.
 * NOTE(review): reading filter_x0 through an int32_t pointer assumes the tap
 * array is suitably aligned for word loads -- confirm against the caller.
 */
21static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
22                                       int32_t src_stride,
23                                       uint8_t *dst,
24                                       int32_t dst_stride,
25                                       const int16_t *filter_x0,
26                                       int32_t h) {
27  int32_t y;
  /* lookup table indexed with each extracted accumulator value (lbux) */
28  uint8_t *cm = vpx_ff_cropTbl;
29  int32_t  vector1b, vector2b, vector3b, vector4b;
30  int32_t  Temp1, Temp2, Temp3, Temp4;
  /* value written to the LO half of every accumulator before the dot
   * products; NOTE(review): presumably the rounding term -- confirm against
   * the filter precision */
31  uint32_t vector4a = 64;
32  uint32_t tp1, tp2;
33  uint32_t p1, p2, p3, p4;
34  uint32_t n1, n2, n3, n4;
35  uint32_t tn1, tn2;
36
  /* fetch the eight taps as four words holding two 16-bit taps each */
37  vector1b = ((const int32_t *)filter_x0)[0];
38  vector2b = ((const int32_t *)filter_x0)[1];
39  vector3b = ((const int32_t *)filter_x0)[2];
40  vector4b = ((const int32_t *)filter_x0)[3];
41
  /* one iteration per row; y counts down from h to 0 */
42  for (y = h; y--;) {
43    /* prefetch data to cache memory */
44    prefetch_load(src + src_stride);
45    prefetch_load(src + src_stride + 32);
46    prefetch_store(dst + dst_stride);
47
    /*
     * The asm below is hand scheduled: loads, clamps and stores belonging
     * to one pixel are interleaved with the dot products of the next one,
     * and scratch registers are reused, so instruction order is significant.
     */
48    __asm__ __volatile__ (
        /* first two unaligned source words of the row */
49        "ulw              %[tp1],         0(%[src])                      \n\t"
50        "ulw              %[tp2],         4(%[src])                      \n\t"
51
52        /* even 1. pixel */
53        "mtlo             %[vector4a],    $ac3                           \n\t"
54        "mthi             $zero,          $ac3                           \n\t"
        /* widen the loaded bytes into four registers of two halfwords each */
55        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
56        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
57        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
58        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
59        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
60        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
61        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
62        "ulw              %[tn2],         8(%[src])                      \n\t"
63        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
64        "extp             %[Temp1],       $ac3,           31             \n\t"
65
66        /* even 2. pixel */
67        "mtlo             %[vector4a],    $ac2                           \n\t"
68        "mthi             $zero,          $ac2                           \n\t"
69        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
        /* Build the byte-shifted copies of the input words that the odd
         * pixels consume (tp2, tn2, tn1). Each balign reads a neighbouring
         * word that has not been shifted yet, so the order of these three
         * instructions must be kept. */
70        "balign           %[tn1],         %[tn2],         3              \n\t"
71        "balign           %[tn2],         %[tp2],         3              \n\t"
72        "balign           %[tp2],         %[tp1],         3              \n\t"
73        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
74        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
75        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
76        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
77        "extp             %[Temp3],       $ac2,           31             \n\t"
78
79        "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */
80
81        /* odd 1. pixel */
82        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
83        "mtlo             %[vector4a],    $ac3                           \n\t"
84        "mthi             $zero,          $ac3                           \n\t"
85        "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
86        "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
87        "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
88        "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
89        "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
90        "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
91        "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
92        "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
93        "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
94        "extp             %[Temp2],       $ac3,           31             \n\t"
95
96        "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */
97
98        /* odd 2. pixel */
99        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
100        "mtlo             %[vector4a],    $ac2                           \n\t"
101        "mthi             $zero,          $ac2                           \n\t"
102        "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
103        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
104        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
105        "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
106        "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
107        "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
108        "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
109        "extp             %[Temp4],       $ac2,           31             \n\t"
110
111        "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
112        "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */
113
114        /* clamp */
115        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
116        "lbux             %[n2],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
117        "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */
118
119        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
120        "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */
121
122        "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t"  /* average odd 2 */
123        "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */
124
        /* outputs: scratch registers only, all early-clobber */
125        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
126          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
127          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
128          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
129          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
130          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        /* inputs: filter words, accumulator seed, table and row pointers.
         * NOTE(review): no clobber list is given although the asm writes
         * $ac2/$ac3 and stores through dst -- confirm this is intended. */
131        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
132          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
133          [vector4a] "r" (vector4a),
134          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
135    );
136
137    /* Next row... */
138    src += src_stride;
139    dst += dst_stride;
140  }
141}
142
/*
 * Horizontal 8-tap convolution of an 8-pixel-wide block, averaged into dst.
 *
 * For each of the h rows this loads the sixteen bytes src[0..15] (four
 * unaligned word loads), produces eight filtered values, maps each through
 * the vpx_ff_cropTbl lookup table and stores the rounded average of that
 * value and the byte already present in dst[0..7]. The four "even" results
 * go to dst[0], dst[2], dst[4], dst[6] and the four "odd" results to
 * dst[1], dst[3], dst[5], dst[7].
 *
 * src        - first source byte loaded for the current row; advanced by
 *              src_stride after every row.
 * src_stride - distance in bytes between consecutive source rows.
 * dst        - destination row, both read and written; advanced by
 *              dst_stride after every row.
 * dst_stride - distance in bytes between consecutive destination rows.
 * filter_x0  - eight int16_t taps, read here as four 32-bit words (two taps
 *              per word).
 * h          - number of rows to process.
 *
 * NOTE(review): "extp ..., 31" takes its extract position from the
 * DSPControl pos field, which is not written in this function -- presumably
 * the caller programs it before calling; confirm.
 * NOTE(review): reading filter_x0 through an int32_t pointer assumes the tap
 * array is suitably aligned for word loads -- confirm against the caller.
 */
143static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
144                                       int32_t src_stride,
145                                       uint8_t *dst,
146                                       int32_t dst_stride,
147                                       const int16_t *filter_x0,
148                                       int32_t h) {
149  int32_t y;
  /* lookup table indexed with each extracted accumulator value (lbux) */
150  uint8_t *cm = vpx_ff_cropTbl;
  /* value written to the LO half of every accumulator before the dot
   * products; NOTE(review): presumably the rounding term -- confirm against
   * the filter precision */
151  uint32_t vector4a = 64;
152  int32_t vector1b, vector2b, vector3b, vector4b;
153  int32_t Temp1, Temp2, Temp3;
154  uint32_t tp1, tp2;
155  uint32_t p1, p2, p3, p4, n1;
156  uint32_t tn1, tn2, tn3;
  /* hold values fetched from the cm table before they are averaged */
157  uint32_t st0, st1;
158
  /* fetch the eight taps as four words holding two 16-bit taps each */
159  vector1b = ((const int32_t *)filter_x0)[0];
160  vector2b = ((const int32_t *)filter_x0)[1];
161  vector3b = ((const int32_t *)filter_x0)[2];
162  vector4b = ((const int32_t *)filter_x0)[3];
163
  /* one iteration per row; y counts down from h to 0 */
164  for (y = h; y--;) {
165    /* prefetch data to cache memory */
166    prefetch_load(src + src_stride);
167    prefetch_load(src + src_stride + 32);
168    prefetch_store(dst + dst_stride);
169
    /*
     * The asm below is hand scheduled: the three accumulators $ac1..$ac3
     * are reseeded ahead of the pixel that uses them, and loads, clamps and
     * stores of one pixel are interleaved with the dot products of another.
     * The section headings name the pixel whose dot products are issued in
     * that section. Instruction order is significant.
     */
170    __asm__ __volatile__ (
        /* first two unaligned source words of the row */
171        "ulw              %[tp1],         0(%[src])                      \n\t"
172        "ulw              %[tp2],         4(%[src])                      \n\t"
173
174        /* even 1. pixel */
        /* seed $ac3 (even 1) and $ac2 (even 2) */
175        "mtlo             %[vector4a],    $ac3                           \n\t"
176        "mthi             $zero,          $ac3                           \n\t"
177        "mtlo             %[vector4a],    $ac2                           \n\t"
178        "mthi             $zero,          $ac2                           \n\t"
        /* widen the loaded bytes into four registers of two halfwords each */
179        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
180        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
181        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
182        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
183        "ulw              %[tn2],         8(%[src])                      \n\t"
184        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
185        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
186        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
187        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
188        "extp             %[Temp1],       $ac3,           31             \n\t"
        /* current dst[0] and dst[2], averaged with even 1 and even 2 below */
189        "lbu              %[Temp2],       0(%[dst])                      \n\t"
190        "lbu              %[tn3],         2(%[dst])                      \n\t"
191
192        /* even 2. pixel */
193        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
194        "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
195        "ulw              %[tn1],         12(%[src])                     \n\t"
196        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
197        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
198        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
199        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
200        "extp             %[Temp3],       $ac2,           31             \n\t"
201
202        /* even 3. pixel */
        /* st0 = table lookup for even 1, st1 = table lookup for even 2 */
203        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
204        "mtlo             %[vector4a],    $ac1                           \n\t"
205        "mthi             $zero,          $ac1                           \n\t"
206        "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
207        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
208        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
209        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
210        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
211        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
212        "extp             %[Temp1],       $ac1,           31             \n\t"
213
        /* average even 1 / even 2 with dst and store to dst[0] / dst[2] */
214        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
215        "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
216        "sb               %[Temp2],       0(%[dst])                      \n\t"
217        "sb               %[tn3],         2(%[dst])                      \n\t"
218
219        /* even 4. pixel */
        /* seed $ac2 (even 4) and $ac3 (odd 1) */
220        "mtlo             %[vector4a],    $ac2                           \n\t"
221        "mthi             $zero,          $ac2                           \n\t"
222        "mtlo             %[vector4a],    $ac3                           \n\t"
223        "mthi             $zero,          $ac3                           \n\t"
224
        /* Build the byte-shifted copies of the input words that the odd
         * pixels consume (tp2, tn2, tn1, tn3). Each balign reads a
         * neighbouring word that has not been shifted yet, so the order of
         * these four instructions must be kept. */
225        "balign           %[tn3],         %[tn1],         3              \n\t"
226        "balign           %[tn1],         %[tn2],         3              \n\t"
227        "balign           %[tn2],         %[tp2],         3              \n\t"
228        "balign           %[tp2],         %[tp1],         3              \n\t"
229
        /* table lookup for even 3, averaged with dst[4] (stored further down) */
230        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
231        "lbu              %[Temp2],       4(%[dst])                      \n\t"
232        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
233
234        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
235        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
236        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
237        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
238        "extp             %[Temp3],       $ac2,           31             \n\t"
239
240        /* odd 1. pixel */
        /* seed $ac1 for odd 2; odd 1 accumulates in $ac3 seeded above */
241        "mtlo             %[vector4a],    $ac1                           \n\t"
242        "mthi             $zero,          $ac1                           \n\t"
243        "sb               %[Temp2],       4(%[dst])                      \n\t"
244        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
245        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
246        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
247        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
248        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
249        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
250        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
251        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
252        "extp             %[Temp2],       $ac3,           31             \n\t"
253
        /* current dst[6], averaged with even 4 below */
254        "lbu              %[tp1],         6(%[dst])                      \n\t"
255
256        /* odd 2. pixel */
        /* seed $ac3 (odd 3) and $ac2 (odd 4); odd 2 accumulates in $ac1 */
257        "mtlo             %[vector4a],    $ac3                           \n\t"
258        "mthi             $zero,          $ac3                           \n\t"
259        "mtlo             %[vector4a],    $ac2                           \n\t"
260        "mthi             $zero,          $ac2                           \n\t"
261        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
262        "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
        /* table lookup for even 4 */
263        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
264        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
265        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
266        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
267        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
268        "extp             %[Temp3],       $ac1,           31             \n\t"
269
        /* current dst[1] and dst[3]; then average even 4 with dst[6] */
270        "lbu              %[tp2],         1(%[dst])                      \n\t"
271        "lbu              %[tn2],         3(%[dst])                      \n\t"
272        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
273
274        /* odd 3. pixel */
        /* table lookup for odd 1, averaged with dst[1] in this section */
275        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
276        "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
277        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
278        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
279        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
280        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
281        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
282        "extp             %[Temp2],       $ac3,           31             \n\t"
283
        /* current dst[5], averaged with odd 3 below */
284        "lbu              %[tn3],         5(%[dst])                      \n\t"
285
286        /* odd 4. pixel */
        /* store averaged odd 1 to dst[1] and averaged even 4 to dst[6] */
287        "sb               %[tp2],         1(%[dst])                      \n\t"
288        "sb               %[tp1],         6(%[dst])                      \n\t"
289        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
290        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
291        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
292        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
293        "extp             %[Temp1],       $ac2,           31             \n\t"
294
        /* current dst[7], averaged with odd 4 below */
295        "lbu              %[tn1],         7(%[dst])                      \n\t"
296
297        /* clamp */
        /* odd 2 -> tn2 (dst[3]), odd 3 -> tn3 (dst[5]), odd 4 -> tn1 (dst[7]) */
298        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
299        "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
300
301        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
302        "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
303
304        "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
305        "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
306
307        /* store bytes */
308        "sb               %[tn2],         3(%[dst])                      \n\t"
309        "sb               %[tn3],         5(%[dst])                      \n\t"
310        "sb               %[tn1],         7(%[dst])                      \n\t"
311
        /* outputs: scratch registers only, all early-clobber */
312        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
313          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
314          [st0] "=&r" (st0), [st1] "=&r" (st1),
315          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
316          [n1] "=&r" (n1),
317          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        /* inputs: filter words, accumulator seed, table and row pointers.
         * NOTE(review): no clobber list is given although the asm writes
         * $ac1..$ac3 and stores through dst -- confirm this is intended. */
318        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
319          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
320          [vector4a] "r" (vector4a),
321          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
322    );
323
324    /* Next row... */
325    src += src_stride;
326    dst += dst_stride;
327  }
328}
329
330static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
331                                        int32_t src_stride,
332                                        uint8_t *dst_ptr,
333                                        int32_t dst_stride,
334                                        const int16_t *filter_x0,
335                                        int32_t h,
336                                        int32_t count) {
337  int32_t y, c;
338  const uint8_t *src;
339  uint8_t *dst;
340  uint8_t *cm = vpx_ff_cropTbl;
341  uint32_t vector_64 = 64;
342  int32_t filter12, filter34, filter56, filter78;
343  int32_t Temp1, Temp2, Temp3;
344  uint32_t qload1, qload2, qload3;
345  uint32_t p1, p2, p3, p4, p5;
346  uint32_t st1, st2, st3;
347
348  filter12 = ((const int32_t *)filter_x0)[0];
349  filter34 = ((const int32_t *)filter_x0)[1];
350  filter56 = ((const int32_t *)filter_x0)[2];
351  filter78 = ((const int32_t *)filter_x0)[3];
352
353  for (y = h; y--;) {
354    src = src_ptr;
355    dst = dst_ptr;
356
357    /* prefetch data to cache memory */
358    prefetch_load(src_ptr + src_stride);
359    prefetch_load(src_ptr + src_stride + 32);
360    prefetch_store(dst_ptr + dst_stride);
361
362    for (c = 0; c < count; c++) {
363      __asm__ __volatile__ (
364          "ulw              %[qload1],    0(%[src])                    \n\t"
365          "ulw              %[qload2],    4(%[src])                    \n\t"
366
367          /* even 1. pixel */
368          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
369          "mthi             $zero,        $ac1                         \n\t"
370          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
371          "mthi             $zero,        $ac2                         \n\t"
372          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
373          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
374          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
375          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
376          "ulw              %[qload3],    8(%[src])                    \n\t"
377          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
378          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
379          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
380          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
381          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
382          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
383
384          /* even 2. pixel */
385          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
386          "mthi             $zero,        $ac3                         \n\t"
387          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
388          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
389          "ulw              %[qload1],    12(%[src])                   \n\t"
390          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
391          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
392          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
393          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
394          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
395          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
396
397          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
398
399          /* even 3. pixel */
400          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
401          "mthi             $zero,        $ac1                         \n\t"
402          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
403          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
404          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
405          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
406          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
407          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
408          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
409          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
410          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
411
412          /* even 4. pixel */
413          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
414          "mthi             $zero,        $ac2                         \n\t"
415          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
416          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
417          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
418          "ulw              %[qload2],    16(%[src])                   \n\t"
419          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
420          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
421          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
422          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
423          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
424          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
425          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
426          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
427
428          /* even 5. pixel */
429          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
430          "mthi             $zero,        $ac3                         \n\t"
431          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
432          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
433          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
434          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
435          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
436          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
437          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
438          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
439          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
440
441          /* even 6. pixel */
442          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
443          "mthi             $zero,        $ac1                         \n\t"
444          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
445          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
446          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
447          "ulw              %[qload3],    20(%[src])                   \n\t"
448          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
449          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
450          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
451          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
452          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
453          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
454          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
455
456          /* even 7. pixel */
457          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
458          "mthi             $zero,        $ac2                         \n\t"
459          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
460          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
461          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
462          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
463          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
464          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
465          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
466          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
467          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
468          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
469
470          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
471
472          /* even 8. pixel */
473          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
474          "mthi             $zero,        $ac3                         \n\t"
475          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
476          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
477          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
478          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
479          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
480          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
481          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
482          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
483
484          /* ODD pixels */
485          "ulw              %[qload1],    1(%[src])                   \n\t"
486          "ulw              %[qload2],    5(%[src])                    \n\t"
487
488          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
489
490          /* odd 1. pixel */
491          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
492          "mthi             $zero,        $ac1                         \n\t"
493          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
494          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
495          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
496          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
497          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
498          "ulw              %[qload3],    9(%[src])                    \n\t"
499          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
500          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
501          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
502          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
503          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
504          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
505          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
506
507          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
508
509          /* odd 2. pixel */
510          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
511          "mthi             $zero,        $ac2                         \n\t"
512          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
513          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
514          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
515          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
516          "ulw              %[qload1],    13(%[src])                   \n\t"
517          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
518          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
519          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
520          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
521          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
522          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
523          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
524
525          /* odd 3. pixel */
526          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
527          "mthi             $zero,        $ac3                         \n\t"
528          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
529          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
530          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
531          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
532          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
533          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
534          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
535          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
536          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
537
538          /* odd 4. pixel */
539          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
540          "mthi             $zero,        $ac1                         \n\t"
541          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
542          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
543          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
544          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
545          "ulw              %[qload2],    17(%[src])                   \n\t"
546          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
547          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
548          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
549          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
550          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
551          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
552
553          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
554
555          /* odd 5. pixel */
556          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
557          "mthi             $zero,        $ac2                         \n\t"
558          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
559          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
560          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
561          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
562          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
563          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
564          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
565          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
566          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
567
568          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
569
570          /* odd 6. pixel */
571          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
572          "mthi             $zero,        $ac3                         \n\t"
573          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
574          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
575          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
576          "ulw              %[qload3],    21(%[src])                   \n\t"
577          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
578          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
579          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
580          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
581          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
582          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
583
584          /* odd 7. pixel */
585          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
586          "mthi             $zero,        $ac1                         \n\t"
587          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
588          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
589          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
590          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
591          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
592          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
593          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
594          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
595          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
596
597          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
598
599          /* odd 8. pixel */
600          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
601          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
602          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
603          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
604          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
605
606          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
607
608          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
609          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
610
611          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
612          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
613
614          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
615          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
616
617          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
618          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
619          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
620
621          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
622            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
623            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
624            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
625            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
626          : [filter12] "r" (filter12), [filter34] "r" (filter34),
627            [filter56] "r" (filter56), [filter78] "r" (filter78),
628            [vector_64] "r" (vector_64),
629            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
630      );
631
632      src += 16;
633      dst += 16;
634    }
635
636    /* Next row... */
637    src_ptr += src_stride;
638    dst_ptr += dst_stride;
639  }
640}
641
/*
 * Horizontal 8-tap convolution with averaging into dst, for rows that are
 * 64 pixels wide (4 chunks of 16 pixels per row, h rows).
 *
 * For every output pixel the assembly below:
 *   1. preloads a DSP accumulator with vector_64 (64) in LO and zero in HI,
 *   2. accumulates four dpa.w.ph dot products of unpacked source bytes with
 *      the packed tap pairs filter12/34/56/78,
 *   3. extracts the result with "extp ..., 31",
 *   4. maps it through the cm lookup table (vpx_ff_cropTbl) with lbux,
 *   5. combines it with the byte already in dst using addqh_r.w and stores
 *      the result back with sb.
 *
 * Within a 16-pixel chunk the even outputs (dst offsets 0, 2, ..., 14) are
 * produced first from words loaded at src offsets 0, 4, ..., 20, followed by
 * the odd outputs (dst offsets 1, 3, ..., 15) from words loaded at src
 * offsets 1, 5, ..., 21.  Each chunk therefore reads src[0..24].
 * The accumulators $ac1, $ac2 and $ac3 are used in rotation: while one pixel
 * is being accumulated, the accumulator for a following pixel is initialised
 * and the table lookup / average / store of a preceding pixel is completed.
 * The trailing comment on each mtlo line names the pixel that accumulator is
 * being prepared for, which is why it differs from the section heading.
 *
 * src_ptr    - first byte read for the row; the dispatcher in this file
 *              passes its own src minus 3.
 * src_stride - byte step added to src_ptr after each row.
 * dst_ptr    - first destination byte of the row; read and then overwritten.
 * dst_stride - byte step added to dst_ptr after each row.
 * filter_x0  - eight 16-bit taps, read here as four 32-bit words.
 * h          - number of rows to process.
 *
 * NOTE(review): the value produced by extp depends on the DSP control "pos"
 * field; the dispatcher in this file writes 38 to it with wrdsp before
 * calling.  With pos = 38 and size 31 this presumably selects accumulator
 * bits 38..7, i.e. (sum + 64) >> 7 - confirm against the DSP ASE manual.
 */
642static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
643                                        int32_t src_stride,
644                                        uint8_t *dst_ptr,
645                                        int32_t dst_stride,
646                                        const int16_t *filter_x0,
647                                        int32_t h) {
648  int32_t y, c;
649  const uint8_t *src;
650  uint8_t *dst;
651  uint8_t *cm = vpx_ff_cropTbl;
652  uint32_t vector_64 = 64;
653  int32_t filter12, filter34, filter56, filter78;
654  int32_t Temp1, Temp2, Temp3;
655  uint32_t qload1, qload2, qload3;
656  uint32_t p1, p2, p3, p4, p5;
657  uint32_t st1, st2, st3;
658
  /* Load the taps two at a time: each 32-bit word holds two adjacent 16-bit
   * coefficients, the operand format used by dpa.w.ph below. */
659  filter12 = ((const int32_t *)filter_x0)[0];
660  filter34 = ((const int32_t *)filter_x0)[1];
661  filter56 = ((const int32_t *)filter_x0)[2];
662  filter78 = ((const int32_t *)filter_x0)[3];
663
  /* One iteration per row; y counts down from h to 0. */
664  for (y = h; y--;) {
665    src = src_ptr;
666    dst = dst_ptr;
667
668    /* prefetch the next row's source and destination into cache */
669    prefetch_load(src_ptr + src_stride);
670    prefetch_load(src_ptr + src_stride + 32);
671    prefetch_load(src_ptr + src_stride + 64);
672    prefetch_store(dst_ptr + dst_stride);
673    prefetch_store(dst_ptr + dst_stride + 32);
674
    /* Four chunks of 16 output pixels cover the 64-pixel row. */
675    for (c = 0; c < 4; c++) {
676      __asm__ __volatile__ (
677          "ulw              %[qload1],    0(%[src])                    \n\t"
678          "ulw              %[qload2],    4(%[src])                    \n\t"
679
680          /* even 1. pixel */
681          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
682          "mthi             $zero,        $ac1                         \n\t"
683          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
684          "mthi             $zero,        $ac2                         \n\t"
685          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
686          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
687          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
688          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
689          "ulw              %[qload3],    8(%[src])                    \n\t"
690          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
691          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
692          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
693          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
694          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
695          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
696
697          /* even 2. pixel */
698          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
699          "mthi             $zero,        $ac3                         \n\t"
700          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
701          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
702          "ulw              %[qload1],    12(%[src])                   \n\t"
703          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 2 */
704          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 2 */
705          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 2 */
706          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 2 */
707          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
708          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
709
710          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
711
712          /* even 3. pixel */
713          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
714          "mthi             $zero,        $ac1                         \n\t"
715          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
716          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
717          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
718          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
719          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
720          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
721          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
722          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
723          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
724
725          /* even 4. pixel */
726          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
727          "mthi             $zero,        $ac2                         \n\t"
728          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
729          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
730          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
731          "ulw              %[qload2],    16(%[src])                   \n\t"
732          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
733          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
734          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
735          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
736          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
737          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
738          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
739          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
740
741          /* even 5. pixel */
742          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
743          "mthi             $zero,        $ac3                         \n\t"
744          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
745          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
746          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
747          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
748          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
749          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
750          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
751          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
752          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
753
754          /* even 6. pixel */
755          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
756          "mthi             $zero,        $ac1                         \n\t"
757          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
758          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
759          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
760          "ulw              %[qload3],    20(%[src])                   \n\t"
761          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
762          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
763          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
764          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
765          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
766          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
767          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
768
769          /* even 7. pixel */
770          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
771          "mthi             $zero,        $ac2                         \n\t"
772          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
773          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
774          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
775          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
776          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
777          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
778          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
779          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
780          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
781          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
782
783          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
784
785          /* even 8. pixel */
786          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
787          "mthi             $zero,        $ac3                         \n\t"
788          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
789          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
790          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
791          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
792          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
793          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
794          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
795          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
796
797          /* ODD pixels: same scheme, source words reloaded one byte further on */
798          "ulw              %[qload1],    1(%[src])                   \n\t"
799          "ulw              %[qload2],    5(%[src])                    \n\t"
800
801          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
802
803          /* odd 1. pixel */
804          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
805          "mthi             $zero,        $ac1                         \n\t"
806          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
807          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
808          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
809          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
810          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
811          "ulw              %[qload3],    9(%[src])                    \n\t"
812          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
813          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
814          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
815          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
816          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
817          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
818          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
819
820          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
821
822          /* odd 2. pixel */
823          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
824          "mthi             $zero,        $ac2                         \n\t"
825          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
826          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
827          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
828          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
829          "ulw              %[qload1],    13(%[src])                   \n\t"
830          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
831          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
832          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
833          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
834          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
835          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
836          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
837
838          /* odd 3. pixel */
839          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
840          "mthi             $zero,        $ac3                         \n\t"
841          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
842          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
843          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
844          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
845          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
846          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
847          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
848          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
849          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
850
851          /* odd 4. pixel */
852          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
853          "mthi             $zero,        $ac1                         \n\t"
854          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
855          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
856          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
857          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
858          "ulw              %[qload2],    17(%[src])                   \n\t"
859          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
860          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
861          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
862          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
863          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
864          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
865
866          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
867
868          /* odd 5. pixel */
869          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
870          "mthi             $zero,        $ac2                         \n\t"
871          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
872          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
873          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
874          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
875          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
876          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
877          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
878          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
879          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
880
881          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
882
883          /* odd 6. pixel */
884          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
885          "mthi             $zero,        $ac3                         \n\t"
886          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
887          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
888          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
889          "ulw              %[qload3],    21(%[src])                   \n\t"
890          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
891          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
892          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
893          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
894          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
895          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
896
897          /* odd 7. pixel */
898          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
899          "mthi             $zero,        $ac1                         \n\t"
900          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
901          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
902          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
903          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
904          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
905          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
906          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
907          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
908          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
909
910          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
911
912          /* odd 8. pixel */
913          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
914          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
915          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
916          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
917          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
918
919          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
920
          /* drain: finish lookup, average and store for the last three odd pixels */
921          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
922          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
923
924          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
925          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
926
927          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
928          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
929
930          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
931          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
932          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
933
          /* All scratch registers are early-clobber outputs ("=&r") so they
           * are never allocated on top of the input operands below. */
934          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
935            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
936            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
937            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
938            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
939          : [filter12] "r" (filter12), [filter34] "r" (filter34),
940            [filter56] "r" (filter56), [filter78] "r" (filter78),
941            [vector_64] "r" (vector_64),
942            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
943      );
944
      /* advance to the next 16-pixel chunk of this row */
945      src += 16;
946      dst += 16;
947    }
948
949    /* Next row... */
950    src_ptr += src_stride;
951    dst_ptr += dst_stride;
952  }
953}
954
/*
 * DSPr2 entry point for the horizontal 8-tap "convolve and average" filter.
 *
 * Only unscaled filtering is supported (x_step_q4 must be 16).  When the
 * first 32-bit word of the filter (its first two 16-bit taps) is zero the
 * work is handed to vpx_convolve2_avg_horiz_dspr2().  Otherwise the DSP
 * control "pos" field is set to 38 for the extp instructions used by the
 * kernels, and a width-specific kernel is called with the source pointer
 * moved back by 3 bytes.  Widths other than 4, 8, 16, 32 and 64 are handled
 * by the C implementation, which receives the caller's original pointer.
 *
 * filter_y and y_step_q4 are not used here; they are only forwarded to the
 * routines that share this signature.
 */
void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  /* The eight 16-bit taps viewed as four 32-bit words, two taps per word. */
  const int32_t *const tap_pairs = (const int32_t *)filter_x;
  const uint8_t *src_start;
  uint32_t extract_pos;

  assert(x_step_q4 == 16);
  /* NOTE(review): 0x800000 in the second word presumably identifies the
   * pass-through filter, which callers are expected to route elsewhere. */
  assert(tap_pairs[1] != 0x800000);

  if (tap_pairs[0] == 0) {
    /* Leading taps are zero: use the shorter-filter implementation. */
    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride,
                                  filter_x, x_step_q4, filter_y, y_step_q4,
                                  w, h);
    return;
  }

  /* The kernels start reading three bytes before the output position. */
  src_start = src - 3;

  /* Set the accumulator bit position used by extp in the kernels. */
  extract_pos = 38;
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  /* Warm the cache for the first row. */
  prefetch_load(src_start);
  prefetch_load(src_start + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_avg_horiz_4_dspr2(src_start, src_stride, dst, dst_stride,
                                 filter_x, h);
      break;

    case 8:
      convolve_avg_horiz_8_dspr2(src_start, src_stride, dst, dst_stride,
                                 filter_x, h);
      break;

    case 16:
    case 32:
      /* The 16-wide kernel takes the number of 16-pixel columns: 1 or 2. */
      convolve_avg_horiz_16_dspr2(src_start, src_stride, dst, dst_stride,
                                  filter_x, h, w / 16);
      break;

    case 64:
      /* Wider rows: prefetch the remainder of the first row as well. */
      prefetch_load(src_start + 64);
      prefetch_store(dst + 32);
      convolve_avg_horiz_64_dspr2(src_start, src_stride, dst, dst_stride,
                                  filter_x, h);
      break;

    default:
      /* Unsupported width: C fallback expects the unadjusted pointer. */
      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                                filter_x, x_step_q4, filter_y, y_step_q4,
                                w, h);
      break;
  }
}
1025#endif
1026