/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include <assert.h>
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include <stdio.h>
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "./vpx_config.h"
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "./vp9_rtcd.h"
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_common.h"
17233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx/vpx_integer.h"
18233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_ports/mem.h"
19233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_convolve.h"
20233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21233d2500723e5594f3e7c70896ffeeef32b9c950ywan
22233d2500723e5594f3e7c70896ffeeef32b9c950ywan#if HAVE_DSPR2
23233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void convolve_avg_vert_4_dspr2(const uint8_t *src,
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      int32_t src_stride,
25233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      uint8_t *dst,
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      int32_t dst_stride,
27233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      const int16_t *filter_y,
28233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      int32_t w,
29233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      int32_t h) {
30233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int32_t       x, y;
31233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const uint8_t *src_ptr;
32233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint8_t       *dst_ptr;
33233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint8_t       *cm = vp9_ff_cropTbl;
34233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      vector4a = 64;
35233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      load1, load2, load3, load4;
36233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      p1, p2;
37233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      n1, n2;
38233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      scratch1, scratch2;
39233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      store1, store2;
40233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int32_t       vector1b, vector2b, vector3b, vector4b;
41233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int32_t       Temp1, Temp2;
42233d2500723e5594f3e7c70896ffeeef32b9c950ywan
43233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vector1b = ((const int32_t *)filter_y)[0];
44233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vector2b = ((const int32_t *)filter_y)[1];
45233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vector3b = ((const int32_t *)filter_y)[2];
46233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vector4b = ((const int32_t *)filter_y)[3];
47233d2500723e5594f3e7c70896ffeeef32b9c950ywan
48233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src -= 3 * src_stride;
49233d2500723e5594f3e7c70896ffeeef32b9c950ywan
50233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (y = h; y--;) {
51233d2500723e5594f3e7c70896ffeeef32b9c950ywan    /* prefetch data to cache memory */
52233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp9_prefetch_store(dst + dst_stride);
53233d2500723e5594f3e7c70896ffeeef32b9c950ywan
54233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (x = 0; x < w; x += 4) {
55233d2500723e5594f3e7c70896ffeeef32b9c950ywan      src_ptr = src + x;
56233d2500723e5594f3e7c70896ffeeef32b9c950ywan      dst_ptr = dst + x;
57233d2500723e5594f3e7c70896ffeeef32b9c950ywan
58233d2500723e5594f3e7c70896ffeeef32b9c950ywan      __asm__ __volatile__ (
59233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
60233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
61233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
62233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
63233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
64233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
65233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
66233d2500723e5594f3e7c70896ffeeef32b9c950ywan
67233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mtlo             %[vector4a],  $ac0                            \n\t"
68233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mtlo             %[vector4a],  $ac1                            \n\t"
69233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mtlo             %[vector4a],  $ac2                            \n\t"
70233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mtlo             %[vector4a],  $ac3                            \n\t"
71233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mthi             $zero,        $ac0                            \n\t"
72233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mthi             $zero,        $ac1                            \n\t"
73233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mthi             $zero,        $ac2                            \n\t"
74233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mthi             $zero,        $ac3                            \n\t"
75233d2500723e5594f3e7c70896ffeeef32b9c950ywan
76233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
77233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
78233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
79233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
80233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
81233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
82233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
83233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
84233d2500723e5594f3e7c70896ffeeef32b9c950ywan
85233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
86233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
87233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
88233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
89233d2500723e5594f3e7c70896ffeeef32b9c950ywan
90233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
91233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
92233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
93233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
94233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
95233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
96233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
97233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
98233d2500723e5594f3e7c70896ffeeef32b9c950ywan
99233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
100233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
101233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
102233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
103233d2500723e5594f3e7c70896ffeeef32b9c950ywan
104233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
105233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
106233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
107233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
108233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
109233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
110233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
111233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
112233d2500723e5594f3e7c70896ffeeef32b9c950ywan
113233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
114233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
115233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
116233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
117233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
118233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
119233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
120233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
121233d2500723e5594f3e7c70896ffeeef32b9c950ywan
122233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
123233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
124233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "extp             %[Temp1],     $ac0,           31              \n\t"
125233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
126233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
127233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "extp             %[Temp2],     $ac1,           31              \n\t"
128233d2500723e5594f3e7c70896ffeeef32b9c950ywan
129233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
130233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
131233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
132233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
133233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
134233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
135233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
136233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
137233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
138233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
139233d2500723e5594f3e7c70896ffeeef32b9c950ywan
140233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
141233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
142233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
143233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
144233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "extp             %[Temp1],     $ac2,           31              \n\t"
145233d2500723e5594f3e7c70896ffeeef32b9c950ywan
146233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
147233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
148233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
149233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
150233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "extp             %[Temp2],     $ac3,           31              \n\t"
151233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
152233d2500723e5594f3e7c70896ffeeef32b9c950ywan
153233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
154233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
155233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
156233d2500723e5594f3e7c70896ffeeef32b9c950ywan
157233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
158233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
159233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
160233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
161233d2500723e5594f3e7c70896ffeeef32b9c950ywan
162233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
163233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
164233d2500723e5594f3e7c70896ffeeef32b9c950ywan
165233d2500723e5594f3e7c70896ffeeef32b9c950ywan          : [load1] "=&r" (load1), [load2] "=&r" (load2),
166233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [load3] "=&r" (load3), [load4] "=&r" (load4),
167233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
168233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
169233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
170233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [store1] "=&r" (store1), [store2] "=&r" (store2),
171233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [src_ptr] "+r" (src_ptr)
172233d2500723e5594f3e7c70896ffeeef32b9c950ywan          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
173233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
174233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [vector4a] "r" (vector4a),
175233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
176233d2500723e5594f3e7c70896ffeeef32b9c950ywan      );
177233d2500723e5594f3e7c70896ffeeef32b9c950ywan    }
178233d2500723e5594f3e7c70896ffeeef32b9c950ywan
179233d2500723e5594f3e7c70896ffeeef32b9c950ywan    /* Next row... */
180233d2500723e5594f3e7c70896ffeeef32b9c950ywan    src += src_stride;
181233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dst += dst_stride;
182233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
183233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
184233d2500723e5594f3e7c70896ffeeef32b9c950ywan
185233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void convolve_avg_vert_64_dspr2(const uint8_t *src,
186233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       int32_t src_stride,
187233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       uint8_t *dst,
188233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       int32_t dst_stride,
189233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       const int16_t *filter_y,
190233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       int32_t h) {
191233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int32_t       x, y;
192233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const uint8_t *src_ptr;
193233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint8_t       *dst_ptr;
194233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint8_t       *cm = vp9_ff_cropTbl;
195233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      vector4a = 64;
196233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      load1, load2, load3, load4;
197233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      p1, p2;
198233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      n1, n2;
199233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      scratch1, scratch2;
200233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t      store1, store2;
201233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int32_t       vector1b, vector2b, vector3b, vector4b;
202233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int32_t       Temp1, Temp2;
203233d2500723e5594f3e7c70896ffeeef32b9c950ywan
204233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vector1b = ((const int32_t *)filter_y)[0];
205233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vector2b = ((const int32_t *)filter_y)[1];
206233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vector3b = ((const int32_t *)filter_y)[2];
207233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vector4b = ((const int32_t *)filter_y)[3];
208233d2500723e5594f3e7c70896ffeeef32b9c950ywan
209233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src -= 3 * src_stride;
210233d2500723e5594f3e7c70896ffeeef32b9c950ywan
211233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (y = h; y--;) {
212233d2500723e5594f3e7c70896ffeeef32b9c950ywan    /* prefetch data to cache memory */
213233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp9_prefetch_store(dst + dst_stride);
214233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp9_prefetch_store(dst + dst_stride + 32);
215233d2500723e5594f3e7c70896ffeeef32b9c950ywan
216233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (x = 0; x < 64; x += 4) {
217233d2500723e5594f3e7c70896ffeeef32b9c950ywan      src_ptr = src + x;
218233d2500723e5594f3e7c70896ffeeef32b9c950ywan      dst_ptr = dst + x;
219233d2500723e5594f3e7c70896ffeeef32b9c950ywan
220233d2500723e5594f3e7c70896ffeeef32b9c950ywan      __asm__ __volatile__ (
221233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
222233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
223233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
224233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
225233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
226233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
227233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
228233d2500723e5594f3e7c70896ffeeef32b9c950ywan
229233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mtlo             %[vector4a],  $ac0                            \n\t"
230233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mtlo             %[vector4a],  $ac1                            \n\t"
231233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mtlo             %[vector4a],  $ac2                            \n\t"
232233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mtlo             %[vector4a],  $ac3                            \n\t"
233233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mthi             $zero,        $ac0                            \n\t"
234233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mthi             $zero,        $ac1                            \n\t"
235233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mthi             $zero,        $ac2                            \n\t"
236233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "mthi             $zero,        $ac3                            \n\t"
237233d2500723e5594f3e7c70896ffeeef32b9c950ywan
238233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
239233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
240233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
241233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
242233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
243233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
244233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
245233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
246233d2500723e5594f3e7c70896ffeeef32b9c950ywan
247233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
248233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
249233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
250233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
251233d2500723e5594f3e7c70896ffeeef32b9c950ywan
252233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
253233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
254233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
255233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
257233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
285233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "extp             %[Temp1],     $ac0,           31              \n\t"
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "extp             %[Temp2],     $ac1,           31              \n\t"
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
297233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
299233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
304233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "extp             %[Temp1],     $ac2,           31              \n\t"
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "extp             %[Temp2],     $ac3,           31              \n\t"
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan          : [load1] "=&r" (load1), [load2] "=&r" (load2),
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [load3] "=&r" (load3), [load4] "=&r" (load4),
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [store1] "=&r" (store1), [store2] "=&r" (store2),
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [src_ptr] "+r" (src_ptr)
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [vector4a] "r" (vector4a),
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan      );
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan    }
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan    /* Next row... */
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan    src += src_stride;
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan    dst += dst_stride;
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Vertical 8-tap convolution whose result is averaged into dst (DSPr2 entry
 * point).
 *
 * Dispatch order:
 *   1. y_step_q4 != 16 (scaled prediction): always handled by the C
 *      reference vp9_convolve8_avg_vert_c(). This check must come first;
 *      the shortcuts below only inspect the kernel passed in filter_y and
 *      would otherwise be taken for scaled calls as well.
 *   2. Taps 2..3 of filter_y, read as one 32-bit word, equal 0x800000
 *      (tap 2 == 0 and tap 3 == 128 on a little-endian target): plain
 *      src/dst average through vp9_convolve_avg().
 *   3. Taps 0..1 of filter_y, read as one 32-bit word, equal 0:
 *      vp9_convolve2_avg_vert_dspr2().
 *   4. Otherwise: the DSPr2 kernels for w == 4/8/16/32 and w == 64, and the
 *      C reference for any other width.
 *
 * filter_x and x_step_q4 are only forwarded to the callees.
 */
void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  /* Scaled case: none of the fast paths below step through the kernel
   * phases, so defer to the reference implementation. */
  if (16 != y_step_q4) {
    vp9_convolve8_avg_vert_c(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
    return;
  }

  if (((const int32_t *)filter_y)[1] == 0x800000) {
    /* No filtering needed, just average src into dst. */
    vp9_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    /* First two taps are zero: use the dedicated short-kernel routine. */
    vp9_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
    uint32_t pos = 38;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );

    vp9_prefetch_store(dst);

    switch (w) {
      case 4:
      case 8:
      case 16:
      case 32:
        /* The same kernel serves all of these widths; w is passed in. */
        convolve_avg_vert_4_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, w, h);
        break;
      case 64:
        vp9_prefetch_store(dst + 32);
        convolve_avg_vert_64_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_y, h);
        break;
      default:
        /* Widths without a DSPr2 kernel use the C reference. */
        vp9_convolve8_avg_vert_c(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
        break;
    }
  }
}
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan
410233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan                             uint8_t *dst, ptrdiff_t dst_stride,
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan                             const int16_t *filter_x, int x_step_q4,
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan                             const int16_t *filter_y, int y_step_q4,
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan                             int w, int h) {
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan  /* Fixed size intermediate buffer places limits on parameters. */
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan  assert(w <= 64);
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan  assert(h <= 64);
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan  if (intermediate_height < h)
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan    intermediate_height = h;
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan  if (x_step_q4 != 16 || y_step_q4 != 16)
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return vp9_convolve8_avg_c(src, src_stride,
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               dst, dst_stride,
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               filter_x, x_step_q4,
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               filter_y, y_step_q4,
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               w, h);
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan                      temp, 64,
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan                      filter_x, x_step_q4,
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan                      filter_y, y_step_q4,
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan                      w, intermediate_height);
437233d2500723e5594f3e7c70896ffeeef32b9c950ywan
438233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         dst, dst_stride,
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         filter_x, x_step_q4,
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         filter_y, y_step_q4,
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         w, h);
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan
445233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            uint8_t *dst, ptrdiff_t dst_stride,
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            const int16_t *filter_x, int filter_x_stride,
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            const int16_t *filter_y, int filter_y_stride,
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            int w, int h) {
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int x, y;
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t tp1, tp2, tn1;
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan  uint32_t tp3, tp4, tn2;
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan  /* prefetch data to cache memory */
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vp9_prefetch_load(src);
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vp9_prefetch_load(src + 32);
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vp9_prefetch_store(dst);
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan  switch (w) {
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan    case 4:
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan      /* 1 word storage */
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan      for (y = h; y--; ) {
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride);
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride + 32);
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_store(dst + dst_stride);
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan        __asm__ __volatile__ (
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         0(%[src])      \n\t"
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         0(%[dst])      \n\t"
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan              [tp2] "=&r" (tp2)
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [src] "r" (src), [dst] "r" (dst)
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan        );
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src += src_stride;
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst += dst_stride;
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan      }
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan      break;
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan    case 8:
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan      /* 2 word storage */
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan      for (y = h; y--; ) {
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride);
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride + 32);
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_store(dst + dst_stride);
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan        __asm__ __volatile__ (
490233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         0(%[src])      \n\t"
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         0(%[dst])      \n\t"
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         4(%[src])      \n\t"
493233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         4(%[dst])      \n\t"
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
498233d2500723e5594f3e7c70896ffeeef32b9c950ywan
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [src] "r" (src), [dst] "r" (dst)
503233d2500723e5594f3e7c70896ffeeef32b9c950ywan        );
504233d2500723e5594f3e7c70896ffeeef32b9c950ywan
505233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src += src_stride;
506233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst += dst_stride;
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan      }
508233d2500723e5594f3e7c70896ffeeef32b9c950ywan      break;
509233d2500723e5594f3e7c70896ffeeef32b9c950ywan    case 16:
510233d2500723e5594f3e7c70896ffeeef32b9c950ywan      /* 4 word storage */
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan      for (y = h; y--; ) {
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride);
513233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride + 32);
514233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_store(dst + dst_stride);
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan        __asm__ __volatile__ (
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         0(%[src])      \n\t"
518233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         0(%[dst])      \n\t"
519233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         4(%[src])      \n\t"
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         4(%[dst])      \n\t"
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         8(%[src])      \n\t"
523233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         8(%[dst])      \n\t"
524233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
525233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
527233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         12(%[src])     \n\t"
528233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         12(%[dst])     \n\t"
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
530233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
531233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
532233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
535233d2500723e5594f3e7c70896ffeeef32b9c950ywan              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
537233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [src] "r" (src), [dst] "r" (dst)
538233d2500723e5594f3e7c70896ffeeef32b9c950ywan        );
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src += src_stride;
541233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst += dst_stride;
542233d2500723e5594f3e7c70896ffeeef32b9c950ywan      }
543233d2500723e5594f3e7c70896ffeeef32b9c950ywan      break;
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan    case 32:
545233d2500723e5594f3e7c70896ffeeef32b9c950ywan      /* 8 word storage */
546233d2500723e5594f3e7c70896ffeeef32b9c950ywan      for (y = h; y--; ) {
547233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride);
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride + 32);
549233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_store(dst + dst_stride);
550233d2500723e5594f3e7c70896ffeeef32b9c950ywan
551233d2500723e5594f3e7c70896ffeeef32b9c950ywan        __asm__ __volatile__ (
552233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         0(%[src])      \n\t"
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         0(%[dst])      \n\t"
554233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         4(%[src])      \n\t"
555233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         4(%[dst])      \n\t"
556233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
557233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         8(%[src])      \n\t"
558233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         8(%[dst])      \n\t"
559233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
560233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         12(%[src])     \n\t"
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         12(%[dst])     \n\t"
564233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
565233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         16(%[src])     \n\t"
566233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         16(%[dst])     \n\t"
567233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
568233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
569233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         20(%[src])     \n\t"
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         20(%[dst])     \n\t"
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         24(%[src])     \n\t"
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         24(%[dst])     \n\t"
575233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
576233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
577233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
578233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         28(%[src])     \n\t"
579233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         28(%[dst])     \n\t"
580233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
581233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
582233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
583233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
584233d2500723e5594f3e7c70896ffeeef32b9c950ywan
585233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
586233d2500723e5594f3e7c70896ffeeef32b9c950ywan              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
587233d2500723e5594f3e7c70896ffeeef32b9c950ywan              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
588233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [src] "r" (src), [dst] "r" (dst)
589233d2500723e5594f3e7c70896ffeeef32b9c950ywan        );
590233d2500723e5594f3e7c70896ffeeef32b9c950ywan
591233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src += src_stride;
592233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst += dst_stride;
593233d2500723e5594f3e7c70896ffeeef32b9c950ywan      }
594233d2500723e5594f3e7c70896ffeeef32b9c950ywan      break;
595233d2500723e5594f3e7c70896ffeeef32b9c950ywan    case 64:
596233d2500723e5594f3e7c70896ffeeef32b9c950ywan      vp9_prefetch_load(src + 64);
597233d2500723e5594f3e7c70896ffeeef32b9c950ywan      vp9_prefetch_store(dst + 32);
598233d2500723e5594f3e7c70896ffeeef32b9c950ywan
599233d2500723e5594f3e7c70896ffeeef32b9c950ywan      /* 16 word storage */
600233d2500723e5594f3e7c70896ffeeef32b9c950ywan      for (y = h; y--; ) {
601233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride);
602233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride + 32);
603233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_load(src + src_stride + 64);
604233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_store(dst + dst_stride);
605233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp9_prefetch_store(dst + dst_stride + 32);
606233d2500723e5594f3e7c70896ffeeef32b9c950ywan
607233d2500723e5594f3e7c70896ffeeef32b9c950ywan        __asm__ __volatile__ (
608233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         0(%[src])      \n\t"
609233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         0(%[dst])      \n\t"
610233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         4(%[src])      \n\t"
611233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         4(%[dst])      \n\t"
612233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
613233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         8(%[src])      \n\t"
614233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         8(%[dst])      \n\t"
615233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
616233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
617233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
618233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         12(%[src])     \n\t"
619233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         12(%[dst])     \n\t"
620233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
621233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         16(%[src])     \n\t"
622233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         16(%[dst])     \n\t"
623233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
624233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
625233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
626233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         20(%[src])     \n\t"
627233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         20(%[dst])     \n\t"
628233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
629233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         24(%[src])     \n\t"
630233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         24(%[dst])     \n\t"
631233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
632233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
633233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
634233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         28(%[src])     \n\t"
635233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         28(%[dst])     \n\t"
636233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
637233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         32(%[src])     \n\t"
638233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         32(%[dst])     \n\t"
639233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
640233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
641233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
642233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         36(%[src])     \n\t"
643233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         36(%[dst])     \n\t"
644233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
645233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         40(%[src])     \n\t"
646233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         40(%[dst])     \n\t"
647233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
648233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
649233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
650233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         44(%[src])     \n\t"
651233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         44(%[dst])     \n\t"
652233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
653233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         48(%[src])     \n\t"
654233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         48(%[dst])     \n\t"
655233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
658233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         52(%[src])     \n\t"
659233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         52(%[dst])     \n\t"
660233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
661233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp1],         56(%[src])     \n\t"
662233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp2],         56(%[dst])     \n\t"
663233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
664233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
665233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
666233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp3],         60(%[src])     \n\t"
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "ulw              %[tp4],         60(%[dst])     \n\t"
668233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
669233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
670233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
671233d2500723e5594f3e7c70896ffeeef32b9c950ywan            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */
672233d2500723e5594f3e7c70896ffeeef32b9c950ywan
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
674233d2500723e5594f3e7c70896ffeeef32b9c950ywan              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
675233d2500723e5594f3e7c70896ffeeef32b9c950ywan              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
676233d2500723e5594f3e7c70896ffeeef32b9c950ywan            : [src] "r" (src), [dst] "r" (dst)
677233d2500723e5594f3e7c70896ffeeef32b9c950ywan        );
678233d2500723e5594f3e7c70896ffeeef32b9c950ywan
679233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src += src_stride;
680233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst += dst_stride;
681233d2500723e5594f3e7c70896ffeeef32b9c950ywan      }
682233d2500723e5594f3e7c70896ffeeef32b9c950ywan      break;
683233d2500723e5594f3e7c70896ffeeef32b9c950ywan    default:
684233d2500723e5594f3e7c70896ffeeef32b9c950ywan      for (y = h; y > 0; --y) {
685233d2500723e5594f3e7c70896ffeeef32b9c950ywan        for (x = 0; x < w; ++x) {
686233d2500723e5594f3e7c70896ffeeef32b9c950ywan          dst[x] = (dst[x] + src[x] + 1) >> 1;
687233d2500723e5594f3e7c70896ffeeef32b9c950ywan        }
688233d2500723e5594f3e7c70896ffeeef32b9c950ywan
689233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src += src_stride;
690233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst += dst_stride;
691233d2500723e5594f3e7c70896ffeeef32b9c950ywan      }
692233d2500723e5594f3e7c70896ffeeef32b9c950ywan      break;
693233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
694233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
695233d2500723e5594f3e7c70896ffeeef32b9c950ywan#endif
696