/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <assert.h>
125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <stdio.h>
135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vpx_config.h"
155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vp9_rtcd.h"
165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_common.h"
175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vpx/vpx_integer.h"
185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vpx_ports/mem.h"
195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_convolve.h"
205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2
235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_avg_vert_4_dspr2(const uint8_t *src,
245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                      int32_t src_stride,
255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                      uint8_t *dst,
265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                      int32_t dst_stride,
275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                      const int16_t *filter_y,
285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                      int32_t w,
295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                      int32_t h) {
305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       x, y;
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src_ptr;
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t       *dst_ptr;
335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t       *cm = vp9_ff_cropTbl;
345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      vector4a = 64;
355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      load1, load2, load3, load4;
365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      p1, p2;
375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      n1, n2;
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      scratch1, scratch2;
395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      store1, store2;
405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       vector1b, vector2b, vector3b, vector4b;
415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       Temp1, Temp2;
425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_y)[0];
445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_y)[1];
455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_y)[2];
465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_y)[3];
475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  src -= 3 * src_stride;
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_store(dst + dst_stride);
535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (x = 0; x < w; x += 4) {
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src_ptr = src + x;
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst_ptr = dst + x;
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      __asm__ __volatile__ (
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac0                            \n\t"
685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac1                            \n\t"
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac2                            \n\t"
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac3                            \n\t"
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac0                            \n\t"
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                            \n\t"
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                            \n\t"
745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                            \n\t"
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac0,           31              \n\t"
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac1,           31              \n\t"
1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac2,           31              \n\t"
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac3,           31              \n\t"
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
1615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [load1] "=&r" (load1), [load2] "=&r" (load2),
1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [load3] "=&r" (load3), [load4] "=&r" (load4),
1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [store1] "=&r" (store1), [store2] "=&r" (store2),
1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src_ptr] "+r" (src_ptr)
1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [vector4a] "r" (vector4a),
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      );
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_avg_vert_64_dspr2(const uint8_t *src,
1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                       int32_t src_stride,
1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                       uint8_t *dst,
1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                       int32_t dst_stride,
1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                       const int16_t *filter_y,
1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                       int32_t h) {
1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       x, y;
1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src_ptr;
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t       *dst_ptr;
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t       *cm = vp9_ff_cropTbl;
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      vector4a = 64;
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      load1, load2, load3, load4;
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      p1, p2;
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      n1, n2;
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      scratch1, scratch2;
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      store1, store2;
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       vector1b, vector2b, vector3b, vector4b;
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       Temp1, Temp2;
2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_y)[0];
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_y)[1];
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_y)[2];
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_y)[3];
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  src -= 3 * src_stride;
2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_store(dst + dst_stride);
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_store(dst + dst_stride + 32);
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (x = 0; x < 64; x += 4) {
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src_ptr = src + x;
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst_ptr = dst + x;
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      __asm__ __volatile__ (
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac0                            \n\t"
2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac1                            \n\t"
2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac2                            \n\t"
2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac3                            \n\t"
2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac0                            \n\t"
2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                            \n\t"
2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                            \n\t"
2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                            \n\t"
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
2435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
2465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
2575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
2585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
2595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
2605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
2645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
2705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
2725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
2775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
2785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
2805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
2815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
2825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
2855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
2865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac0,           31              \n\t"
2875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
2885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
2895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac1,           31              \n\t"
2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
2935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac2,           31              \n\t"
3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
3125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac3,           31              \n\t"
3135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
3165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
3175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
3185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
3205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
3225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
3235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [load1] "=&r" (load1), [load2] "=&r" (load2),
3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [load3] "=&r" (load3), [load4] "=&r" (load4),
3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [store1] "=&r" (store1), [store2] "=&r" (store2),
3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src_ptr] "+r" (src_ptr)
3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
3355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [vector4a] "r" (vector4a),
3375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
3385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      );
3395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
3405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
3425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
3435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
3445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
3455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/*
 * Vertical convolve-and-average entry point for DSPr2.
 *
 * Picks an implementation from the contents of filter_y, the vertical step
 * and the block width:
 *   - second 32-bit word of filter_y equal to 0x800000 -> vp9_convolve_avg
 *   - first 32-bit word of filter_y equal to zero      -> 2-tap DSPr2 routine
 *   - y_step_q4 == 16 and w in {4, 8, 16, 32, 64}      -> 8-tap DSPr2 routines
 *   - anything else                                    -> C implementation
 *
 * filter_x and x_step_q4 are not examined here; they are only forwarded to
 * the routines that take the full argument list.
 */
void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  /* The 16-bit taps are inspected two at a time through a 32-bit view. */
  const int32_t *const tap_pairs = (const int32_t *)filter_y;
  uint32_t pos;

  /* On a little-endian target this word is taps [2] == 0 and [3] == 128. */
  if (tap_pairs[1] == 0x800000) {
    vp9_convolve_avg(src, src_stride, dst, dst_stride,
                     filter_x, x_step_q4, filter_y, y_step_q4,
                     w, h);
    return;
  }

  /* Taps [0] and [1] are both zero: use the 2-tap variant. */
  if (tap_pairs[0] == 0) {
    vp9_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride,
                                 filter_x, x_step_q4, filter_y, y_step_q4,
                                 w, h);
    return;
  }

  /* Only a step of exactly 16 is handled by the DSPr2 8-tap code. */
  if (y_step_q4 != 16) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
    return;
  }

  /* bit position for extract from acc */
  pos = 38;
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  vp9_prefetch_store(dst);

  if (w == 64) {
    vp9_prefetch_store(dst + 32);
    convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride,
                               filter_y, h);
  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
    /* One routine covers every supported width below 64. */
    convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride,
                              filter_y, w, h);
  } else {
    /* Unsupported width: let the C implementation handle it. */
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
  }
}
4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             uint8_t *dst, ptrdiff_t dst_stride,
4125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             const int16_t *filter_x, int x_step_q4,
4135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             const int16_t *filter_y, int y_step_q4,
4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             int w, int h) {
4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* Fixed size intermediate buffer places limits on parameters. */
4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
4175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  assert(w <= 64);
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  assert(h <= 64);
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (intermediate_height < h)
4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    intermediate_height = h;
4245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (x_step_q4 != 16 || y_step_q4 != 16)
4265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    return vp9_convolve8_avg_c(src, src_stride,
4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               dst, dst_stride,
4285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               filter_x, x_step_q4,
4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               filter_y, y_step_q4,
4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               w, h);
4315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                      temp, 64,
4345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                      filter_x, x_step_q4,
4355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                      filter_y, y_step_q4,
4365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                      w, intermediate_height);
4375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
4395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         dst, dst_stride,
4405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         filter_x, x_step_q4,
4415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         filter_y, y_step_q4,
4425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         w, h);
4435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
4445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
4465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            uint8_t *dst, ptrdiff_t dst_stride,
4475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            const int16_t *filter_x, int filter_x_stride,
4485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            const int16_t *filter_y, int filter_y_stride,
4495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            int w, int h) {
4505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int x, y;
4515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp1, tp2, tn1;
4525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp3, tp4, tn2;
4535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* prefetch data to cache memory */
4555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(src);
4565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(src + 32);
4575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_store(dst);
4585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  switch (w) {
4605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 4:
4615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 1 word storage */
4625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
4635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
4645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
4655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
4665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
4685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
4695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
4705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
4715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
4725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp2] "=&r" (tp2)
4755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
4765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
4775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
4795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
4805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
4815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
4825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 8:
4835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 2 word storage */
4845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
4855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
4865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
4875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
4885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
4905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
4915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
4925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         4(%[src])      \n\t"
4935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         4(%[dst])      \n\t"
4945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
4955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
4965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
4985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
5005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
5015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
5025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
5035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
5045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
5065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
5075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
5085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
5095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 16:
5105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 4 word storage */
5115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
5125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
5135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
5145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
5155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
5185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
5195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         4(%[src])      \n\t"
5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         4(%[dst])      \n\t"
5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
5225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         8(%[src])      \n\t"
5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         8(%[dst])      \n\t"
5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
5275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         12(%[src])     \n\t"
5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[dst])     \n\t"
5295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
5305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
5315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
5325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
5335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
5355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
5365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
5375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
5385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
5395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
5415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
5425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
5435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
5445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 32:
5455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 8 word storage */
5465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
5475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
5485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
5495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
5505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
5525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
5535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
5545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         4(%[src])      \n\t"
5555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         4(%[dst])      \n\t"
5565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
5575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         8(%[src])      \n\t"
5585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         8(%[dst])      \n\t"
5595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         12(%[src])     \n\t"
5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[dst])     \n\t"
5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         16(%[src])     \n\t"
5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         16(%[dst])     \n\t"
5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
5685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
5695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
5705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         20(%[src])     \n\t"
5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         20(%[dst])     \n\t"
5725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
5735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         24(%[src])     \n\t"
5745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         24(%[dst])     \n\t"
5755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
5765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
5775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
5785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         28(%[src])     \n\t"
5795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         28(%[dst])     \n\t"
5805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
5815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
5825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
5835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
5845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
5865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
5895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
5925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
5935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
5945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
5955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 64:
5965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      vp9_prefetch_load(src + 64);
5975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      vp9_prefetch_store(dst + 32);
5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 16 word storage */
6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 64);
6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
6055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride + 32);
6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
6085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
6105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         4(%[src])      \n\t"
6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         4(%[dst])      \n\t"
6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
6135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         8(%[src])      \n\t"
6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         8(%[dst])      \n\t"
6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
6175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
6185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         12(%[src])     \n\t"
6195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[dst])     \n\t"
6205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
6215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         16(%[src])     \n\t"
6225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         16(%[dst])     \n\t"
6235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
6245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
6255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
6265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         20(%[src])     \n\t"
6275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         20(%[dst])     \n\t"
6285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
6295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         24(%[src])     \n\t"
6305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         24(%[dst])     \n\t"
6315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
6325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
6335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
6345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         28(%[src])     \n\t"
6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         28(%[dst])     \n\t"
6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         32(%[src])     \n\t"
6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         32(%[dst])     \n\t"
6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         36(%[src])     \n\t"
6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         36(%[dst])     \n\t"
6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         40(%[src])     \n\t"
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         40(%[dst])     \n\t"
6475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
6495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
6505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         44(%[src])     \n\t"
6515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         44(%[dst])     \n\t"
6525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         48(%[src])     \n\t"
6545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         48(%[dst])     \n\t"
6555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
6565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
6585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         52(%[src])     \n\t"
6595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         52(%[dst])     \n\t"
6605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
6615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         56(%[src])     \n\t"
6625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         56(%[dst])     \n\t"
6635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
6655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
6665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         60(%[src])     \n\t"
6675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         60(%[dst])     \n\t"
6685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
6695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
6705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
6715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */
6725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
6745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
6755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
6765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
6775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
6785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
6805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
6815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
6825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
6835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    default:
6845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y > 0; --y) {
6855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        for (x = 0; x < w; ++x) {
6865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          dst[x] = (dst[x] + src[x] + 1) >> 1;
6875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        }
6885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
6905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
6915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
6925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
6945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
6955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif
696