/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
#if HAVE_DSPR2
235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                         int32_t src_stride,
255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                         uint8_t *dst,
265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                         int32_t dst_stride,
275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                         const int16_t *filter_y,
285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                         int32_t w,
295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                         int32_t h) {
305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       x, y;
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src_ptr;
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t       *dst_ptr;
335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t       *cm = vp9_ff_cropTbl;
345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      vector4a = 64;
355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      load1, load2;
365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      p1, p2;
375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      scratch1, scratch2;
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      store1, store2;
395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       Temp1, Temp2;
405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *filter = &filter_y[3];
415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      filter45;
425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter45 = ((const int32_t *)filter)[0];
445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_store(dst + dst_stride);
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (x = 0; x < w; x += 4) {
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src_ptr = src + x;
515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst_ptr = dst + x;
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      __asm__ __volatile__ (
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac0                            \n\t"
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac1                            \n\t"
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac2                            \n\t"
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac3                            \n\t"
625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac0                            \n\t"
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                            \n\t"
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                            \n\t"
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                            \n\t"
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac0,           31              \n\t"
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac1,           31              \n\t"
855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac2,           31              \n\t"
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac3,           31              \n\t"
965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [load1] "=&r" (load1), [load2] "=&r" (load2),
1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [p1] "=&r" (p1), [p2] "=&r" (p2),
1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [store1] "=&r" (store1), [store2] "=&r" (store2),
1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src_ptr] "+r" (src_ptr)
1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src_stride] "r" (src_stride), [cm] "r" (cm),
1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [dst_ptr] "r" (dst_ptr)
1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      );
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          int32_t src_stride,
1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          uint8_t *dst,
1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          int32_t dst_stride,
1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          const int16_t *filter_y,
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          int32_t h) {
1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       x, y;
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src_ptr;
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t       *dst_ptr;
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t       *cm = vp9_ff_cropTbl;
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      vector4a = 64;
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      load1, load2;
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      p1, p2;
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      scratch1, scratch2;
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t      store1, store2;
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t       Temp1, Temp2;
1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *filter = &filter_y[3];
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t filter45;;
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter45 = ((const int32_t *)filter)[0];
1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_store(dst + dst_stride);
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_store(dst + dst_stride + 32);
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (x = 0; x < 64; x += 4) {
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src_ptr = src + x;
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst_ptr = dst + x;
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      __asm__ __volatile__ (
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
1615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
1625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac0                            \n\t"
1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac1                            \n\t"
1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac2                            \n\t"
1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac3                            \n\t"
1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac0                            \n\t"
1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                            \n\t"
1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                            \n\t"
1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                            \n\t"
1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac0,           31              \n\t"
1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac1,           31              \n\t"
1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac2,           31              \n\t"
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac3,           31              \n\t"
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [load1] "=&r" (load1), [load2] "=&r" (load2),
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [p1] "=&r" (p1), [p2] "=&r" (p2),
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [store1] "=&r" (store1), [store2] "=&r" (store2),
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src_ptr] "+r" (src_ptr)
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src_stride] "r" (src_stride), [cm] "r" (cm),
2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [dst_ptr] "r" (dst_ptr)
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      );
2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/* Entry point for the 2-tap vertical convolve-and-average.
 *
 * The DSPr2 kernels are used only when y_step_q4 is 16 and w is one of
 * 4, 8, 16, 32 or 64; every other combination is forwarded unchanged to
 * vp9_convolve8_avg_vert_c().
 */
void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  uint32_t dsp_pos = 38;

  /* Any step other than 16 is handled by the C implementation. */
  if (y_step_q4 != 16) {
    vp9_convolve8_avg_vert_c(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
    return;
  }

  /* bit position used by the extp instructions in the kernels */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (dsp_pos)
  );

  vp9_prefetch_store(dst);

  if (w == 64) {
    /* a 64-pixel row also gets its second 32 bytes prefetched */
    vp9_prefetch_store(dst + 32);
    convolve_bi_avg_vert_64_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, h);
  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
    convolve_bi_avg_vert_4_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_y, w, h);
  } else {
    /* widths without a DSPr2 kernel */
    vp9_convolve8_avg_vert_c(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
  }
}
#endif