/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2
215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
227bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                          int32_t src_stride, uint8_t *dst,
235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          int32_t dst_stride,
247bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                          const int16_t *filter_x0, int32_t h) {
255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y;
26da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int32_t Temp1, Temp2, Temp3, Temp4;
285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector4a = 64;
295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp1, tp2;
305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3;
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tn1, tn2;
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *filter = &filter_x0[3];
337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t filter45;
345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter45 = ((const int32_t *)filter)[0];
365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
39da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src + src_stride);
40da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src + src_stride + 32);
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst + dst_stride);
425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
437bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],         0(%[src])                      \n\t"
455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         4(%[src])                      \n\t"
465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 1. pixel */
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                           \n\t"
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                           \n\t"
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac3,           31             \n\t"
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 2. pixel */
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                           \n\t"
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                           \n\t"
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tp2],         %[tp1],         3              \n\t"
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac2,           31             \n\t"
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
627bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 1. pixel */
657bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                           \n\t"
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                           \n\t"
687bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p3],          %[tp2]                         \n\t"
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],       $ac3,           31             \n\t"
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
747bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 2. pixel */
777bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                           \n\t"
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                           \n\t"
807bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
817bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p3],          %[filter45]    \n\t"
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp4],       $ac2,           31             \n\t"
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
857bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
867bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* clamp */
897bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
907bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "lbux             %[p3],          %[Temp4](%[cm])                \n\t" /* odd 2 */
917bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
937bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
947bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
967bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t" /* average odd 2 */
977bc9febe8749e98a3812a0dc4380ceae75c29450Johann        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
997bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
1007bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
1017bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1027bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [Temp4] "=&r"(Temp4)
1037bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
1047bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [dst] "r"(dst), [src] "r"(src));
1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
1137bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                          int32_t src_stride, uint8_t *dst,
1147bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                          int32_t dst_stride,
1157bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                          const int16_t *filter_x0, int32_t h) {
1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y;
117da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector4a = 64;
1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3;
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp1, tp2, tp3, tp4;
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, n1;
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st0, st1;
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *filter = &filter_x0[3];
1247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t filter45;
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter45 = ((const int32_t *)filter)[0];
1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
130da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src + src_stride);
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src + src_stride + 32);
132da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst + dst_stride);
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1347bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],         0(%[src])                      \n\t"
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         4(%[src])                      \n\t"
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 1. pixel */
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                           \n\t"
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                           \n\t"
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                           \n\t"
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                           \n\t"
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp3],         8(%[src])                      \n\t"
1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac3,           31             \n\t"
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu              %[Temp2],       0(%[dst])                      \n\t"
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu              %[tp4],         2(%[dst])                      \n\t"
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 2. pixel */
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac2,           31             \n\t"
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 3. pixel */
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac1                           \n\t"
1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac1                           \n\t"
1615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
1625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac1,           31             \n\t"
1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addqh_r.w        %[tp4],         %[tp4],         %[st1]         \n\t"
1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[Temp2],       0(%[dst])                      \n\t"
1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp4],         2(%[dst])                      \n\t"
1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 4. pixel */
1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                           \n\t"
1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                           \n\t"
1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                           \n\t"
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                           \n\t"
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tp3],         %[tp2],         3              \n\t"
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tp2],         %[tp1],         3              \n\t"
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu              %[Temp2],       4(%[dst])                      \n\t"
1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac2,           31             \n\t"
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 1. pixel */
1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac1                           \n\t"
1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac1                           \n\t"
1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[Temp2],       4(%[dst])                      \n\t"
1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],       $ac3,           31             \n\t"
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu              %[tp1],         6(%[dst])                      \n\t"
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 2. pixel */
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                           \n\t"
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                           \n\t"
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                           \n\t"
2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                           \n\t"
2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac1,           31             \n\t"
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu              %[tp2],         1(%[dst])                      \n\t"
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu              %[tp3],         3(%[dst])                      \n\t"
2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 3. pixel */
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],       $ac3,           31             \n\t"
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu              %[tp4],         5(%[dst])                      \n\t"
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 4. pixel */
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp2],         1(%[dst])                      \n\t"
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp1],         6(%[dst])                      \n\t"
2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac2,           31             \n\t"
2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbu              %[tp1],         7(%[dst])                      \n\t"
2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* clamp */
2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addqh_r.w        %[tp3],         %[tp3],         %[p4]          \n\t"
2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addqh_r.w        %[tp4],         %[tp4],         %[p2]          \n\t"
2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addqh_r.w        %[tp1],         %[tp1],         %[p1]          \n\t"
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* store bytes */
2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp3],         3(%[dst])                      \n\t"
2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp4],         5(%[dst])                      \n\t"
2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp1],         7(%[dst])                      \n\t"
2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2437bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
2447bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
2457bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
2467bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
2477bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
2487bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [dst] "r"(dst), [src] "r"(src));
2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
2577bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                           int32_t src_stride, uint8_t *dst_ptr,
2587bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                           int32_t dst_stride,
2597bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                           const int16_t *filter_x0, int32_t h,
2607bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                           int32_t count) {
2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y, c;
2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src;
2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst;
264da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector_64 = 64;
2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3;
2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t qload1, qload2, qload3;
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, p5;
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st1, st2, st3;
2705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *filter = &filter_x0[3];
2717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t filter45;
2725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter45 = ((const int32_t *)filter)[0];
2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src = src_ptr;
2775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst = dst_ptr;
2785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
280da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride);
281da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride + 32);
282da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst_ptr + dst_stride);
2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (c = 0; c < count; c++) {
2857bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __asm__ __volatile__(
2865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    0(%[src])                    \n\t"
2875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    4(%[src])                    \n\t"
2885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 1. pixel */
2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
2935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    8(%[src])                    \n\t"
2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 2. pixel */
3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    12(%[src])                   \n\t"
3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
3125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 3. pixel */
3165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
3175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
3185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
3195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
3205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
3225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
3235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 4. pixel */
3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
3355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 5. pixel */
3385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
3395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
3405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
3415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
3425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
3435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
3445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
3455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 6. pixel */
3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
3485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
3495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
3505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
3515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
3525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
3535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
3545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
3555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 7. pixel */
3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
3585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
3595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
3605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
3615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
3625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
3635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
3645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
3655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
3675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 8. pixel */
3695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* ODD pixels */
3785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    1(%[src])                   \n\t"
3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    5(%[src])                    \n\t"
3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
3825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 1. pixel */
3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
3895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
3905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
3915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    9(%[src])                    \n\t"
3925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
3935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
3945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
3955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
3965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
3985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 2. pixel */
4005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
4015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
4025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
4035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
4045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
4055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
4065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    13(%[src])                   \n\t"
4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
4085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 3. pixel */
4135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
4175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 4. pixel */
4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
4245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
4255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
4265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
4285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
4315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
4325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
4345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 5. pixel */
4365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
4375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
4385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
4395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
4405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
4415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
4425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
4435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
4455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 6. pixel */
4475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
4485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
4495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
4505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
4515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
4525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
4535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
4545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 7. pixel */
4565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
4575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
4585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
4595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
4605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
4615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
4625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
4635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
4655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 8. pixel */
4675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
4685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
4695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
4715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
4735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
4765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
4775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
4795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
4805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
4825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
4835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
4845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4857bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
4867bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
4877bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
4887bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
4897bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [Temp3] "=&r"(Temp3)
4907bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
4917bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [dst] "r"(dst), [src] "r"(src));
4925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src += 16;
4945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst += 16;
4955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
4965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
4985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src_ptr += src_stride;
4995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr += dst_stride;
5005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
5015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
5025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
5047bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                           int32_t src_stride, uint8_t *dst_ptr,
5057bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                           int32_t dst_stride,
5067bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                           const int16_t *filter_x0,
5077bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                           int32_t h) {
5085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y, c;
5095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src;
5105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst;
511da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
5125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector_64 = 64;
5135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3;
5145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t qload1, qload2, qload3;
5155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, p5;
5165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st1, st2, st3;
5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const int16_t *filter = &filter_x0[3];
5187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t filter45;
5195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter45 = ((const int32_t *)filter)[0];
5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src = src_ptr;
5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst = dst_ptr;
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
527da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride);
528da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride + 32);
529da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride + 64);
530da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst_ptr + dst_stride);
531da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst_ptr + dst_stride + 32);
5325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (c = 0; c < 4; c++) {
5347bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __asm__ __volatile__(
5355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    0(%[src])                    \n\t"
5365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    4(%[src])                    \n\t"
5375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 1. pixel */
5395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
5415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
5425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
5435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
5445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
5455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
5465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
5475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    8(%[src])                    \n\t"
5485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
5495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
5505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
5515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 2. pixel */
5535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
5545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
5555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
5565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
5575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    12(%[src])                   \n\t"
5585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 2 */
5595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 3. pixel */
5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
5685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
5695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
5705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
5725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
5735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 4. pixel */
5755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
5765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
5775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
5785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
5795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
5805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
5815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
5825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
5835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
5845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
5855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 5. pixel */
5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
5895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
5925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
5935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
5945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 6. pixel */
5965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
5975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 7. pixel */
6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
6085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
6105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
6135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 8. pixel */
6185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
6195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
6205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
6215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
6225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
6235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
6245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
6255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* ODD pixels */
6275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    1(%[src])                   \n\t"
6285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    5(%[src])                    \n\t"
6295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
6315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 1. pixel */
6335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
6345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    9(%[src])                    \n\t"
6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
6475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 2. pixel */
6495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
6505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
6515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
6525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
6545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
6555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    13(%[src])                   \n\t"
6565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
6585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
6595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
6605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 3. pixel */
6625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
6635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
6655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
6665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
6675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
6685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
6695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
6705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 4. pixel */
6725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
6735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
6745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
6755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
6765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
6775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
6785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
6795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
6805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
6815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
6835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 5. pixel */
6855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
6865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
6875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
6885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
6895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
6905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
6915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
6925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
6945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 6. pixel */
6965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
6975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
6985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
6995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
7005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
7015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
7025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
7035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 7. pixel */
7055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
7065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
7075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
7085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
7095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
7105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
7115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
7125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
7145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 8. pixel */
7165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
7175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
7185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
7205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
7225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
7235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
7255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
7265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
7285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
7295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
7315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
7325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
7335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7347bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
7357bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
7367bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
7377bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
7387bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [Temp3] "=&r"(Temp3)
7397bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
7407bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [dst] "r"(dst), [src] "r"(src));
7415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src += 16;
7435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst += 16;
7445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
7455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
7475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src_ptr += src_stride;
7485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr += dst_stride;
7495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
7505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
7515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
752da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
7535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                   uint8_t *dst, ptrdiff_t dst_stride,
754df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                   const InterpKernel *filter, int x0_q4,
755df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
7565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                   int w, int h) {
757df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const int16_t *const filter_x = filter[x0_q4];
758da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint32_t pos = 38;
759da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
760da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(x_step_q4 == 16);
761da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
762da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* bit positon for extract from acc */
7637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
7647bc9febe8749e98a3812a0dc4380ceae75c29450Johann                       :
7657bc9febe8749e98a3812a0dc4380ceae75c29450Johann                       : [pos] "r"(pos));
766da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
767da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  /* prefetch data to cache memory */
768da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_load(src);
769da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_load(src + 32);
770da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_store(dst);
771da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
772da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  switch (w) {
773da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    case 4:
7747bc9febe8749e98a3812a0dc4380ceae75c29450Johann      convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
7757bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    h);
776da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      break;
777da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    case 8:
7787bc9febe8749e98a3812a0dc4380ceae75c29450Johann      convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
7797bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    h);
780da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      break;
781da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    case 16:
7827bc9febe8749e98a3812a0dc4380ceae75c29450Johann      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
7837bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                     h, 1);
784da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      break;
785da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    case 32:
7867bc9febe8749e98a3812a0dc4380ceae75c29450Johann      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
7877bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                     h, 2);
788da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      break;
789da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    case 64:
790da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      prefetch_load(src + 64);
791da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      prefetch_store(dst + 32);
792da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
7937bc9febe8749e98a3812a0dc4380ceae75c29450Johann      convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
7947bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                     h);
795da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      break;
796da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    default:
797df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
798df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                x_step_q4, y0_q4, y_step_q4, w, h);
799da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      break;
8005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
8015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
8025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif
803