15ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/*
25ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
35ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *
45ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  Use of this source code is governed by a BSD-style license
55ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  that can be found in the LICENSE file in the root of the source
65ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  tree. An additional intellectual property rights grant can be found
75ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  in the file PATENTS.  All contributing project authors may
85ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  be found in the AUTHORS file in the root of the source tree.
95ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang */
105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <assert.h>
125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <stdio.h>
135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
14da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h"
15da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/mips/convolve_common_dspr2.h"
16da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/vpx_dsp_common.h"
17da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian#include "vpx_dsp/vpx_filter.h"
185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vpx_ports/mem.h"
195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2
/*
 * Horizontal filter producing 4 output pixels per row, for h rows.
 *
 *   src, src_stride  - source pointer and the amount added to it after each
 *                      row.
 *   dst, dst_stride  - destination pointer and the amount added to it after
 *                      each row.
 *   filter_x0        - read as four 32-bit words, i.e. eight int16_t
 *                      coefficients packed two per word.
 *   h                - number of rows to process.
 *
 * Per row the asm performs three unaligned word loads at src + 0, + 4 and
 * + 8 (source bytes 0..11) and stores one byte each to dst[0..3]. Every
 * accumulator is seeded with 64 before the multiply-accumulates, and each
 * extracted sum is used as an index into vpx_ff_cropTbl to obtain the byte
 * that is stored.
 *
 * NOTE(review): extp takes its bit position from DSPControl, which this
 * function never writes - presumably the caller programs it; confirm.
 * NOTE(review): the asm statement has no clobber list even though it stores
 * through dst and overwrites $ac2/$ac3 - confirm this is safe for the
 * toolchains in use.
 */
217bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
227bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                   uint8_t *dst, int32_t dst_stride,
237bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                   const int16_t *filter_x0, int32_t h) {
245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y;
25da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t vector1b, vector2b, vector3b, vector4b;
275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3, Temp4;
285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector4a = 64;
295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp1, tp2;
305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4;
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t n1, n2, n3, n4;
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tn1, tn2;
335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
  /* Fetch the eight 16-bit coefficients as four packed 32-bit words so each
   * dpa.w.ph below can consume one pair.
   * NOTE(review): int16_t data is read through an int32_t pointer; this
   * relies on filter_x0 being 4-byte aligned - confirm against callers. */
345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_x0)[0];
355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_x0)[1];
365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_x0)[2];
375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_x0)[3];
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
  /* One iteration per row; y counts down from h to 0. */
395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
41da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src + src_stride);
42da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src + src_stride + 32);
43da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst + dst_stride);
445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
457bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],      0(%[src])                      \n\t"
475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],      4(%[src])                      \n\t"
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 1. pixel */
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac3                           \n\t"
515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac3                           \n\t"
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tn2],      8(%[src])                      \n\t"
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],    $ac3,           31             \n\t"
625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 2. pixel */
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac2                           \n\t"
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac2                           \n\t"
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
        /* The three balign ops rewrite tn1/tn2/tp2 from the loaded words;
         * the results feed the odd pixels below. Presumably they hold the
         * source shifted by one byte - TODO confirm against the DSP ASE
         * manual. They must stay in this order because each one reads a
         * register the next one overwrites. */
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tn1],      %[tn2],         3              \n\t"
685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tn2],      %[tp2],         3              \n\t"
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tp2],      %[tp1],         3              \n\t"
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],    $ac2,           31             \n\t"
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 1. pixel */
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac3                           \n\t"
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac3                           \n\t"
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],    $ac3,           31             \n\t"
895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 2. pixel */
915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac2                           \n\t"
935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac2                           \n\t"
945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp4],    $ac2,           31             \n\t"
1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* clamp */
1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* store bytes */
        /* dst[0] and dst[2] take the "even" results (tp1, tp2); dst[1] and
         * dst[3] take the "odd" results (tn1, n2). */
1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp1],      0(%[dst])                      \n\t"
1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tn1],      1(%[dst])                      \n\t"
1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp2],      2(%[dst])                      \n\t"
1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[n2],       3(%[dst])                      \n\t"
1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
        /* Every output is an early-clobber scratch register; none of these
         * values is read back by the C code after the asm statement. */
1117bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
1127bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
1137bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
1147bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
1157bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
1167bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1177bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
1187bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
1197bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [src] "r"(src));
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/*
 * Horizontal filter producing 8 output pixels per row, for h rows.
 *
 *   src, src_stride  - source pointer and the amount added to it after each
 *                      row.
 *   dst, dst_stride  - destination pointer and the amount added to it after
 *                      each row.
 *   filter_x0        - read as four 32-bit words, i.e. eight int16_t
 *                      coefficients packed two per word.
 *   h                - number of rows to process.
 *
 * Per row the asm performs four unaligned word loads at src + 0, + 4, + 8
 * and + 12 (source bytes 0..15) and stores one byte each to dst[0..7].
 * Three accumulators ($ac1, $ac2, $ac3) are used in rotation; each is seeded
 * with 64 before its multiply-accumulates, and each extracted sum is used as
 * an index into vpx_ff_cropTbl to obtain the byte that is stored. Stores are
 * interleaved with the arithmetic, so they are not issued in dst order.
 *
 * NOTE(review): extp takes its bit position from DSPControl, which this
 * function never writes - presumably the caller programs it; confirm.
 * NOTE(review): the asm statement has no clobber list even though it stores
 * through dst and overwrites $ac1/$ac2/$ac3 - confirm this is safe for the
 * toolchains in use.
 */
1277bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
1287bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                   uint8_t *dst, int32_t dst_stride,
1297bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                   const int16_t *filter_x0, int32_t h) {
1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y;
131da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector4a = 64;
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t vector1b, vector2b, vector3b, vector4b;
1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3;
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp1, tp2;
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, n1;
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tn1, tn2, tn3;
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st0, st1;
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
  /* Fetch the eight 16-bit coefficients as four packed 32-bit words so each
   * dpa.w.ph below can consume one pair.
   * NOTE(review): int16_t data is read through an int32_t pointer; this
   * relies on filter_x0 being 4-byte aligned - confirm against callers. */
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_x0)[0];
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_x0)[1];
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_x0)[2];
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_x0)[3];
1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
  /* One iteration per row; y counts down from h to 0. */
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
147da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src + src_stride);
148da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src + src_stride + 32);
149da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst + dst_stride);
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1517bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__(
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],      0(%[src])                      \n\t"
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],      4(%[src])                      \n\t"
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 1. pixel */
        /* $ac2 is seeded here as well; it is consumed by "even 2" below. */
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac3                           \n\t"
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac3                           \n\t"
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac2                           \n\t"
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac2                           \n\t"
1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
1615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
1625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tn2],      8(%[src])                      \n\t"
1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],    $ac3,           31             \n\t"
1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 2. pixel */
1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tn1],      12(%[src])                     \n\t"
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],    $ac2,           31             \n\t"
1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 3. pixel */
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac1                           \n\t"
1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac1                           \n\t"
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],    $ac1,           31             \n\t"
1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 4. pixel */
        /* $ac2 accumulates this pixel; $ac3 is seeded here too and is
         * consumed by "odd 1" below. */
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac2                           \n\t"
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac2                           \n\t"
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac3                           \n\t"
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac3                           \n\t"
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[st0],      0(%[dst])                      \n\t"
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
        /* The four balign ops rewrite tn3/tn1/tn2/tp2 from the loaded words;
         * the results feed the odd pixels below. Presumably they hold the
         * source shifted by one byte - TODO confirm against the DSP ASE
         * manual. They must stay in this order because each one reads a
         * register the next one overwrites. */
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tn3],      %[tn1],         3              \n\t"
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tn1],      %[tn2],         3              \n\t"
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tn2],      %[tp2],         3              \n\t"
2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tp2],      %[tp1],         3              \n\t"
2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],    $ac2,           31             \n\t"
2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 1. pixel */
        /* Accumulates in $ac3 (seeded above); $ac1 is seeded here for
         * "odd 2". */
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac1                           \n\t"
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac1                           \n\t"
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[st1],      2(%[dst])                      \n\t"
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[st0],      4(%[dst])                      \n\t"
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],    $ac3,           31             \n\t"
2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 2. pixel */
        /* Accumulates in $ac1; $ac3 and $ac2 are re-seeded here for
         * "odd 3" and "odd 4" respectively. */
2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac3                           \n\t"
2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac3                           \n\t"
2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a], $ac2                           \n\t"
2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,       $ac2                           \n\t"
2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],    $ac1,           31             \n\t"
2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 3. pixel */
2435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
2465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],    $ac3,           31             \n\t"
2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 4. pixel */
2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[st1],      1(%[dst])                      \n\t"
2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[st0],      6(%[dst])                      \n\t"
2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
2575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
2585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],    $ac2,           31             \n\t"
2595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* clamp */
        /* p4, p2 and n1 are no longer needed as filter inputs and are
         * reused to hold the last three output bytes. */
2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
2645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* store bytes */
2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[p4],       3(%[dst])                      \n\t"
2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[p2],       5(%[dst])                      \n\t"
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[n1],       7(%[dst])                      \n\t"
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
        /* Every output is an early-clobber scratch register; none of these
         * values is read back by the C code after the asm statement. */
2707bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
2717bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
2727bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
2737bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
2747bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
2757bc9febe8749e98a3812a0dc4380ceae75c29450Johann        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
2767bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
2777bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
2787bc9febe8749e98a3812a0dc4380ceae75c29450Johann          [src] "r"(src));
2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
2815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
2825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
2855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2867bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
2877bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    uint8_t *dst_ptr, int32_t dst_stride,
2887bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    const int16_t *filter_x0, int32_t h,
2895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                    int32_t count) {
2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y, c;
2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src;
2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst;
293da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector_64 = 64;
2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t filter12, filter34, filter56, filter78;
2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3;
2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t qload1, qload2, qload3;
2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, p5;
2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st1, st2, st3;
3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter12 = ((const int32_t *)filter_x0)[0];
3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter34 = ((const int32_t *)filter_x0)[1];
3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter56 = ((const int32_t *)filter_x0)[2];
3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter78 = ((const int32_t *)filter_x0)[3];
3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src = src_ptr;
3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst = dst_ptr;
3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
311da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride);
312da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride + 32);
313da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst_ptr + dst_stride);
3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (c = 0; c < count; c++) {
3167bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __asm__ __volatile__(
3175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    0(%[src])                    \n\t"
3185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    4(%[src])                    \n\t"
3195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 1. pixel */
3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
3225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
3235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    8(%[src])                    \n\t"
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
3355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 2. pixel */
3375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
3385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
3395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
3405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
3415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    12(%[src])                   \n\t"
3425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
3435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
3445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
3455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
3485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 3. pixel */
3505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
3515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
3525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
3535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
3545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
3555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
3565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
3585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
3595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
3605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 4. pixel */
3625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
3635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
3645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
3655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
3665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    16(%[src])                   \n\t"
3675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
3685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
3695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 5. pixel */
3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
3785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
3825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 6. pixel */
3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
3895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
3905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
3915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    20(%[src])                   \n\t"
3925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
3935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
3945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
3955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
3965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
3975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
3985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 7. pixel */
4005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
4015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
4025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
4035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
4045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
4055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
4065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
4085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 8. pixel */
4125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
4135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
4175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* ODD pixels */
4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    1(%[src])                    \n\t"
4245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    5(%[src])                    \n\t"
4255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 1. pixel */
4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
4285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
4315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
4325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
4345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    9(%[src])                    \n\t"
4355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
4365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
4375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
4385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
4395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
4405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
4415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 2. pixel */
4435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
4445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
4455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
4465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
4475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
4485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    13(%[src])                   \n\t"
4495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
4505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
4515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
4525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
4535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
4545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
4555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 3. pixel */
4575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
4585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
4595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
4605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
4615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
4625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
4635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
4645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
4655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
4665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
4675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 4. pixel */
4695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
4705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
4715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
4725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
4735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    17(%[src])                   \n\t"
4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
4755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
4765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
4775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
4785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
4795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
4805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 5. pixel */
4825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
4835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
4845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
4855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
4865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
4875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
4885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
4895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
4905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
4915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
4925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 6. pixel */
4945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
4955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
4965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
4985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    21(%[src])                   \n\t"
4995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
5005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
5015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
5025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
5035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
5045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
5055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 7. pixel */
5075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
5085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
5095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
5105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
5115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
5125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
5135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
5145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
5155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
5165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 8. pixel */
5185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
5195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
5225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
5275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
5295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
5305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
5315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5327bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
5337bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
5347bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
5357bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
5367bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
5377bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [filter12] "r"(filter12), [filter34] "r"(filter34),
5387bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [filter56] "r"(filter56), [filter78] "r"(filter78),
5397bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
5407bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [src] "r"(src));
5415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src += 16;
5435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst += 16;
5445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
5455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
5475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src_ptr += src_stride;
5485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr += dst_stride;
5495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
5505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
5515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5527bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
5537bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    uint8_t *dst_ptr, int32_t dst_stride,
5547bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    const int16_t *filter_x0, int32_t h) {
5555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y, c;
5565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src;
5575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst;
558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
5595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector_64 = 64;
5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t filter12, filter34, filter56, filter78;
5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3;
5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t qload1, qload2, qload3;
5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, p5;
5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st1, st2, st3;
5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter12 = ((const int32_t *)filter_x0)[0];
5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter34 = ((const int32_t *)filter_x0)[1];
5685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter56 = ((const int32_t *)filter_x0)[2];
5695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter78 = ((const int32_t *)filter_x0)[3];
5705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
5725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src = src_ptr;
5735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst = dst_ptr;
5745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
576da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride);
577da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride + 32);
578da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride + 64);
579da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst_ptr + dst_stride);
580da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst_ptr + dst_stride + 32);
5815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (c = 0; c < 4; c++) {
5837bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __asm__ __volatile__(
5845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    0(%[src])                    \n\t"
5855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    4(%[src])                    \n\t"
5865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 1. pixel */
5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
5895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
5925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
5935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
5945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
5955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
5965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    8(%[src])                    \n\t"
5975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 2. pixel */
6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
6055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
6085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    12(%[src])                   \n\t"
6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
6105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
6135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 3. pixel */
6175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
6185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
6195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
6205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
6215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
6225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
6235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
6245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
6255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
6265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
6275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 4. pixel */
6295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
6305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
6315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
6325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
6335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    16(%[src])                   \n\t"
6345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 5. pixel */
6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
6475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
6495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
6505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
6515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
6525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 6. pixel */
6545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
6555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
6565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
6585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    20(%[src])                   \n\t"
6595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
6605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
6615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
6625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
6635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
6655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 7. pixel */
6675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
6685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
6695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
6705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
6715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
6725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
6735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
6745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
6755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
6765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
6775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 8. pixel */
6795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
6805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
6815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
6825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
6835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
6845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
6855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
6865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
6875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
6885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* ODD pixels */
6905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    1(%[src])                    \n\t"
6915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    5(%[src])                    \n\t"
6925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 1. pixel */
6945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
6955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
6965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
6975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
6985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
6995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
7005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
7015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    9(%[src])                    \n\t"
7025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
7035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
7045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
7055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
7065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
7075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
7085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 2. pixel */
7105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
7115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
7125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
7135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
7145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
7155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],    13(%[src])                   \n\t"
7165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
7175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
7185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
7195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
7205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
7215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
7225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 3. pixel */
7245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
7255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
7265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
7275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
7285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
7295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
7305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
7315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
7325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
7335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
7345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 4. pixel */
7365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
7375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
7385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
7395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
7405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],    17(%[src])                   \n\t"
7415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
7425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
7435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
7445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
7455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
7465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
7475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 5. pixel */
7495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
7505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                         \n\t"
7515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
7525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
7535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
7545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
7555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
7565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
7575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
7585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
7595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 6. pixel */
7615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
7625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                         \n\t"
7635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
7645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
7655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload3],    21(%[src])                   \n\t"
7665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
7675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
7685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
7695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
7705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
7715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
7725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 7. pixel */
7745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
7755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                         \n\t"
7765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
7775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
7785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
7795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
7805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
7815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
7825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
7835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 8. pixel */
7855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
7865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
7875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
7885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
7895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
7905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
7925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
7935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
7945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
7965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
7975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
7985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7997bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
8007bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
8017bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
8027bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
8037bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
8047bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [filter12] "r"(filter12), [filter34] "r"(filter34),
8057bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [filter56] "r"(filter56), [filter78] "r"(filter78),
8067bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
8077bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [src] "r"(src));
8085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src += 16;
8105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst += 16;
8115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
8125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
8145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src_ptr += src_stride;
8155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr += dst_stride;
8165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
8175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
8185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
819da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
8205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               uint8_t *dst, ptrdiff_t dst_stride,
821df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               const InterpKernel *filter, int x0_q4,
822df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               int x_step_q4, int y0_q4, int y_step_q4, int w,
8237bc9febe8749e98a3812a0dc4380ceae75c29450Johann                               int h) {
824df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const int16_t *const filter_x = filter[x0_q4];
825da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(x_step_q4 == 16);
826da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(((const int32_t *)filter_x)[1] != 0x800000);
827da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
828da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (((const int32_t *)filter_x)[0] == 0) {
829df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
830df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              x_step_q4, y0_q4, y_step_q4, w, h);
8315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else {
832da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    uint32_t pos = 38;
8335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
834da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load((const uint8_t *)filter_x);
835da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    src -= 3;
8365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
837da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    /* bit positon for extract from acc */
8387bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
8397bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         :
8407bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         : [pos] "r"(pos));
8415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
842da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    /* prefetch data to cache memory */
843da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src);
844da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_load(src + 32);
845da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst);
846da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
847da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    switch (w) {
848da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 4:
8497bc9febe8749e98a3812a0dc4380ceae75c29450Johann        convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
8507bc9febe8749e98a3812a0dc4380ceae75c29450Johann                               (int32_t)dst_stride, filter_x, (int32_t)h);
851da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
852da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 8:
8537bc9febe8749e98a3812a0dc4380ceae75c29450Johann        convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
8547bc9febe8749e98a3812a0dc4380ceae75c29450Johann                               (int32_t)dst_stride, filter_x, (int32_t)h);
855da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
856da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 16:
8577bc9febe8749e98a3812a0dc4380ceae75c29450Johann        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
8587bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                (int32_t)dst_stride, filter_x, (int32_t)h, 1);
859da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
860da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 32:
8617bc9febe8749e98a3812a0dc4380ceae75c29450Johann        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
8627bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                (int32_t)dst_stride, filter_x, (int32_t)h, 2);
863da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
864da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 64:
865da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + 64);
866da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_store(dst + 32);
867da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
8687bc9febe8749e98a3812a0dc4380ceae75c29450Johann        convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
8697bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                (int32_t)dst_stride, filter_x, (int32_t)h);
870da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
871da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      default:
872df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
873df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
874da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
8755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
8765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
8775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
8785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif
879