15ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/*
25ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
35ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *
45ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  Use of this source code is governed by a BSD-style license
55ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  that can be found in the LICENSE file in the root of the source
65ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  tree. An additional intellectual property rights grant can be found
75ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  in the file PATENTS.  All contributing project authors may
85ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  be found in the AUTHORS file in the root of the source tree.
95ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang */
105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <assert.h>
125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <stdio.h>
135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vpx_config.h"
155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "./vp9_rtcd.h"
165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_common.h"
175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vpx/vpx_integer.h"
185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vpx_ports/mem.h"
195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/vp9_filter.h"
205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2
/*
 * Backing storage for the clamp lookup table. Once vp9_dsputil_static_init()
 * has run it holds CROP_WIDTH zeros, the values 0..255, then CROP_WIDTH
 * entries of 255.
 */
235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuanguint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
/*
 * Set by vp9_dsputil_static_init() to element CROP_WIDTH of the array above,
 * so vp9_ff_cropTbl[v] yields v clamped to 0..255 for any v in
 * [-CROP_WIDTH, 255 + CROP_WIDTH]. Null until that function has been called.
 */
245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuanguint8_t *vp9_ff_cropTbl;
255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_dsputil_static_init(void) {
275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int i;
285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i;
305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (i = 0; i < CROP_WIDTH; i++) {
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_ff_cropTbl_a[i] = 0;
335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH];
375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/*
 * Horizontal 8-tap convolution of a 4-pixel-wide strip, written out
 * transposed.
 *
 * For each of the h source rows, src[0..11] is loaded and four filtered
 * pixels are produced. Each result is clamped through the vp9_ff_cropTbl
 * lookup and stored dst_stride bytes apart; afterwards dst advances by one
 * byte and src by src_stride, so a source row becomes a destination column.
 *
 * src        - first byte read for the first row
 * src_stride - byte distance between consecutive source rows
 * dst        - first output byte
 * dst_stride - byte distance between the four results of one source row
 * filter_x0  - eight 16-bit taps, fetched as four 32-bit words via a pointer
 *              cast (presumably the tables are suitably aligned -- confirm)
 * h          - number of source rows to process
 *
 * NOTE(review): "extp ..., 31" extracts at the position held in DSPControl;
 * nothing in this function sets it, so the caller presumably programs it
 * before the call -- confirm.
 * NOTE(review): the asm statement declares no clobbers although it rewrites
 * $ac2/$ac3 and stores through dst_ptr -- confirm this is intended.
 */
395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t src_stride,
415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              uint8_t *dst,
425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t dst_stride,
435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              const int16_t *filter_x0,
445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t h) {
455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y;
  /* Clamp table; indexed with the raw filter sums by the lbux loads below. */
465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *cm = vp9_ff_cropTbl;
475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst_ptr;
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t vector1b, vector2b, vector3b, vector4b;
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3, Temp4;
  /* Value every accumulator is seeded with before the taps are added
   * (presumably the rounding term for the filter shift -- confirm). */
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector4a = 64;
515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp1, tp2;
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4;
535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tn1, tn2;
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
  /* Each 32-bit word carries two adjacent 16-bit taps for dpa.w.ph. */
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_x0)[0];
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_x0)[1];
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_x0)[2];
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_x0)[3];
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
  /* One iteration per source row; runs h times. */
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
    /* Restart at the top of the current destination column. */
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr = dst;
625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch the next source row (two addresses 32 bytes apart) */
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src + src_stride);
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src + src_stride + 32);
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __asm__ __volatile__ (
        /* unaligned loads of src[0..3] and src[4..7] */
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],         0(%[src])                      \n\t"
685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         4(%[src])                      \n\t"
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 1. pixel: first output, accumulated in $ac3 -> Temp1 */
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                           \n\t"
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                           \n\t"
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        /* third source word, src[8..11], loaded between the multiplies */
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tn2],         8(%[src])                      \n\t"
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac3,           31             \n\t"
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 2. pixel: third output, accumulated in $ac2 -> Temp3 */
855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                           \n\t"
865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                           \n\t"
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
        /* Rebuild tn1, tn2 and tp2 as byte-shifted combinations of the loaded
         * words for the odd pixels (presumably a one-byte shift on a
         * little-endian target -- confirm). */
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tn1],         %[tn2],         3              \n\t"
895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tn2],         %[tp2],         3              \n\t"
905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tp2],         %[tp1],         3              \n\t"
915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac2,           31             \n\t"
965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 1. pixel: second output, $ac3 -> Temp2; the first lbux clamps
         * the even 1. result into tp1 */
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                           \n\t"
1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                           \n\t"
1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],       $ac3,           31             \n\t"
1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 2. pixel: fourth output, $ac2 -> Temp4; the first lbux clamps
         * the even 2. result into tp2 */
1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                           \n\t"
1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                           \n\t"
1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp4],       $ac2,           31             \n\t"
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* clamp the two odd results through the lookup table */
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* store bytes in output order (even 1, odd 1, even 2, odd 2),
         * stepping dst_ptr by dst_stride after each one */
1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
        /* Outputs: all scratch registers are early-clobber; dst_ptr is
         * read-write. Inputs follow; no clobber list is given. */
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [dst_ptr] "+r" (dst_ptr)
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [vector4a] "r" (vector4a),
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    );
1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next source row; the output moves one byte along (transposition). */
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += 1;
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/*
 * Horizontal 8-tap convolution of an 8-pixel-wide strip, written out
 * transposed.
 *
 * For each of the h source rows, unaligned words are loaded at offsets
 * 0, 4, 8, 12 and 1, 5, 9, 13 (src[0..16]) and eight filtered pixels are
 * produced. Results are clamped through the vp9_ff_cropTbl lookup. The
 * "even" results are stored through dst_ptr and the "odd" results through
 * odd_dst, which starts one dst_stride further on; both pointers step by
 * 2 * dst_stride, so the eight results land dst_stride apart in output
 * order. Afterwards dst advances by one byte and src by src_stride, so a
 * source row becomes a destination column.
 *
 * src        - first byte read for the first row
 * src_stride - byte distance between consecutive source rows
 * dst        - first output byte
 * dst_stride - byte distance between consecutive results of one source row
 * filter_x0  - eight 16-bit taps, fetched as four 32-bit words via a pointer
 *              cast (presumably the tables are suitably aligned -- confirm)
 * h          - number of source rows to process
 *
 * The section labels inside the asm mark where each pixel's work begins;
 * loads, clamps and stores belonging to neighbouring pixels are interleaved
 * with it.
 *
 * NOTE(review): "extp ..., 31" extracts at the position held in DSPControl;
 * nothing in this function sets it, so the caller presumably programs it
 * before the call -- confirm.
 * NOTE(review): the asm statement declares no clobbers although it rewrites
 * $ac1/$ac2/$ac3 and stores through dst_ptr and odd_dst -- confirm this is
 * intended.
 */
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t src_stride,
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              uint8_t *dst,
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t dst_stride,
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              const int16_t *filter_x0,
1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t h) {
1615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y;
  /* Clamp table; indexed with the raw filter sums by the lbux loads below. */
1625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *cm = vp9_ff_cropTbl;
1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst_ptr;
  /* Value every accumulator is seeded with before the taps are added
   * (presumably the rounding term for the filter shift -- confirm). */
1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector4a = 64;
1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t vector1b, vector2b, vector3b, vector4b;
1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3;
1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp1, tp2, tp3;
1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, n1;
1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *odd_dst;
  /* Step used by both store pointers: two destination strides. */
1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t dst_pitch_2 = (dst_stride << 1);
1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
  /* Each 32-bit word carries two adjacent 16-bit taps for dpa.w.ph. */
1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_x0)[0];
1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_x0)[1];
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_x0)[2];
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_x0)[3];
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
  /* One iteration per source row; runs h times. */
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch the next source row (two addresses 32 bytes apart) */
1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src + src_stride);
1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src + src_stride + 32);
1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
    /* Even results start at dst, odd results one stride further on. */
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr = dst;
1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    odd_dst = (dst_ptr + dst_stride);
1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __asm__ __volatile__ (
        /* unaligned loads of src[0..3] and src[4..7] */
1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         0(%[src])                       \n\t"
1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],         4(%[src])                       \n\t"
1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 1. pixel: $ac3 -> Temp1 ($ac2 is seeded here as well) */
1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                            \n\t"
1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                            \n\t"
1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                            \n\t"
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                            \n\t"
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp3],         8(%[src])                       \n\t"
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac3,           31              \n\t"
2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 2. pixel: $ac2 -> Temp3 */
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         12(%[src])                      \n\t"
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac2,           31              \n\t"
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 3. pixel: $ac1 -> p3; the lbux loads clamp even 1. into Temp2
         * and even 2. into tp3 */
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac1                            \n\t"
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac1                            \n\t"
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[p3],          $ac1,           31              \n\t"
2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 4. pixel: $ac2 -> Temp3; $ac3 is re-seeded for odd 1. and the
         * first two even results are stored through dst_ptr */
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                            \n\t"
2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                            \n\t"
2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                            \n\t"
2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                            \n\t"
2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
        /* source words at byte offsets 1 and 5, used by the odd pixels */
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],         1(%[src])                       \n\t"
2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp3],         5(%[src])                       \n\t"
2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
2435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac2,           31              \n\t"
2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
        /* clamp even 3. (raw sum held in p3) into tp2 */
2465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 1. pixel: $ac3 -> Temp2; $ac1 is re-seeded for odd 2. and the
         * even 3. result is stored through dst_ptr */
2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac1                            \n\t"
2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac1                            \n\t"
2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
2575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         9(%[src])                       \n\t"
2585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
2605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],       $ac3,           31              \n\t"
2645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 2. pixel: $ac1 -> Temp3; the lbux clamps even 4. into tp1,
         * which is then stored as the last even result */
2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                            \n\t"
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                            \n\t"
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                            \n\t"
2705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                            \n\t"
2715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
2725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
        /* Temp1 doubles as a scratch register for the last source word */
2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[Temp1],       13(%[src])                      \n\t"
2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
2775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
2785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
2805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac1,           31              \n\t"
2815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 3. pixel: $ac3 -> Temp2; the lbux clamps odd 1. into tp3 */
2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
2855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
2865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
2875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
2885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
2895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],       $ac3,           31              \n\t"
2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 4. pixel: $ac2 -> Temp1; the odd 1. result is stored through
         * odd_dst first */
2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
2935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac2,           31              \n\t"
2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* clamp the remaining odd results (odd 2., odd 3., odd 4.) */
3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* store them through odd_dst, two strides apart */
3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[p4],          0(%[odd_dst])                   \n\t"
3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[p2],          0(%[odd_dst])                   \n\t"
3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[n1],          0(%[odd_dst])                   \n\t"
3135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
        /* Outputs: all scratch registers are early-clobber; both store
         * pointers are read-write. Inputs follow; no clobber list is given. */
3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
3165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [n1] "=&r" (n1),
3175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
3185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
3195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
3205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [vector4a] "r" (vector4a), [cm] "r" (cm),
3225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
3235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    );
3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next source row; the output moves one byte along (transposition). */
3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += 1;
3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t src_stride,
3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               uint8_t *dst_ptr,
3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t dst_stride,
3355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               const int16_t *filter_x0,
3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t h,
3375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t count) {
3385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t c, y;
3395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src;
3405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst;
3415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *cm = vp9_ff_cropTbl;
3425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector_64 = 64;
3435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t  filter12, filter34, filter56, filter78;
3445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t  Temp1, Temp2, Temp3;
3455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t qload1, qload2;
3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, p5;
3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st1, st2, st3;
3485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t dst_pitch_2 = (dst_stride << 1);
3495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t  *odd_dst;
3505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter12 = ((const int32_t *)filter_x0)[0];
3525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter34 = ((const int32_t *)filter_x0)[1];
3535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter56 = ((const int32_t *)filter_x0)[2];
3545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter78 = ((const int32_t *)filter_x0)[3];
3555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
3585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src_ptr + src_stride);
3595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src_ptr + src_stride + 32);
3605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src = src_ptr;
3625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst = dst_ptr;
3635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    odd_dst = (dst + dst_stride);
3655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (c = 0; c < count; c++) {
3675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      __asm__ __volatile__ (
3685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        0(%[src])                       \n\t"
3695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        4(%[src])                       \n\t"
3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 1. pixel */
3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
3785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        8(%[src])                       \n\t"
3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
3825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 2. pixel */
3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
3895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
3905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
3915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
3925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        12(%[src])                      \n\t"
3935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
3945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
3955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
3965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
3975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
3985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
3995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 3. pixel */
4015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
4025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
4035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
4045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
4055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
4065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
4085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
4125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 4. pixel */
4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
4175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        16(%[src])                      \n\t"
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
4245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
4255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
4265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 5. pixel */
4285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
4315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
4325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
4345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
4355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
4365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
4375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
4385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
4395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 6. pixel */
4415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
4425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
4435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
4445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
4455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        20(%[src])                      \n\t"
4475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
4485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
4495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
4505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
4515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
4525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
4535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 7. pixel */
4555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
4565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
4575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
4585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
4595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
4615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
4625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
4635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
4645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
4655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
4665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 8. pixel */
4685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
4695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
4705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
4715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
4725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
4735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
4755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
4765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
4775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
4785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* ODD pixels */
4805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        1(%[src])                       \n\t"
4815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        5(%[src])                       \n\t"
4825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 1. pixel */
4845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
4855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
4865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
4875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
4885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
4895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
4905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
4915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        9(%[src])                       \n\t"
4935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
4945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
4955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
4965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
4985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
4995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 2. pixel */
5015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
5025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
5035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
5045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
5055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
5065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        13(%[src])                      \n\t"
5075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
5085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
5095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
5105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
5115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
5125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
5135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 3. pixel */
5155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
5165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
5185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
5195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
5225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 4. pixel */
5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
5295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
5305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
5315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
5325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        17(%[src])                      \n\t"
5345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
5355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
5365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
5375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
5385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
5395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 5. pixel */
5425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
5435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
5445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
5455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
5465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
5485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
5495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
5505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
5515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
5525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
5535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 6. pixel */
5555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
5565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
5575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
5585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
5595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        21(%[src])                      \n\t"
5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 7. pixel */
5695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
5705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
5725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
5735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
5755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
5765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
5775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
5785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
5795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 8. pixel */
5815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
5825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
5835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
5845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
5855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
5865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
5895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
5925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
5955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [filter12] "r" (filter12), [filter34] "r" (filter34),
6055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [filter56] "r" (filter56), [filter78] "r" (filter78),
6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [vector_64] "r" (vector_64), [cm] "r" (cm),
6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
6085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      );
6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src += 16;
6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      odd_dst = (dst + dst_stride);
6135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src_ptr += src_stride;
6175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr += 1;
6195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
6205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
6215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
6235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t src_stride,
6245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               uint8_t *dst_ptr,
6255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t dst_stride,
6265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               const int16_t *filter_x0,
6275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t h) {
6285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t c, y;
6295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src;
6305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst;
6315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *cm = vp9_ff_cropTbl;
6325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector_64 = 64;
6335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t  filter12, filter34, filter56, filter78;
6345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t  Temp1, Temp2, Temp3;
6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t qload1, qload2;
6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, p5;
6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st1, st2, st3;
6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t dst_pitch_2 = (dst_stride << 1);
6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t  *odd_dst;
6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter12 = ((const int32_t *)filter_x0)[0];
6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter34 = ((const int32_t *)filter_x0)[1];
6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter56 = ((const int32_t *)filter_x0)[2];
6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter78 = ((const int32_t *)filter_x0)[3];
6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
6475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src_ptr + src_stride);
6495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src_ptr + src_stride + 32);
6505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src_ptr + src_stride + 64);
6515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src = src_ptr;
6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst = dst_ptr;
6545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    odd_dst = (dst + dst_stride);
6565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (c = 0; c < 4; c++) {
6585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      __asm__ __volatile__ (
6595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        0(%[src])                       \n\t"
6605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        4(%[src])                       \n\t"
6615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 1. pixel */
6635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
6655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
6665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
6675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
6685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
6695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
6705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
6715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        8(%[src])                       \n\t"
6725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
6735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
6745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
6755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
6765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
6775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 2. pixel */
6795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
6805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
6815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
6825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
6835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        12(%[src])                      \n\t"
6845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
6855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
6865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
6875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
6885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
6895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
6905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 3. pixel */
6925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
6945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
6955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
6965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
6975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
6985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
6995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
7005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
7015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
7025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
7035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 4. pixel */
7055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
7065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
7075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
7085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
7095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        16(%[src])                      \n\t"
7115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
7125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
7135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
7145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
7155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
7165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
7175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 5. pixel */
7195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
7205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
7215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
7225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
7235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
7255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
7265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
7275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
7285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
7295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
7305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 6. pixel */
7325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
7335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
7345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
7355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
7365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        20(%[src])                      \n\t"
7385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
7395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
7405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
7415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
7425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
7435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
7445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 7. pixel */
7465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
7475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
7485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
7495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
7505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
7525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
7535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
7545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
7555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
7565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
7575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 8. pixel */
7595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
7605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
7615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
7625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
7635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
7645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
7665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
7675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
7685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
7695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* ODD pixels */
7715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        1(%[src])                       \n\t"
7725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        5(%[src])                       \n\t"
7735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 1. pixel */
7755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
7765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
7775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
7785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
7795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
7805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
7815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
7825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        9(%[src])                       \n\t"
7845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
7855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
7865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
7875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
7885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
7895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
7905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 2. pixel */
7925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
7935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
7945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
7955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
7965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
7975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        13(%[src])                      \n\t"
7985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
7995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
8005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
8015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
8025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
8035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
8045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 3. pixel */
8065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
8075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
8085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
8095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
8105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
8125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
8135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
8145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
8155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
8165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
8175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 4. pixel */
8195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
8205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
8215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
8225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
8235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        17(%[src])                      \n\t"
8255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
8265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
8275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
8285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
8295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
8305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
8315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 5. pixel */
8335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
8345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
8355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
8365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
8375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
8395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
8405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
8415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
8425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
8435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
8445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 6. pixel */
8465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
8475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
8485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
8495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
8505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        21(%[src])                      \n\t"
8525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
8535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
8545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
8555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
8565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
8575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
8585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 7. pixel */
8605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
8615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
8625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
8635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
8645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
8665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
8675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
8685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
8695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
8705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 8. pixel */
8725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
8735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
8745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
8755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
8765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
8775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
8795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
8805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
8815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
8835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
8865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
8895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
8915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
8925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
8935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
8945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
8955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [filter12] "r" (filter12), [filter34] "r" (filter34),
8965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [filter56] "r" (filter56), [filter78] "r" (filter78),
8975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [vector_64] "r" (vector_64), [cm] "r" (cm),
8985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
8995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      );
9005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src += 16;
9025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
9035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      odd_dst = (dst + dst_stride);
9045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
9055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
9075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src_ptr += src_stride;
9085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr += 1;
9105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
9115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
9125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
9145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               uint8_t *dst, ptrdiff_t dst_stride,
9155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               const int16_t *filter, int w, int h) {
9165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int x, y, k;
9175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = 0; y < h; ++y) {
9195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (x = 0; x < w; ++x) {
9205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      int sum = 0;
9215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (k = 0; k < 8; ++k)
9235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        sum += src[x + k] * filter[k];
9245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
9265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
9275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
9295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += 1;
9305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
9315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
9325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/*
 * Transposing copy: the pixel at row y, column x of src is written to
 * dst[x * dst_stride + y] for 0 <= x < w and 0 <= y < h.
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           int w, int h) {
  int row;

  for (row = 0; row < h; ++row) {
    const uint8_t *in_row = src + row * src_stride;
    uint8_t *out_col = dst + row;  /* one source row fills one dst column */
    int col;

    for (col = 0; col < w; ++col) {
      out_col[col * dst_stride] = in_row[col];
    }
  }
}
9475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
9495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         uint8_t *dst, ptrdiff_t dst_stride,
9505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         const int16_t *filter_x, int x_step_q4,
9515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         const int16_t *filter_y, int y_step_q4,
9525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         int w, int h) {
9535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
9545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
9555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t pos = 38;
9565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* bit positon for extract from acc */
9585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __asm__ __volatile__ (
9595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    "wrdsp      %[pos],     1           \n\t"
9605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    :
9615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    : [pos] "r" (pos)
9625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  );
9635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (intermediate_height < h)
9655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    intermediate_height = h;
9665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (x_step_q4 != 16 || y_step_q4 != 16)
9685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    return vp9_convolve8_c(src, src_stride,
9695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                           dst, dst_stride,
9705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                           filter_x, x_step_q4,
9715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                           filter_y, y_step_q4,
9725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                           w, h);
9735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if ((((const int32_t *)filter_x)[1] == 0x800000)
9755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      && (((const int32_t *)filter_y)[1] == 0x800000))
9765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    return vp9_convolve_copy(src, src_stride,
9775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             dst, dst_stride,
9785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             filter_x, x_step_q4,
9795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             filter_y, y_step_q4,
9805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             w, h);
9815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* copy the src to dst */
9835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (filter_x[3] == 0x80) {
9845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    copy_horiz_transposed(src - src_stride * 3, src_stride,
9855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                          temp, intermediate_height,
9865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                          w, intermediate_height);
9875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else if (((const int32_t *)filter_x)[0] == 0) {
9885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
9895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        temp, intermediate_height,
9905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        filter_x,
9915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        w, intermediate_height);
9925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else {
9935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src -= (src_stride * 3 + 3);
9945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
9965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src);
9975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_prefetch_load(src + 32);
9985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    switch (w) {
10005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 4:
10015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_4_transposed_dspr2(src, src_stride,
10025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          temp, intermediate_height,
10035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          filter_x, intermediate_height);
10045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 8:
10065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_8_transposed_dspr2(src, src_stride,
10075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          temp, intermediate_height,
10085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          filter_x, intermediate_height);
10095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 16:
10115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 32:
10125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_16_transposed_dspr2(src, src_stride,
10135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           temp, intermediate_height,
10145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           filter_x, intermediate_height,
10155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           (w/16));
10165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 64:
10185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + 32);
10195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_64_transposed_dspr2(src, src_stride,
10205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           temp, intermediate_height,
10215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           filter_x, intermediate_height);
10225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      default:
10245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_transposed(src, src_stride,
10255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  temp, intermediate_height,
10265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  filter_x, w, intermediate_height);
10275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
10295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
10305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* copy the src to dst */
10325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (filter_y[3] == 0x80) {
10335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    copy_horiz_transposed(temp + 3, intermediate_height,
10345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                          dst, dst_stride,
10355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                          h, w);
10365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else if (((const int32_t *)filter_y)[0] == 0) {
10375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    vp9_convolve2_dspr2(temp + 3, intermediate_height,
10385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        dst, dst_stride,
10395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        filter_y,
10405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        h, w);
10415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else {
10425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    switch (h) {
10435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 4:
10445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
10455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          dst, dst_stride,
10465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          filter_y, w);
10475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 8:
10495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
10505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          dst, dst_stride,
10515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          filter_y, w);
10525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 16:
10545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 32:
10555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
10565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           dst, dst_stride,
10575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           filter_y, w, (h/16));
10585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 64:
10605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
10615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           dst, dst_stride,
10625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           filter_y, w);
10635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      default:
10655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_transposed(temp, intermediate_height,
10665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  dst, dst_stride,
10675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  filter_y, h, w);
10685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
10705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
10715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
10725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
10745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             uint8_t *dst, ptrdiff_t dst_stride,
10755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             const int16_t *filter_x, int filter_x_stride,
10765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             const int16_t *filter_y, int filter_y_stride,
10775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             int w, int h) {
10785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int x, y;
10795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* prefetch data to cache memory */
10815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(src);
10825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_load(src + 32);
10835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vp9_prefetch_store(dst);
10845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  switch (w) {
10865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 4:
10875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
10885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1;
10895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 1 word storage */
10915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
10925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
10935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
10945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
10955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
10975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         (%[src])      \n\t"
10985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         (%[dst])      \n\t"  /* store */
10995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1)
11015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
11025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
11035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
11055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
11065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
11095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 8:
11105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
11115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1, tp2;
11125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 2 word storage */
11145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
11155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
11165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
11175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
11185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
11205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
11215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         4(%[src])      \n\t"
11225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
11235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
11245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
11265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
11275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
11285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
11305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
11315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
11345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 16:
11355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
11365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1, tp2, tp3, tp4;
11375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 4 word storage */
11395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
11405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
11415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
11425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
11435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
11455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
11465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         4(%[src])      \n\t"
11475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         8(%[src])      \n\t"
11485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[src])     \n\t"
11495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
11515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
11525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
11535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
11545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
11565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
11575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
11585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
11595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
11615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
11625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
11655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 32:
11665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
11675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1, tp2, tp3, tp4;
11685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp5, tp6, tp7, tp8;
11695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 8 word storage */
11715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
11725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
11735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
11745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
11755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
11775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
11785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         4(%[src])      \n\t"
11795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         8(%[src])      \n\t"
11805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[src])     \n\t"
11815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp5],         16(%[src])     \n\t"
11825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp6],         20(%[src])     \n\t"
11835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp7],         24(%[src])     \n\t"
11845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp8],         28(%[src])     \n\t"
11855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
11875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
11885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
11895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
11905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
11915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
11925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
11935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
11945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
11965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
11975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
11985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
11995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
12005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
12015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
12035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
12045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
12055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
12065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
12075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 64:
12085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
12095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1, tp2, tp3, tp4;
12105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp5, tp6, tp7, tp8;
12115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      vp9_prefetch_load(src + 64);
12135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      vp9_prefetch_store(dst + 32);
12145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 16 word storage */
12165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
12175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride);
12185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 32);
12195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_load(src + src_stride + 64);
12205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride);
12215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        vp9_prefetch_store(dst + dst_stride + 32);
12225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
12245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
12255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         4(%[src])      \n\t"
12265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         8(%[src])      \n\t"
12275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[src])     \n\t"
12285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp5],         16(%[src])     \n\t"
12295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp6],         20(%[src])     \n\t"
12305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp7],         24(%[src])     \n\t"
12315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp8],         28(%[src])     \n\t"
12325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
12345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
12355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
12365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
12375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
12385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
12395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
12405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
12415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         32(%[src])     \n\t"
12435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         36(%[src])     \n\t"
12445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         40(%[src])     \n\t"
12455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         44(%[src])     \n\t"
12465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp5],         48(%[src])     \n\t"
12475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp6],         52(%[src])     \n\t"
12485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp7],         56(%[src])     \n\t"
12495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp8],         60(%[src])     \n\t"
12505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
12525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
12535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
12545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
12555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
12565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
12575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
12585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp8],         60(%[dst])     \n\t"  /* store */
12595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
12615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
12625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
12635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
12645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
12655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
12665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
12685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
12695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
12705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
12715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
12725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    default:
12735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
12745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        for (x = 0; x < w; ++x) {
12755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          dst[x] = src[x];
12765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        }
12775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
12795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
12805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
12815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
12825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
12835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
12845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif
1285