/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2
215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t src_stride,
235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              uint8_t *dst,
245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t dst_stride,
255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              const int16_t *filter_x0,
265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t h) {
275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y;
287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst_ptr;
305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t vector1b, vector2b, vector3b, vector4b;
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3, Temp4;
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector4a = 64;
335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp1, tp2;
345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4;
355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tn1, tn2;
365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_x0)[0];
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_x0)[1];
395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_x0)[2];
405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_x0)[3];
415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr = dst;
445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src + src_stride);
467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src + src_stride + 32);
475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __asm__ __volatile__ (
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],         0(%[src])                      \n\t"
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         4(%[src])                      \n\t"
515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 1. pixel */
535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                           \n\t"
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                           \n\t"
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tn2],         8(%[src])                      \n\t"
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac3,           31             \n\t"
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 2. pixel */
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                           \n\t"
685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                           \n\t"
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tn1],         %[tn2],         3              \n\t"
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tn2],         %[tp2],         3              \n\t"
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "balign           %[tp2],         %[tp1],         3              \n\t"
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac2,           31             \n\t"
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 1. pixel */
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                           \n\t"
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                           \n\t"
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],       $ac3,           31             \n\t"
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 2. pixel */
945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                           \n\t"
965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                           \n\t"
975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp4],       $ac2,           31             \n\t"
1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* clamp */
1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* store bytes */
1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [dst_ptr] "+r" (dst_ptr)
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [vector4a] "r" (vector4a),
1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    );
1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += 1;
1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t src_stride,
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              uint8_t *dst,
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t dst_stride,
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              const int16_t *filter_x0,
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                              int32_t h) {
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t y;
1447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst_ptr;
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector4a = 64;
1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t vector1b, vector2b, vector3b, vector4b;
1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t Temp1, Temp2, Temp3;
1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t tp1, tp2, tp3;
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, n1;
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *odd_dst;
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t dst_pitch_2 = (dst_stride << 1);
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_x0)[0];
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_x0)[1];
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_x0)[2];
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_x0)[3];
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
1617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src + src_stride);
1627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src + src_stride + 32);
1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr = dst;
1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    odd_dst = (dst_ptr + dst_stride);
1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __asm__ __volatile__ (
1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         0(%[src])                       \n\t"
1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],         4(%[src])                       \n\t"
1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 1. pixel */
1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                            \n\t"
1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                            \n\t"
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                            \n\t"
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                            \n\t"
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp3],         8(%[src])                       \n\t"
1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac3,           31              \n\t"
1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 2. pixel */
1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         12(%[src])                      \n\t"
1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac2,           31              \n\t"
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 3. pixel */
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac1                            \n\t"
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac1                            \n\t"
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[p3],          $ac1,           31              \n\t"
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* even 4. pixel */
2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                            \n\t"
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                            \n\t"
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                            \n\t"
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                            \n\t"
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp1],         1(%[src])                       \n\t"
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp3],         5(%[src])                       \n\t"
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac2,           31              \n\t"
2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 1. pixel */
2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac1                            \n\t"
2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac1                            \n\t"
2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[tp2],         9(%[src])                       \n\t"
2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
2435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],       $ac3,           31              \n\t"
2465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 2. pixel */
2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac3                            \n\t"
2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac3                            \n\t"
2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mtlo             %[vector4a],    $ac2                            \n\t"
2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "mthi             $zero,          $ac2                            \n\t"
2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "ulw              %[Temp1],       13(%[src])                      \n\t"
2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
2575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
2585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
2595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
2605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp3],       $ac1,           31              \n\t"
2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 3. pixel */
2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
2705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
2715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp2],       $ac3,           31              \n\t"
2725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* odd 4. pixel */
2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
2775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
2785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
2805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "extp             %[Temp1],       $ac2,           31              \n\t"
2815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* clamp */
2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
2855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
2865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* store bytes */
2885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[p4],          0(%[odd_dst])                   \n\t"
2895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[p2],          0(%[odd_dst])                   \n\t"
2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
2935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        "sb               %[n1],          0(%[odd_dst])                   \n\t"
2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [n1] "=&r" (n1),
2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [vector4a] "r" (vector4a), [cm] "r" (cm),
3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    );
3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += 1;
3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
3125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t src_stride,
3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               uint8_t *dst_ptr,
3165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t dst_stride,
3175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               const int16_t *filter_x0,
3185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t h,
3195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t count) {
3205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t c, y;
3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src;
3225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst;
3237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector_64 = 64;
3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t  filter12, filter34, filter56, filter78;
3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t  Temp1, Temp2, Temp3;
3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t qload1, qload2;
3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, p5;
3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st1, st2, st3;
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t dst_pitch_2 = (dst_stride << 1);
3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t  *odd_dst;
3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter12 = ((const int32_t *)filter_x0)[0];
3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter34 = ((const int32_t *)filter_x0)[1];
3355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter56 = ((const int32_t *)filter_x0)[2];
3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter78 = ((const int32_t *)filter_x0)[3];
3375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
3395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
3407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride);
3417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride + 32);
3425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src = src_ptr;
3445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst = dst_ptr;
3455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    odd_dst = (dst + dst_stride);
3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (c = 0; c < count; c++) {
3495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      __asm__ __volatile__ (
3505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        0(%[src])                       \n\t"
3515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        4(%[src])                       \n\t"
3525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 1. pixel */
3545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
3555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
3565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
3585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
3595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
3605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
3615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
3625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        8(%[src])                       \n\t"
3635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
3645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
3655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
3665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
3675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
3685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 2. pixel */
3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        12(%[src])                      \n\t"
3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
3785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 3. pixel */
3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
3895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
3905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
3915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
3925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
3935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
3945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 4. pixel */
3965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
3975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
3985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
3995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
4005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        16(%[src])                      \n\t"
4025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
4035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
4045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
4055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
4065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
4085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 5. pixel */
4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
4125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
4135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
4175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 6. pixel */
4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
4245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
4255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
4265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        20(%[src])                      \n\t"
4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
4315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
4325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
4345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
4355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 7. pixel */
4375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
4385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
4395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
4405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
4415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
4435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
4445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
4455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
4465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
4475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
4485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 8. pixel */
4505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
4515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
4525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
4535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
4545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
4555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
4575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
4585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
4595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
4605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* ODD pixels */
4625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        1(%[src])                       \n\t"
4635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        5(%[src])                       \n\t"
4645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 1. pixel */
4665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
4675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
4685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
4695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
4705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
4715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
4725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
4735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        9(%[src])                       \n\t"
4755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
4765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
4775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
4785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
4795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
4805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
4815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 2. pixel */
4835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
4845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
4855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
4865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
4875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
4885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        13(%[src])                      \n\t"
4895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
4905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
4915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
4925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
4935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
4945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
4955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 3. pixel */
4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
4985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
4995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
5005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
5015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
5035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
5045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
5055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
5065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
5075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
5085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 4. pixel */
5105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
5115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
5125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
5135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
5145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        17(%[src])                      \n\t"
5165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
5185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
5195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
5225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 5. pixel */
5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
5275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
5305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
5315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
5325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
5335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
5345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
5355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 6. pixel */
5375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
5385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
5395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
5415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        21(%[src])                      \n\t"
5435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
5445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
5455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
5465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
5475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
5485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
5495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 7. pixel */
5515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
5525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
5535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
5545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
5555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
5575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
5585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
5595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 8. pixel */
5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
5685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
5705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
5725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
5745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
5775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
5785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
5805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
5825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
5835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
5845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
5855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
5865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [filter12] "r" (filter12), [filter34] "r" (filter34),
5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [filter56] "r" (filter56), [filter78] "r" (filter78),
5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [vector_64] "r" (vector_64), [cm] "r" (cm),
5895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      );
5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src += 16;
5935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
5945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      odd_dst = (dst + dst_stride);
5955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
5965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src_ptr += src_stride;
5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr += 1;
6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
6055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t src_stride,
6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               uint8_t *dst_ptr,
6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t dst_stride,
6085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               const int16_t *filter_x0,
6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                               int32_t h) {
6105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t c, y;
6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src;
6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t *dst;
6137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  uint8_t *cm = vpx_ff_cropTbl;
6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t vector_64 = 64;
6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t  filter12, filter34, filter56, filter78;
6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t  Temp1, Temp2, Temp3;
6175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t qload1, qload2;
6185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t p1, p2, p3, p4, p5;
6195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t st1, st2, st3;
6205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t dst_pitch_2 = (dst_stride << 1);
6215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint8_t  *odd_dst;
6225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter12 = ((const int32_t *)filter_x0)[0];
6245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter34 = ((const int32_t *)filter_x0)[1];
6255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter56 = ((const int32_t *)filter_x0)[2];
6265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  filter78 = ((const int32_t *)filter_x0)[3];
6275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
6295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
6307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride);
6317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride + 32);
6327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src_ptr + src_stride + 64);
6335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src = src_ptr;
6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst = dst_ptr;
6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    odd_dst = (dst + dst_stride);
6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (c = 0; c < 4; c++) {
6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      __asm__ __volatile__ (
6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        0(%[src])                       \n\t"
6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        4(%[src])                       \n\t"
6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 1. pixel */
6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
6475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
6495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
6505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
6515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
6525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        8(%[src])                       \n\t"
6545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
6555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
6565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
6585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
6595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 2. pixel */
6615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
6625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
6635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
6655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        12(%[src])                      \n\t"
6665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
6675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
6685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
6695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
6705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
6715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
6725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 3. pixel */
6745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
6755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
6765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
6775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
6785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
6795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
6805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
6815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
6825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
6835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
6845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
6855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 4. pixel */
6875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
6885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
6895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
6905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
6915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
6925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        16(%[src])                      \n\t"
6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
6945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
6955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
6965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
6975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
6985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
6995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 5. pixel */
7015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
7025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
7035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
7045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
7055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
7075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
7085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
7095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
7105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
7115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
7125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 6. pixel */
7145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
7155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
7165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
7175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
7185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        20(%[src])                      \n\t"
7205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
7215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
7225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
7235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
7245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
7255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
7265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 7. pixel */
7285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
7295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
7305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
7315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
7325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
7345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
7355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
7365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
7375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
7385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
7395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* even 8. pixel */
7415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
7425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
7435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
7445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
7455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
7465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
7485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
7495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
7505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
7515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* ODD pixels */
7535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        1(%[src])                       \n\t"
7545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        5(%[src])                       \n\t"
7555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 1. pixel */
7575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
7585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
7595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
7605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
7615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
7625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
7635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
7645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
7655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        9(%[src])                       \n\t"
7665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
7675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
7685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
7695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
7705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
7715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
7725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 2. pixel */
7745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
7755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
7765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
7775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
7785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
7795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        13(%[src])                      \n\t"
7805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
7815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
7825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
7835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
7845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
7855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
7865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 3. pixel */
7885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
7895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
7905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
7915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
7925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
7935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
7945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
7955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
7965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
7975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
7985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
7995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 4. pixel */
8015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
8025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
8035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
8045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
8055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload2],        17(%[src])                      \n\t"
8075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
8085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
8095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
8105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
8115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
8125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
8135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 5. pixel */
8155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
8165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac2                            \n\t"
8175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
8185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
8195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
8215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
8225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
8235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
8245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
8255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
8265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 6. pixel */
8285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
8295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac3                            \n\t"
8305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
8315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
8325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[qload1],        21(%[src])                      \n\t"
8345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
8355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
8365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
8375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
8385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
8395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
8405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 7. pixel */
8425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
8435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,            $ac1                            \n\t"
8445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
8455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
8465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
8485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
8495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
8505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
8515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
8525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          /* odd 8. pixel */
8545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
8555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
8565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
8575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
8585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
8595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
8615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
8625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
8635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
8655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
8685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
8695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
8715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
8735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
8745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
8755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
8765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
8775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          : [filter12] "r" (filter12), [filter34] "r" (filter34),
8785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [filter56] "r" (filter56), [filter78] "r" (filter78),
8795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [vector_64] "r" (vector_64), [cm] "r" (cm),
8805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
8815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      );
8825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src += 16;
8845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
8855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      odd_dst = (dst + dst_stride);
8865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
8875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
8895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src_ptr += src_stride;
8905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst_ptr += 1;
8925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
8935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
8945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
8965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               uint8_t *dst, ptrdiff_t dst_stride,
8975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                               const int16_t *filter, int w, int h) {
8985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int x, y, k;
8995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = 0; y < h; ++y) {
9015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (x = 0; x < w; ++x) {
9025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      int sum = 0;
9035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (k = 0; k < 8; ++k)
9055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        sum += src[x + k] * filter[k];
9065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
9085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
9095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
9115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += 1;
9125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
9135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
9145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/* Copies a block of w x h bytes and stores it transposed.
 *
 * The byte at row y, column x of src (src[y * src_stride + x]) is written to
 * dst[x * dst_stride + y].  Nothing is written when w or h is not positive.
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           int w, int h) {
  int row;

  for (row = 0; row < h; ++row) {
    /* Source row `row` becomes destination column `row`. */
    const uint8_t *in = src + row * src_stride;
    uint8_t *out = dst + row;
    int col;

    for (col = 0; col < w; ++col) {
      out[col * dst_stride] = in[col];
    }
  }
}
9295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
9315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         uint8_t *dst, ptrdiff_t dst_stride,
9325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         const int16_t *filter_x, int x_step_q4,
9335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         const int16_t *filter_y, int y_step_q4,
9345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                         int w, int h) {
9357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
9365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
9375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  uint32_t pos = 38;
9385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  assert(x_step_q4 == 16);
9407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  assert(y_step_q4 == 16);
9417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  assert(((const int32_t *)filter_x)[1] != 0x800000);
9427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  assert(((const int32_t *)filter_y)[1] != 0x800000);
9437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
9447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
9455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* bit positon for extract from acc */
9465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  __asm__ __volatile__ (
9475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    "wrdsp      %[pos],     1           \n\t"
9485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    :
9495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    : [pos] "r" (pos)
9505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  );
9515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (intermediate_height < h)
9535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    intermediate_height = h;
9545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* copy the src to dst */
9565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (filter_x[3] == 0x80) {
9575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    copy_horiz_transposed(src - src_stride * 3, src_stride,
9585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                          temp, intermediate_height,
9595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                          w, intermediate_height);
9605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else if (((const int32_t *)filter_x)[0] == 0) {
9617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpx_convolve2_dspr2(src - src_stride * 3, src_stride,
9625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        temp, intermediate_height,
9635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        filter_x,
9645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        w, intermediate_height);
9655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else {
9665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src -= (src_stride * 3 + 3);
9675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
9697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src);
9707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    prefetch_load(src + 32);
9715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    switch (w) {
9735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 4:
9745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_4_transposed_dspr2(src, src_stride,
9755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          temp, intermediate_height,
9765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          filter_x, intermediate_height);
9775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
9785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 8:
9795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_8_transposed_dspr2(src, src_stride,
9805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          temp, intermediate_height,
9815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          filter_x, intermediate_height);
9825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
9835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 16:
9845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 32:
9855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_16_transposed_dspr2(src, src_stride,
9865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           temp, intermediate_height,
9875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           filter_x, intermediate_height,
9885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           (w/16));
9895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
9905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 64:
9917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + 32);
9925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_64_transposed_dspr2(src, src_stride,
9935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           temp, intermediate_height,
9945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           filter_x, intermediate_height);
9955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
9965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      default:
9975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_transposed(src, src_stride,
9985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  temp, intermediate_height,
9995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  filter_x, w, intermediate_height);
10005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
10025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
10035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* copy the src to dst */
10055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  if (filter_y[3] == 0x80) {
10065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    copy_horiz_transposed(temp + 3, intermediate_height,
10075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                          dst, dst_stride,
10085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                          h, w);
10095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else if (((const int32_t *)filter_y)[0] == 0) {
10107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpx_convolve2_dspr2(temp + 3, intermediate_height,
10115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        dst, dst_stride,
10125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        filter_y,
10135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        h, w);
10145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else {
10155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    switch (h) {
10165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 4:
10175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
10185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          dst, dst_stride,
10195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          filter_y, w);
10205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 8:
10225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
10235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          dst, dst_stride,
10245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                          filter_y, w);
10255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 16:
10275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 32:
10285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
10295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           dst, dst_stride,
10305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           filter_y, w, (h/16));
10315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      case 64:
10335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
10345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           dst, dst_stride,
10355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                           filter_y, w);
10365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      default:
10385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        convolve_horiz_transposed(temp, intermediate_height,
10395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  dst, dst_stride,
10405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  filter_y, h, w);
10415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        break;
10425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
10435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
10445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
10455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
10475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             uint8_t *dst, ptrdiff_t dst_stride,
10485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             const int16_t *filter_x, int filter_x_stride,
10495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             const int16_t *filter_y, int filter_y_stride,
10505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             int w, int h) {
10515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int x, y;
10525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* prefetch data to cache memory */
10547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  prefetch_load(src);
10557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  prefetch_load(src + 32);
10567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  prefetch_store(dst);
10575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  switch (w) {
10595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 4:
10605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
10615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1;
10625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 1 word storage */
10645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
10657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
10667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
10677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
10685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
10705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         (%[src])      \n\t"
10715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         (%[dst])      \n\t"  /* store */
10725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1)
10745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
10755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
10765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
10785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
10795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
10805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
10815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
10825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 8:
10835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
10845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1, tp2;
10855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 2 word storage */
10875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
10887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
10897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
10907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
10915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
10935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
10945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         4(%[src])      \n\t"
10955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
10965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
10975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
10985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
10995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
11005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
11015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
11035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
11045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
11075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 16:
11085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
11095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1, tp2, tp3, tp4;
11105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 4 word storage */
11125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
11137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
11147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
11157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
11165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
11185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
11195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         4(%[src])      \n\t"
11205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         8(%[src])      \n\t"
11215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[src])     \n\t"
11225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
11245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
11255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
11265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
11275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
11295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
11305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
11315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
11325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
11345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
11355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
11385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 32:
11395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
11405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1, tp2, tp3, tp4;
11415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp5, tp6, tp7, tp8;
11425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 8 word storage */
11445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
11457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
11467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
11477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
11485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
11505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
11515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         4(%[src])      \n\t"
11525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         8(%[src])      \n\t"
11535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[src])     \n\t"
11545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp5],         16(%[src])     \n\t"
11555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp6],         20(%[src])     \n\t"
11565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp7],         24(%[src])     \n\t"
11575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp8],         28(%[src])     \n\t"
11585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
11605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
11615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
11625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
11635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
11645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
11655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
11665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
11675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
11695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
11705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
11715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
11725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
11735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
11745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
11765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
11775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
11795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
11805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 64:
11815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      {
11825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp1, tp2, tp3, tp4;
11835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      uint32_t tp5, tp6, tp7, tp8;
11845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      prefetch_load(src + 64);
11867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian      prefetch_store(dst + 32);
11875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 16 word storage */
11895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
11907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
11917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
11927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 64);
11937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
11947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride + 32);
11955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
11965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __asm__ __volatile__ (
11975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
11985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         4(%[src])      \n\t"
11995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         8(%[src])      \n\t"
12005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[src])     \n\t"
12015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp5],         16(%[src])     \n\t"
12025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp6],         20(%[src])     \n\t"
12035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp7],         24(%[src])     \n\t"
12045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp8],         28(%[src])     \n\t"
12055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
12075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
12085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
12095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
12105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
12115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
12125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
12135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
12145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         32(%[src])     \n\t"
12165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         36(%[src])     \n\t"
12175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         40(%[src])     \n\t"
12185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         44(%[src])     \n\t"
12195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp5],         48(%[src])     \n\t"
12205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp6],         52(%[src])     \n\t"
12215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp7],         56(%[src])     \n\t"
12225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp8],         60(%[src])     \n\t"
12235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
12255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
12265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
12275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
12285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
12295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
12305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
12315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "sw               %[tp8],         60(%[dst])     \n\t"  /* store */
12325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
12345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
12355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
12365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
12375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            : [src] "r" (src), [dst] "r" (dst)
12385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        );
12395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
12415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
12425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
12435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
12445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
12455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    default:
12465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y--; ) {
12475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        for (x = 0; x < w; ++x) {
12485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          dst[x] = src[x];
12495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        }
12505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
12515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
12525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
12535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
12545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
12555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
12565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
12575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif
1258