15ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/* 25ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 35ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * 45ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * Use of this source code is governed by a BSD-style license 55ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * that can be found in the LICENSE file in the root of the source 65ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * tree. An additional intellectual property rights grant can be found 75ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * in the file PATENTS. All contributing project authors may 85ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang * be found in the AUTHORS file in the root of the source tree. 95ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang */ 105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <assert.h> 125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <stdio.h> 135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h" 157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_dsp/mips/convolve_common_dspr2.h" 167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_dsp/vpx_dsp_common.h" 177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_dsp/vpx_filter.h" 185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include "vpx_ports/mem.h" 195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2 215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_4_transposed_dspr2(const uint8_t *src, 225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t src_stride, 235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst, 245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t dst_stride, 255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int16_t *filter_x0, 265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t h) { 275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t y; 287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8_t *cm = vpx_ff_cropTbl; 295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst_ptr; 305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t vector1b, vector2b, vector3b, vector4b; 315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t Temp1, Temp2, Temp3, Temp4; 325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t vector4a = 64; 335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tp1, tp2; 345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t p1, p2, p3, p4; 355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tn1, tn2; 365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vector1b = ((const int32_t *)filter_x0)[0]; 385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vector2b = ((const int32_t *)filter_x0)[1]; 395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vector3b = ((const int32_t *)filter_x0)[2]; 405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vector4b = ((const int32_t *)filter_x0)[3]; 415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--;) { 435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst_ptr = dst; 445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch data to cache memory */ 457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride); 467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride + 32); 475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp1], 0(%[src]) \n\t" 505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp2], 4(%[src]) \n\t" 515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 1. pixel */ 535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac3 \n\t" 545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[tp1] \n\t" 565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p2], %[tp1] \n\t" 575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p3], %[tp2] \n\t" 585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p4], %[tp2] \n\t" 595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tn2], 8(%[src]) \n\t" 635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac3, 31 \n\t" 655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 2. pixel */ 675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac2 \n\t" 685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[tn2] \n\t" 705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "balign %[tn1], %[tn2], 3 \n\t" 715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "balign %[tn2], %[tp2], 3 \n\t" 725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "balign %[tp2], %[tp1], 3 \n\t" 735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" 765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" 775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac2, 31 \n\t" 785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 1. pixel */ 805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[tp1], %[Temp1](%[cm]) \n\t" 815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac3 \n\t" 825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[tp2] \n\t" 845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p2], %[tp2] \n\t" 855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p3], %[tn2] \n\t" 865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p4], %[tn2] \n\t" 875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac3, 31 \n\t" 925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 2. pixel */ 945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[tp2], %[Temp3](%[cm]) \n\t" 955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac2 \n\t" 965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[tn1] \n\t" 985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" 1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" 1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp4], $ac2, 31 \n\t" 1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* clamp */ 1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[tn1], %[Temp2](%[cm]) \n\t" 1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[p2], %[Temp4](%[cm]) \n\t" 1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* store bytes */ 1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[tp1], 0(%[dst_ptr]) \n\t" 1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[tn1], 0(%[dst_ptr]) \n\t" 1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[tp2], 0(%[dst_ptr]) \n\t" 1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[p2], 0(%[dst_ptr]) \n\t" 1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), 1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), 1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [dst_ptr] "+r" (dst_ptr) 1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), 1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector4a] "r" (vector4a), 1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) 1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* Next row... */ 1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += 1; 1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_8_transposed_dspr2(const uint8_t *src, 1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t src_stride, 1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst, 1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t dst_stride, 1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int16_t *filter_x0, 1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t h) { 1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t y; 1447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8_t *cm = vpx_ff_cropTbl; 1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst_ptr; 1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t vector4a = 64; 1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t vector1b, vector2b, vector3b, vector4b; 1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t Temp1, Temp2, Temp3; 1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tp1, tp2, tp3; 1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t p1, p2, p3, p4, n1; 1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *odd_dst; 1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t dst_pitch_2 = (dst_stride << 1); 1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vector1b = ((const int32_t *)filter_x0)[0]; 1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vector2b = ((const int32_t *)filter_x0)[1]; 1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vector3b = ((const int32_t *)filter_x0)[2]; 1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang vector4b = ((const int32_t *)filter_x0)[3]; 1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--;) { 1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch data to cache memory */ 1617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride); 1627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride + 32); 1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst_ptr = dst; 1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang odd_dst = (dst_ptr + dst_stride); 1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp2], 0(%[src]) \n\t" 1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp1], 4(%[src]) \n\t" 1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 1. pixel */ 1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac3 \n\t" 1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac2 \n\t" 1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[tp2] \n\t" 1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p2], %[tp2] \n\t" 1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p3], %[tp1] \n\t" 1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p4], %[tp1] \n\t" 1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp3], 8(%[src]) \n\t" 1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac3, 31 \n\t" 1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 2. pixel */ 1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[tp3] \n\t" 1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[n1], %[tp3] \n\t" 1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp2], 12(%[src]) \n\t" 1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" 1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" 1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac2, 31 \n\t" 1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 3. pixel */ 1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac1 \n\t" 2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p2], %[tp2] \n\t" 2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" 2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" 2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" 2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[tp3], %[Temp3](%[cm]) \n\t" 2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" 2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[p3], $ac1, 31 \n\t" 2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 4. pixel */ 2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac2 \n\t" 2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac3 \n\t" 2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[Temp2], 0(%[dst_ptr]) \n\t" 2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[tp3], 0(%[dst_ptr]) \n\t" 2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp1], 1(%[src]) \n\t" 2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp3], 5(%[src]) \n\t" 2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" 2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" 2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" 2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac2, 31 \n\t" 2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[tp2], %[p3](%[cm]) \n\t" 2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 1. pixel */ 2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac1 \n\t" 2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[tp1] \n\t" 2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p2], %[tp1] \n\t" 2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p3], %[tp3] \n\t" 2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p4], %[tp3] \n\t" 2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[tp2], 0(%[dst_ptr]) \n\t" 2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp2], 9(%[src]) \n\t" 2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 2435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac3, 31 \n\t" 2465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 2. pixel */ 2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[tp1], %[Temp3](%[cm]) \n\t" 2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac3 \n\t" 2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector4a], $ac2 \n\t" 2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[tp2] \n\t" 2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[n1], %[tp2] \n\t" 2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[Temp1], 13(%[src]) \n\t" 2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" 2575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[tp1], 0(%[dst_ptr]) \n\t" 2585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 2595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" 2605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" 2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" 2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac1, 31 \n\t" 2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 3. pixel */ 2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[tp3], %[Temp2](%[cm]) \n\t" 2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p2], %[Temp1] \n\t" 2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" 2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" 2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" 2705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" 2715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac3, 31 \n\t" 2725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 4. pixel */ 2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[tp3], 0(%[odd_dst]) \n\t" 2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" 2775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" 2785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" 2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 2805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac2, 31 \n\t" 2815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* clamp */ 2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[p4], %[Temp3](%[cm]) \n\t" 2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[p2], %[Temp2](%[cm]) \n\t" 2855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[n1], %[Temp1](%[cm]) \n\t" 2865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* store bytes */ 2885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[p4], 0(%[odd_dst]) \n\t" 2895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[p2], 0(%[odd_dst]) \n\t" 2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 2935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[n1], 0(%[odd_dst]) \n\t" 2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), 2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [n1] "=&r" (n1), 2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) 3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), 3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector4a] "r" (vector4a), [cm] "r" (cm), 3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) 3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* Next row... */ 3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += 1; 3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 3125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, 3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t src_stride, 3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst_ptr, 3165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t dst_stride, 3175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int16_t *filter_x0, 3185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t h, 3195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t count) { 3205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t c, y; 3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const uint8_t *src; 3225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst; 3237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8_t *cm = vpx_ff_cropTbl; 3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t vector_64 = 64; 3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t filter12, filter34, filter56, filter78; 3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t Temp1, Temp2, Temp3; 3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t qload1, qload2; 3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t p1, p2, p3, p4, p5; 3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t st1, st2, st3; 3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t dst_pitch_2 = (dst_stride << 1); 3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *odd_dst; 3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter12 = ((const int32_t *)filter_x0)[0]; 3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter34 = ((const int32_t *)filter_x0)[1]; 3355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter56 = ((const int32_t *)filter_x0)[2]; 3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter78 = ((const int32_t *)filter_x0)[3]; 3375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--;) { 3395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch data to cache memory */ 3407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src_ptr + src_stride); 3417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src_ptr + src_stride + 32); 3425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src = src_ptr; 3445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst = dst_ptr; 3455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang odd_dst = (dst + dst_stride); 3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (c = 0; c < count; c++) { 3495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 3505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 0(%[src]) \n\t" 3515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 4(%[src]) \n\t" 3525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 1. pixel */ 3545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 3555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 3565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 3585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p3], %[qload2] \n\t" 3595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p4], %[qload2] \n\t" 3605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[qload1] \n\t" 3615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p2], %[qload1] \n\t" 3625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 8(%[src]) \n\t" 3635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ 3645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ 3655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ 3665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ 3675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 3685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 2. pixel */ 3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[qload2] \n\t" 3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p5], %[qload2] \n\t" 3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 12(%[src]) \n\t" 3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ 3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ 3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ 3785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ 3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 3. pixel */ 3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p2], %[qload1] \n\t" 3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ 3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ 3895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ 3905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ 3915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ 3925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 3935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 3945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 3955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 4. pixel */ 3965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 3975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 3985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p3], %[qload1] \n\t" 3995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ 4005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 4015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 16(%[src]) \n\t" 4025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ 4035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ 4045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ 4055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ 4065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 4085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 5. pixel */ 4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 4125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p4], %[qload2] \n\t" 4135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ 4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ 4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ 4175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ 4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ 4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 6. pixel */ 4235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 4245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 4255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p1], %[qload2] \n\t" 4265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ 4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 4285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 20(%[src]) \n\t" 4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ 4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ 4315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ 4325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ 4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 4345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 4355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 7. pixel */ 4375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 4385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 4395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p5], %[qload1] \n\t" 4405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ 4415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 4425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ 4435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ 4445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ 4455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ 4465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 4475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 4485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 8. pixel */ 4505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 4515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 4525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ 4535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ 4545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ 4555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 4565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ 4575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ 4585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 4595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 4605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* ODD pixels */ 4625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 1(%[src]) \n\t" 4635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 5(%[src]) \n\t" 4645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 1. pixel */ 4665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 4675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 4685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[qload1] \n\t" 4695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p2], %[qload1] \n\t" 4705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p3], %[qload2] \n\t" 4715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p4], %[qload2] \n\t" 4725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ 4735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 9(%[src]) \n\t" 4755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ 4765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ 4775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ 4785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ 4795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 4805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 4815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 2. pixel */ 4835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 4845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 4855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[qload2] \n\t" 4865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p5], %[qload2] \n\t" 4875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ 4885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 13(%[src]) \n\t" 4895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ 4905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ 4915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ 4925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ 4935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 4945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 4955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 4965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 3. pixel */ 4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 4985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 4995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p2], %[qload1] \n\t" 5005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ 5015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 5025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ 5035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ 5045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ 5055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ 5065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 5075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 5085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 4. pixel */ 5105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 5115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 5125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p3], %[qload1] \n\t" 5135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ 5145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 5155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 17(%[src]) \n\t" 5165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ 5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ 5185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ 5195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ 5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 5225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 5. pixel */ 5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p4], %[qload2] \n\t" 5275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ 5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 5295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ 5305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ 5315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ 5325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ 5335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 5345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 5355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 6. pixel */ 5375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 5385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 5395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p1], %[qload2] \n\t" 5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ 5415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 5425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 21(%[src]) \n\t" 5435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ 5445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ 5455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ 5465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ 5475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 5485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 5495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 7. pixel */ 5515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 5525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 5535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p5], %[qload1] \n\t" 5545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ 5555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 5565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ 5575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ 5585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ 5595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ 5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 8. pixel */ 5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ 5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ 5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ 5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ 5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 5685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 5705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 5725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ 5745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 5755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ 5775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 5785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ 5805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), 5825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), 5835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 5845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 5855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) 5865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [filter12] "r" (filter12), [filter34] "r" (filter34), 5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [filter56] "r" (filter56), [filter78] "r" (filter78), 5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector_64] "r" (vector_64), [cm] "r" (cm), 5895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) 5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += 16; 5935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); 5945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang odd_dst = (dst + dst_stride); 5955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 5965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 5975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* Next row... */ 5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src_ptr += src_stride; 5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst_ptr += 1; 6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr, 6055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t src_stride, 6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst_ptr, 6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t dst_stride, 6085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int16_t *filter_x0, 6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t h) { 6105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t c, y; 6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const uint8_t *src; 6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst; 6137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8_t *cm = vpx_ff_cropTbl; 6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t vector_64 = 64; 6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t filter12, filter34, filter56, filter78; 6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t Temp1, Temp2, Temp3; 6175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t qload1, qload2; 6185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t p1, p2, p3, p4, p5; 6195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t st1, st2, st3; 6205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t dst_pitch_2 = (dst_stride << 1); 6215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *odd_dst; 6225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter12 = ((const int32_t *)filter_x0)[0]; 6245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter34 = ((const int32_t *)filter_x0)[1]; 6255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter56 = ((const int32_t *)filter_x0)[2]; 6265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter78 = ((const int32_t *)filter_x0)[3]; 6275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--;) { 6295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch data to cache memory */ 6307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src_ptr + src_stride); 6317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src_ptr + src_stride + 32); 6327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src_ptr + src_stride + 64); 6335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src = src_ptr; 6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst = dst_ptr; 6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang odd_dst = (dst + dst_stride); 6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (c = 0; c < 4; c++) { 6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 0(%[src]) \n\t" 6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 4(%[src]) \n\t" 6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 1. pixel */ 6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 6475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 6495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p3], %[qload2] \n\t" 6505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p4], %[qload2] \n\t" 6515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[qload1] \n\t" 6525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p2], %[qload1] \n\t" 6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 8(%[src]) \n\t" 6545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ 6555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ 6565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ 6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ 6585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 6595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 2. pixel */ 6615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 6625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 6635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[qload2] \n\t" 6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p5], %[qload2] \n\t" 6655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 12(%[src]) \n\t" 6665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ 6675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ 6685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ 6695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ 6705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 6715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 6725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 3. pixel */ 6745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 6755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 6765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p2], %[qload1] \n\t" 6775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ 6785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 6795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ 6805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ 6815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ 6825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ 6835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 6845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 6855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 6865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 4. pixel */ 6875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 6885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 6895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p3], %[qload1] \n\t" 6905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ 6915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 6925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 16(%[src]) \n\t" 6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ 6945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ 6955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ 6965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ 6975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 6985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 6995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 5. pixel */ 7015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 7025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 7035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p4], %[qload2] \n\t" 7045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ 7055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 7065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ 7075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ 7085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ 7095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ 7105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 7115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 7125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 6. pixel */ 7145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 7155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 7165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p1], %[qload2] \n\t" 7175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ 7185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 7195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 20(%[src]) \n\t" 7205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ 7215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ 7225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ 7235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ 7245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 7255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 7265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 7. pixel */ 7285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 7295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 7305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p5], %[qload1] \n\t" 7315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ 7325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 7335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ 7345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ 7355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ 7365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ 7375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 7385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 7395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* even 8. pixel */ 7415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 7425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 7435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ 7445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ 7455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ 7465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 7475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ 7485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ 7495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 7505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 7515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* ODD pixels */ 7535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 1(%[src]) \n\t" 7545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 5(%[src]) \n\t" 7555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 1. pixel */ 7575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 7585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 7595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[qload1] \n\t" 7605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p2], %[qload1] \n\t" 7615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p3], %[qload2] \n\t" 7625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p4], %[qload2] \n\t" 7635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ 7645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 7655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 9(%[src]) \n\t" 7665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ 7675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ 7685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ 7695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ 7705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 7715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 7725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 2. pixel */ 7745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 7755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 7765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p1], %[qload2] \n\t" 7775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p5], %[qload2] \n\t" 7785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ 7795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 13(%[src]) \n\t" 7805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ 7815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ 7825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ 7835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ 7845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 7855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 7865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 7875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 3. pixel */ 7885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 7895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 7905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p2], %[qload1] \n\t" 7915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ 7925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 7935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ 7945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ 7955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ 7965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ 7975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 7985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 7995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 4. pixel */ 8015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 8025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 8035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p3], %[qload1] \n\t" 8045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ 8055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 8065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload2], 17(%[src]) \n\t" 8075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ 8085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ 8095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ 8105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ 8115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 8125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 8135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 5. pixel */ 8155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 8165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac2 \n\t" 8175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p4], %[qload2] \n\t" 8185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ 8195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 8205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ 8215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ 8225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ 8235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ 8245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 8255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 8265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 6. pixel */ 8285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 8295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac3 \n\t" 8305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbl %[p1], %[qload2] \n\t" 8315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ 8325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 8335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[qload1], 21(%[src]) \n\t" 8345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ 8355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ 8365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ 8375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ 8385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 8395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 8405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 7. pixel */ 8425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 8435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "mthi $zero, $ac1 \n\t" 8445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "preceu.ph.qbr %[p5], %[qload1] \n\t" 8455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ 8465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 8475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ 8485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ 8495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ 8505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ 8515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 8525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* odd 8. pixel */ 8545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ 8555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ 8565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ 8575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ 8585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 8595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 8615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 8625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 8635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ 8655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 8665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ 8685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 8695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ 8715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), 8735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), 8745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 8755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 8765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) 8775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [filter12] "r" (filter12), [filter34] "r" (filter34), 8785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [filter56] "r" (filter56), [filter78] "r" (filter78), 8795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [vector_64] "r" (vector_64), [cm] "r" (cm), 8805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) 8815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 8825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += 16; 8845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); 8855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang odd_dst = (dst + dst_stride); 8865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 8875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* Next row... */ 8895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src_ptr += src_stride; 8905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst_ptr += 1; 8925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 8935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 8945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 8955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, 8965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst, ptrdiff_t dst_stride, 8975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int16_t *filter, int w, int h) { 8985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x, y, k; 8995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = 0; y < h; ++y) { 9015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (x = 0; x < w; ++x) { 9025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int sum = 0; 9035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (k = 0; k < 8; ++k) 9055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang sum += src[x + k] * filter[k]; 9065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); 9085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 9095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 9115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += 1; 9125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 9135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 9145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangvoid copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, 9165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst, ptrdiff_t dst_stride, 9175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int w, int h) { 9185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x, y; 9195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = 0; y < h; ++y) { 9215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (x = 0; x < w; ++x) { 9225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst[x * dst_stride] = src[x]; 9235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 9245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 9265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += 1; 9275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 9285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 9295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, 9315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst, ptrdiff_t dst_stride, 9325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int16_t *filter_x, int x_step_q4, 9335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int16_t *filter_y, int y_step_q4, 9345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int w, int h) { 9357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); 9365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; 9375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t pos = 38; 9385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian assert(x_step_q4 == 16); 9407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian assert(y_step_q4 == 16); 9417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian assert(((const int32_t *)filter_x)[1] != 0x800000); 9427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian assert(((const int32_t *)filter_y)[1] != 0x800000); 9437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 9447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 9455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* bit positon for extract from acc */ 9465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 9475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "wrdsp %[pos], 1 \n\t" 9485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : 9495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [pos] "r" (pos) 9505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 9515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang if (intermediate_height < h) 9535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang intermediate_height = h; 9545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* copy the src to dst */ 9565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang if (filter_x[3] == 0x80) { 9575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang copy_horiz_transposed(src - src_stride * 3, src_stride, 9585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang temp, intermediate_height, 9595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang w, intermediate_height); 9605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } else if (((const int32_t *)filter_x)[0] == 0) { 9617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpx_convolve2_dspr2(src - src_stride * 3, src_stride, 9625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang temp, intermediate_height, 9635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_x, 9645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang w, intermediate_height); 9655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } else { 9665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src -= (src_stride * 3 + 3); 9675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch data to cache memory */ 9697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src); 9707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + 32); 9715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 9725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang switch (w) { 9735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 4: 9745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_4_transposed_dspr2(src, src_stride, 9755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang temp, intermediate_height, 9765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_x, intermediate_height); 9775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 9785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 8: 9795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_8_transposed_dspr2(src, src_stride, 9805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang temp, intermediate_height, 9815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_x, intermediate_height); 9825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 9835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 16: 9845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 32: 9855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_16_transposed_dspr2(src, src_stride, 9865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang temp, intermediate_height, 9875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_x, intermediate_height, 9885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang (w/16)); 9895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 9905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 64: 9917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + 32); 9925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_64_transposed_dspr2(src, src_stride, 9935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang temp, intermediate_height, 9945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_x, intermediate_height); 9955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 9965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang default: 9975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_transposed(src, src_stride, 9985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang temp, intermediate_height, 9995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_x, w, intermediate_height); 10005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 10015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 10025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 10035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* copy the src to dst */ 10055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang if (filter_y[3] == 0x80) { 10065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang copy_horiz_transposed(temp + 3, intermediate_height, 10075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst, dst_stride, 10085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang h, w); 10095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } else if (((const int32_t *)filter_y)[0] == 0) { 10107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpx_convolve2_dspr2(temp + 3, intermediate_height, 10115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst, dst_stride, 10125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_y, 10135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang h, w); 10145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } else { 10155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang switch (h) { 10165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 4: 10175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_4_transposed_dspr2(temp, intermediate_height, 10185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst, dst_stride, 10195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_y, w); 10205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 10215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 8: 10225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_8_transposed_dspr2(temp, intermediate_height, 10235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst, dst_stride, 10245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_y, w); 10255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 10265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 16: 10275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 32: 10285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_16_transposed_dspr2(temp, intermediate_height, 10295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst, dst_stride, 10305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_y, w, (h/16)); 10315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 10325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 64: 10335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_64_transposed_dspr2(temp, intermediate_height, 10345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst, dst_stride, 10355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_y, w); 10365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 10375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang default: 10385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang convolve_horiz_transposed(temp, intermediate_height, 10395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst, dst_stride, 10405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang filter_y, h, w); 10415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 10425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 10435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 10445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 10455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, 10475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint8_t *dst, ptrdiff_t dst_stride, 10485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int16_t *filter_x, int filter_x_stride, 10495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang const int16_t *filter_y, int filter_y_stride, 10505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int w, int h) { 10515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang int x, y; 10525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* prefetch data to cache memory */ 10547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src); 10557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + 32); 10567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_store(dst); 10575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang switch (w) { 10595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 4: 10605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { 10615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tp1; 10625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* 1 word storage */ 10645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--; ) { 10657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride); 10667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride + 32); 10677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_store(dst + dst_stride); 10685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 10705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp1], (%[src]) \n\t" 10715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp1], (%[dst]) \n\t" /* store */ 10725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [tp1] "=&r" (tp1) 10745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [src] "r" (src), [dst] "r" (dst) 10755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 10765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 10785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += dst_stride; 10795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 10805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 10815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 10825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 8: 10835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { 10845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tp1, tp2; 10855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* 2 word storage */ 10875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--; ) { 10887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride); 10897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride + 32); 10907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_store(dst + dst_stride); 10915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 10935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp1], 0(%[src]) \n\t" 10945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp2], 4(%[src]) \n\t" 10955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp1], 0(%[dst]) \n\t" /* store */ 10965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp2], 4(%[dst]) \n\t" /* store */ 10975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 10985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2) 10995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [src] "r" (src), [dst] "r" (dst) 11005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 11015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 11035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += dst_stride; 11045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 11075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 16: 11085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { 11095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tp1, tp2, tp3, tp4; 11105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* 4 word storage */ 11125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--; ) { 11137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride); 11147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride + 32); 11157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_store(dst + dst_stride); 11165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 11185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp1], 0(%[src]) \n\t" 11195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp2], 4(%[src]) \n\t" 11205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp3], 8(%[src]) \n\t" 11215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp4], 12(%[src]) \n\t" 11225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp1], 0(%[dst]) \n\t" /* store */ 11245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp2], 4(%[dst]) \n\t" /* store */ 11255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp3], 8(%[dst]) \n\t" /* store */ 11265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp4], 12(%[dst]) \n\t" /* store */ 11275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), 11295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [tp3] "=&r" (tp3), [tp4] "=&r" (tp4) 11305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [src] "r" (src), [dst] "r" (dst) 11315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 11325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 11345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += dst_stride; 11355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 11385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 32: 11395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { 11405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tp1, tp2, tp3, tp4; 11415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tp5, tp6, tp7, tp8; 11425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* 8 word storage */ 11445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--; ) { 11457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride); 11467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride + 32); 11477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_store(dst + dst_stride); 11485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 11505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp1], 0(%[src]) \n\t" 11515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp2], 4(%[src]) \n\t" 11525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp3], 8(%[src]) \n\t" 11535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp4], 12(%[src]) \n\t" 11545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp5], 16(%[src]) \n\t" 11555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp6], 20(%[src]) \n\t" 11565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp7], 24(%[src]) \n\t" 11575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp8], 28(%[src]) \n\t" 11585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp1], 0(%[dst]) \n\t" /* store */ 11605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp2], 4(%[dst]) \n\t" /* store */ 11615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp3], 8(%[dst]) \n\t" /* store */ 11625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp4], 12(%[dst]) \n\t" /* store */ 11635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp5], 16(%[dst]) \n\t" /* store */ 11645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp6], 20(%[dst]) \n\t" /* store */ 11655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp7], 24(%[dst]) \n\t" /* store */ 11665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp8], 28(%[dst]) \n\t" /* store */ 11675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), 11695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), 11705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), 11715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) 11725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [src] "r" (src), [dst] "r" (dst) 11735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 11745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 11765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += dst_stride; 11775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 11795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 11805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang case 64: 11815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang { 11825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tp1, tp2, tp3, tp4; 11835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang uint32_t tp5, tp6, tp7, tp8; 11845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + 64); 11867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_store(dst + 32); 11875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang /* 16 word storage */ 11895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--; ) { 11907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride); 11917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride + 32); 11927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_load(src + src_stride + 64); 11937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_store(dst + dst_stride); 11947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian prefetch_store(dst + dst_stride + 32); 11955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 11965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang __asm__ __volatile__ ( 11975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp1], 0(%[src]) \n\t" 11985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp2], 4(%[src]) \n\t" 11995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp3], 8(%[src]) \n\t" 12005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp4], 12(%[src]) \n\t" 12015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp5], 16(%[src]) \n\t" 12025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp6], 20(%[src]) \n\t" 12035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp7], 24(%[src]) \n\t" 12045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp8], 28(%[src]) \n\t" 12055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp1], 0(%[dst]) \n\t" /* store */ 12075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp2], 4(%[dst]) \n\t" /* store */ 12085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp3], 8(%[dst]) \n\t" /* store */ 12095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp4], 12(%[dst]) \n\t" /* store */ 12105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp5], 16(%[dst]) \n\t" /* store */ 12115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp6], 20(%[dst]) \n\t" /* store */ 12125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp7], 24(%[dst]) \n\t" /* store */ 12135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp8], 28(%[dst]) \n\t" /* store */ 12145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp1], 32(%[src]) \n\t" 12165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp2], 36(%[src]) \n\t" 12175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp3], 40(%[src]) \n\t" 12185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp4], 44(%[src]) \n\t" 12195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp5], 48(%[src]) \n\t" 12205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp6], 52(%[src]) \n\t" 12215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp7], 56(%[src]) \n\t" 12225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "ulw %[tp8], 60(%[src]) \n\t" 12235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp1], 32(%[dst]) \n\t" /* store */ 12255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp2], 36(%[dst]) \n\t" /* store */ 12265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp3], 40(%[dst]) \n\t" /* store */ 12275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp4], 44(%[dst]) \n\t" /* store */ 12285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp5], 48(%[dst]) \n\t" /* store */ 12295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp6], 52(%[dst]) \n\t" /* store */ 12305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp7], 56(%[dst]) \n\t" /* store */ 12315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang "sw %[tp8], 60(%[dst]) \n\t" /* store */ 12325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), 12345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), 12355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), 12365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) 12375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang : [src] "r" (src), [dst] "r" (dst) 12385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang ); 12395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 12415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += dst_stride; 12425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 12435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 12445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 12455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang default: 12465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (y = h; y--; ) { 12475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang for (x = 0; x < w; ++x) { 12485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst[x] = src[x]; 12495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 12505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang 12515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang src += src_stride; 12525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang dst += dst_stride; 12535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 12545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang break; 12555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang } 12565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang} 12575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif 1258