17c8da7ce66017295a65ec028084b90800be377f8James Zern// Copyright 2014 Google Inc. All Rights Reserved. 27c8da7ce66017295a65ec028084b90800be377f8James Zern// 37c8da7ce66017295a65ec028084b90800be377f8James Zern// Use of this source code is governed by a BSD-style license 47c8da7ce66017295a65ec028084b90800be377f8James Zern// that can be found in the COPYING file in the root of the source 57c8da7ce66017295a65ec028084b90800be377f8James Zern// tree. An additional intellectual property rights grant can be found 67c8da7ce66017295a65ec028084b90800be377f8James Zern// in the file PATENTS. All contributing project authors may 77c8da7ce66017295a65ec028084b90800be377f8James Zern// be found in the AUTHORS file in the root of the source tree. 87c8da7ce66017295a65ec028084b90800be377f8James Zern// ----------------------------------------------------------------------------- 97c8da7ce66017295a65ec028084b90800be377f8James Zern// 107c8da7ce66017295a65ec028084b90800be377f8James Zern// MIPS version of dsp functions 117c8da7ce66017295a65ec028084b90800be377f8James Zern// 127c8da7ce66017295a65ec028084b90800be377f8James Zern// Author(s): Djordje Pesut (djordje.pesut@imgtec.com) 137c8da7ce66017295a65ec028084b90800be377f8James Zern// Jovan Zelincevic (jovan.zelincevic@imgtec.com) 147c8da7ce66017295a65ec028084b90800be377f8James Zern 157c8da7ce66017295a65ec028084b90800be377f8James Zern#include "./dsp.h" 167c8da7ce66017295a65ec028084b90800be377f8James Zern 177c8da7ce66017295a65ec028084b90800be377f8James Zern#if defined(WEBP_USE_MIPS_DSP_R2) 187c8da7ce66017295a65ec028084b90800be377f8James Zern 197c8da7ce66017295a65ec028084b90800be377f8James Zern#include "./mips_macro.h" 207c8da7ce66017295a65ec028084b90800be377f8James Zern 217c8da7ce66017295a65ec028084b90800be377f8James Zernstatic const int kC1 = 20091 + (1 << 16); 227c8da7ce66017295a65ec028084b90800be377f8James Zernstatic const int kC2 = 35468; 237c8da7ce66017295a65ec028084b90800be377f8James Zern 247c8da7ce66017295a65ec028084b90800be377f8James Zern#define MUL(a, b) (((a) * (b)) >> 16) 257c8da7ce66017295a65ec028084b90800be377f8James Zern 267c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void TransformDC(const int16_t* in, uint8_t* dst) { 277c8da7ce66017295a65ec028084b90800be377f8James Zern int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10; 287c8da7ce66017295a65ec028084b90800be377f8James Zern 297c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 307c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst, 317c8da7ce66017295a65ec028084b90800be377f8James Zern 0, 0, 0, 0, 327c8da7ce66017295a65ec028084b90800be377f8James Zern 0, 1, 2, 3, 337c8da7ce66017295a65ec028084b90800be377f8James Zern BPS) 347c8da7ce66017295a65ec028084b90800be377f8James Zern "lh %[temp5], 0(%[in]) \n\t" 357c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[temp5], %[temp5], 4 \n\t" 367c8da7ce66017295a65ec028084b90800be377f8James Zern "ins %[temp5], %[temp5], 16, 16 \n\t" 377c8da7ce66017295a65ec028084b90800be377f8James Zern "shra.ph %[temp5], %[temp5], 3 \n\t" 387c8da7ce66017295a65ec028084b90800be377f8James Zern CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2, 397c8da7ce66017295a65ec028084b90800be377f8James Zern temp3, temp1, temp2, temp3, temp4) 407c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3, 417c8da7ce66017295a65ec028084b90800be377f8James Zern temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5, 427c8da7ce66017295a65ec028084b90800be377f8James Zern dst, 0, 1, 2, 3, BPS) 437c8da7ce66017295a65ec028084b90800be377f8James Zern 447c8da7ce66017295a65ec028084b90800be377f8James Zern OUTPUT_EARLY_CLOBBER_REGS_10() 457c8da7ce66017295a65ec028084b90800be377f8James Zern : [in]"r"(in), [dst]"r"(dst) 467c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 477c8da7ce66017295a65ec028084b90800be377f8James Zern ); 487c8da7ce66017295a65ec028084b90800be377f8James Zern} 497c8da7ce66017295a65ec028084b90800be377f8James Zern 507c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void TransformAC3(const int16_t* in, uint8_t* dst) { 517c8da7ce66017295a65ec028084b90800be377f8James Zern const int a = in[0] + 4; 527c8da7ce66017295a65ec028084b90800be377f8James Zern int c4 = MUL(in[4], kC2); 537c8da7ce66017295a65ec028084b90800be377f8James Zern const int d4 = MUL(in[4], kC1); 547c8da7ce66017295a65ec028084b90800be377f8James Zern const int c1 = MUL(in[1], kC2); 557c8da7ce66017295a65ec028084b90800be377f8James Zern const int d1 = MUL(in[1], kC1); 567c8da7ce66017295a65ec028084b90800be377f8James Zern int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; 577c8da7ce66017295a65ec028084b90800be377f8James Zern int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; 587c8da7ce66017295a65ec028084b90800be377f8James Zern 597c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 607c8da7ce66017295a65ec028084b90800be377f8James Zern "ins %[c4], %[d4], 16, 16 \n\t" 617c8da7ce66017295a65ec028084b90800be377f8James Zern "replv.ph %[temp1], %[a] \n\t" 627c8da7ce66017295a65ec028084b90800be377f8James Zern "replv.ph %[temp4], %[d1] \n\t" 637c8da7ce66017295a65ec028084b90800be377f8James Zern ADD_SUB_HALVES(temp2, temp3, temp1, c4) 647c8da7ce66017295a65ec028084b90800be377f8James Zern "replv.ph %[temp5], %[c1] \n\t" 657c8da7ce66017295a65ec028084b90800be377f8James Zern SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4, 667c8da7ce66017295a65ec028084b90800be377f8James Zern temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5) 677c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst, 687c8da7ce66017295a65ec028084b90800be377f8James Zern 0, 0, 0, 0, 697c8da7ce66017295a65ec028084b90800be377f8James Zern 0, 1, 2, 3, 707c8da7ce66017295a65ec028084b90800be377f8James Zern BPS) 717c8da7ce66017295a65ec028084b90800be377f8James Zern CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16, 727c8da7ce66017295a65ec028084b90800be377f8James Zern temp11, temp17, temp3, temp5, temp11, temp12) 737c8da7ce66017295a65ec028084b90800be377f8James Zern PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2, 747c8da7ce66017295a65ec028084b90800be377f8James Zern temp4, temp7, temp6, temp10, temp9) 757c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11, 767c8da7ce66017295a65ec028084b90800be377f8James Zern temp17, temp12, temp18, temp1, temp8, temp2, temp4, 777c8da7ce66017295a65ec028084b90800be377f8James Zern temp7, temp6, dst, 0, 1, 2, 3, BPS) 787c8da7ce66017295a65ec028084b90800be377f8James Zern 797c8da7ce66017295a65ec028084b90800be377f8James Zern OUTPUT_EARLY_CLOBBER_REGS_18(), 807c8da7ce66017295a65ec028084b90800be377f8James Zern [c4]"+&r"(c4) 817c8da7ce66017295a65ec028084b90800be377f8James Zern : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1) 827c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 837c8da7ce66017295a65ec028084b90800be377f8James Zern ); 847c8da7ce66017295a65ec028084b90800be377f8James Zern} 857c8da7ce66017295a65ec028084b90800be377f8James Zern 867c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void TransformOne(const int16_t* in, uint8_t* dst) { 877c8da7ce66017295a65ec028084b90800be377f8James Zern int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; 887c8da7ce66017295a65ec028084b90800be377f8James Zern int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; 897c8da7ce66017295a65ec028084b90800be377f8James Zern 907c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 917c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp1], 0(%[in]) \n\t" 927c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp2], 16(%[in]) \n\t" 937c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_IN_X2(temp5, temp6, 24, 26) 947c8da7ce66017295a65ec028084b90800be377f8James Zern ADD_SUB_HALVES(temp3, temp4, temp1, temp2) 957c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_IN_X2(temp1, temp2, 8, 10) 967c8da7ce66017295a65ec028084b90800be377f8James Zern MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14, 977c8da7ce66017295a65ec028084b90800be377f8James Zern temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6, 987c8da7ce66017295a65ec028084b90800be377f8James Zern temp13, temp11, temp14, temp12) 997c8da7ce66017295a65ec028084b90800be377f8James Zern INSERT_HALF_X2(temp8, temp7, temp10, temp9) 1007c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp17], 4(%[in]) \n\t" 1017c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp18], 20(%[in]) \n\t" 1027c8da7ce66017295a65ec028084b90800be377f8James Zern ADD_SUB_HALVES(temp1, temp2, temp3, temp8) 1037c8da7ce66017295a65ec028084b90800be377f8James Zern ADD_SUB_HALVES(temp5, temp6, temp4, temp7) 1047c8da7ce66017295a65ec028084b90800be377f8James Zern ADD_SUB_HALVES(temp7, temp8, temp17, temp18) 1057c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_IN_X2(temp17, temp18, 12, 14) 1067c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_IN_X2(temp9, temp10, 28, 30) 1077c8da7ce66017295a65ec028084b90800be377f8James Zern MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17, 1087c8da7ce66017295a65ec028084b90800be377f8James Zern temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10, 1097c8da7ce66017295a65ec028084b90800be377f8James Zern temp15, temp4, temp16, temp17) 1107c8da7ce66017295a65ec028084b90800be377f8James Zern INSERT_HALF_X2(temp11, temp12, temp13, temp14) 1117c8da7ce66017295a65ec028084b90800be377f8James Zern ADD_SUB_HALVES(temp17, temp8, temp8, temp11) 1127c8da7ce66017295a65ec028084b90800be377f8James Zern ADD_SUB_HALVES(temp3, temp4, temp7, temp12) 1137c8da7ce66017295a65ec028084b90800be377f8James Zern 1147c8da7ce66017295a65ec028084b90800be377f8James Zern // horizontal 1157c8da7ce66017295a65ec028084b90800be377f8James Zern SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6) 1167c8da7ce66017295a65ec028084b90800be377f8James Zern INSERT_HALF_X2(temp1, temp6, temp5, temp2) 1177c8da7ce66017295a65ec028084b90800be377f8James Zern SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8) 1187c8da7ce66017295a65ec028084b90800be377f8James Zern "repl.ph %[temp2], 0x4 \n\t" 1197c8da7ce66017295a65ec028084b90800be377f8James Zern INSERT_HALF_X2(temp3, temp8, temp17, temp4) 1207c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp1], %[temp1], %[temp2] \n\t" 1217c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp6], %[temp6], %[temp2] \n\t" 1227c8da7ce66017295a65ec028084b90800be377f8James Zern ADD_SUB_HALVES(temp2, temp4, temp1, temp3) 1237c8da7ce66017295a65ec028084b90800be377f8James Zern ADD_SUB_HALVES(temp5, temp7, temp6, temp8) 1247c8da7ce66017295a65ec028084b90800be377f8James Zern MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18, 1257c8da7ce66017295a65ec028084b90800be377f8James Zern temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15, 1267c8da7ce66017295a65ec028084b90800be377f8James Zern temp6, temp17, temp8, temp18) 1277c8da7ce66017295a65ec028084b90800be377f8James Zern MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16, 1287c8da7ce66017295a65ec028084b90800be377f8James Zern temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14, 1297c8da7ce66017295a65ec028084b90800be377f8James Zern temp18, temp12, temp17, temp16) 1307c8da7ce66017295a65ec028084b90800be377f8James Zern INSERT_HALF_X2(temp1, temp3, temp9, temp13) 1317c8da7ce66017295a65ec028084b90800be377f8James Zern INSERT_HALF_X2(temp6, temp8, temp11, temp15) 1327c8da7ce66017295a65ec028084b90800be377f8James Zern SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15, 1337c8da7ce66017295a65ec028084b90800be377f8James Zern temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8, 1347c8da7ce66017295a65ec028084b90800be377f8James Zern temp6) 1357c8da7ce66017295a65ec028084b90800be377f8James Zern PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13, 1367c8da7ce66017295a65ec028084b90800be377f8James Zern temp16, temp11, temp10, temp15, temp14) 1377c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst, 1387c8da7ce66017295a65ec028084b90800be377f8James Zern 0, 0, 0, 0, 1397c8da7ce66017295a65ec028084b90800be377f8James Zern 0, 1, 2, 3, 1407c8da7ce66017295a65ec028084b90800be377f8James Zern BPS) 1417c8da7ce66017295a65ec028084b90800be377f8James Zern CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10, 1427c8da7ce66017295a65ec028084b90800be377f8James Zern temp11, temp10, temp11, temp14, temp15) 1437c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11, 1447c8da7ce66017295a65ec028084b90800be377f8James Zern temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4, 1457c8da7ce66017295a65ec028084b90800be377f8James Zern dst, 0, 1, 2, 3, BPS) 1467c8da7ce66017295a65ec028084b90800be377f8James Zern 1477c8da7ce66017295a65ec028084b90800be377f8James Zern OUTPUT_EARLY_CLOBBER_REGS_18() 1487c8da7ce66017295a65ec028084b90800be377f8James Zern : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2) 1497c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory", "hi", "lo" 1507c8da7ce66017295a65ec028084b90800be377f8James Zern ); 1517c8da7ce66017295a65ec028084b90800be377f8James Zern} 1527c8da7ce66017295a65ec028084b90800be377f8James Zern 1537c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { 1547c8da7ce66017295a65ec028084b90800be377f8James Zern TransformOne(in, dst); 1557c8da7ce66017295a65ec028084b90800be377f8James Zern if (do_two) { 1567c8da7ce66017295a65ec028084b90800be377f8James Zern TransformOne(in + 16, dst + 4); 1577c8da7ce66017295a65ec028084b90800be377f8James Zern } 1587c8da7ce66017295a65ec028084b90800be377f8James Zern} 1597c8da7ce66017295a65ec028084b90800be377f8James Zern 1607c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void FilterLoop26(uint8_t* p, 1617c8da7ce66017295a65ec028084b90800be377f8James Zern int hstride, int vstride, int size, 1627c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 1637c8da7ce66017295a65ec028084b90800be377f8James Zern const int thresh2 = 2 * thresh + 1; 1647c8da7ce66017295a65ec028084b90800be377f8James Zern int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; 1657c8da7ce66017295a65ec028084b90800be377f8James Zern int temp10, temp11, temp12, temp13, temp14, temp15; 1667c8da7ce66017295a65ec028084b90800be377f8James Zern 1677c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 1687c8da7ce66017295a65ec028084b90800be377f8James Zern ".set push \n\t" 1697c8da7ce66017295a65ec028084b90800be377f8James Zern ".set noreorder \n\t" 1707c8da7ce66017295a65ec028084b90800be377f8James Zern "1: \n\t" 1717c8da7ce66017295a65ec028084b90800be377f8James Zern "negu %[temp1], %[hstride] \n\t" 1727c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[size], %[size], -1 \n\t" 1737c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp2], %[hstride], 1 \n\t" 1747c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp3], %[temp1], 1 \n\t" 1757c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp4], %[temp2], %[hstride] \n\t" 1767c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp5], %[temp3], %[temp1] \n\t" 1777c8da7ce66017295a65ec028084b90800be377f8James Zern "lbu %[temp7], 0(%[p]) \n\t" 1787c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp6], %[temp3], 1 \n\t" 1797c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp8], %[temp5](%[p]) \n\t" 1807c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp9], %[temp3](%[p]) \n\t" 1817c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp10], %[temp1](%[p]) \n\t" 1827c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp11], %[temp6](%[p]) \n\t" 1837c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp12], %[hstride](%[p]) \n\t" 1847c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp13], %[temp2](%[p]) \n\t" 1857c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp14], %[temp4](%[p]) \n\t" 1867c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp1], %[temp10], %[temp7] \n\t" 1877c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp2], %[temp9], %[temp12] \n\t" 1887c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp3], %[temp1] \n\t" 1897c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp4], %[temp2] \n\t" 1907c8da7ce66017295a65ec028084b90800be377f8James Zern "negu %[temp1], %[temp1] \n\t" 1917c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp3], %[temp3], 2 \n\t" 1927c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp15], %[temp3], %[temp4] \n\t" 1937c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp3], %[temp15], %[thresh2] \n\t" 1947c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp6], %[temp1], 1 \n\t" 1957c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp3], 3f \n\t" 1967c8da7ce66017295a65ec028084b90800be377f8James Zern " subu %[temp4], %[temp11], %[temp8] \n\t" 1977c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp4], %[temp4] \n\t" 1987c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp2], %[temp2], 24 \n\t" 1997c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp4], %[temp4], %[ithresh] \n\t" 2007c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp4], 3f \n\t" 2017c8da7ce66017295a65ec028084b90800be377f8James Zern " subu %[temp3], %[temp8], %[temp9] \n\t" 2027c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp3], %[temp3] \n\t" 2037c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp3], %[temp3], %[ithresh] \n\t" 2047c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp3], 3f \n\t" 2057c8da7ce66017295a65ec028084b90800be377f8James Zern " subu %[temp5], %[temp9], %[temp10] \n\t" 2067c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp3], %[temp5] \n\t" 2077c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp5], %[temp5] \n\t" 2087c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp3], %[temp3], %[ithresh] \n\t" 2097c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp3], 3f \n\t" 2107c8da7ce66017295a65ec028084b90800be377f8James Zern " subu %[temp3], %[temp14], %[temp13] \n\t" 2117c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp3], %[temp3] \n\t" 2127c8da7ce66017295a65ec028084b90800be377f8James Zern "slt %[temp5], %[hev_thresh], %[temp5] \n\t" 2137c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp3], %[temp3], %[ithresh] \n\t" 2147c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp3], 3f \n\t" 2157c8da7ce66017295a65ec028084b90800be377f8James Zern " subu %[temp3], %[temp13], %[temp12] \n\t" 2167c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp3], %[temp3] \n\t" 2177c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp4], %[temp2], 24 \n\t" 2187c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp3], %[temp3], %[ithresh] \n\t" 2197c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp3], 3f \n\t" 2207c8da7ce66017295a65ec028084b90800be377f8James Zern " subu %[temp15], %[temp12], %[temp7] \n\t" 2217c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp3], %[temp15] \n\t" 2227c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp15], %[temp15] \n\t" 2237c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp3], %[temp3], %[ithresh] \n\t" 2247c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp3], 3f \n\t" 2257c8da7ce66017295a65ec028084b90800be377f8James Zern " slt %[temp15], %[hev_thresh], %[temp15] \n\t" 2267c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp3], %[temp6], %[temp1] \n\t" 2277c8da7ce66017295a65ec028084b90800be377f8James Zern "or %[temp2], %[temp5], %[temp15] \n\t" 2287c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp5], %[temp4], %[temp3] \n\t" 2297c8da7ce66017295a65ec028084b90800be377f8James Zern "beqz %[temp2], 4f \n\t" 2307c8da7ce66017295a65ec028084b90800be377f8James Zern " shra_r.w %[temp1], %[temp5], 3 \n\t" 2317c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[temp2], %[temp5], 3 \n\t" 2327c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp2], %[temp2], 3 \n\t" 2337c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp1], %[temp1], 27 \n\t" 2347c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp2], %[temp2], 27 \n\t" 2357c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp3], %[p], %[hstride] \n\t" 2367c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp1], %[temp1], 27 \n\t" 2377c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp2], %[temp2], 27 \n\t" 2387c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp1], %[temp7], %[temp1] \n\t" 2397c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp2], %[temp10], %[temp2] \n\t" 2407c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t" 2417c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t" 2427c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp2], 0(%[temp3]) \n\t" 2437c8da7ce66017295a65ec028084b90800be377f8James Zern "j 3f \n\t" 2447c8da7ce66017295a65ec028084b90800be377f8James Zern " sb %[temp1], 0(%[p]) \n\t" 2457c8da7ce66017295a65ec028084b90800be377f8James Zern "4: \n\t" 2467c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp5], %[temp5], 24 \n\t" 2477c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp14], %[p], %[hstride] \n\t" 2487c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp11], %[temp14], %[hstride] \n\t" 2497c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp6], %[temp5], 24 \n\t" 2507c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp1], %[temp6], 3 \n\t" 2517c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp15], %[temp11], %[hstride] \n\t" 2527c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp2], %[temp6], %[temp1] \n\t" 2537c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp3], %[temp2], 1 \n\t" 2547c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp4], %[temp3], %[temp2] \n\t" 2557c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[temp2], %[temp2], 63 \n\t" 2567c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[temp3], %[temp3], 63 \n\t" 2577c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[temp4], %[temp4], 63 \n\t" 2587c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp2], %[temp2], 7 \n\t" 2597c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp3], %[temp3], 7 \n\t" 2607c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp4], %[temp4], 7 \n\t" 2617c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp1], %[temp8], %[temp2] \n\t" 2627c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp5], %[temp9], %[temp3] \n\t" 2637c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp6], %[temp10], %[temp4] \n\t" 2647c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp8], %[temp7], %[temp4] \n\t" 2657c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp7], %[temp12], %[temp3] \n\t" 2667c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp10], %[p], %[hstride] \n\t" 2677c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp9], %[temp13], %[temp2] \n\t" 2687c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp12], %[temp10], %[hstride] \n\t" 2697c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t" 2707c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t" 2717c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t" 2727c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t" 2737c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t" 2747c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t" 2757c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp2], 0(%[temp15]) \n\t" 2767c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp3], 0(%[temp11]) \n\t" 2777c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp4], 0(%[temp14]) \n\t" 2787c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp5], 0(%[p]) \n\t" 2797c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp6], 0(%[temp10]) \n\t" 2807c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp8], 0(%[temp12]) \n\t" 2817c8da7ce66017295a65ec028084b90800be377f8James Zern "3: \n\t" 2827c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[size], 1b \n\t" 2837c8da7ce66017295a65ec028084b90800be377f8James Zern " addu %[p], %[p], %[vstride] \n\t" 2847c8da7ce66017295a65ec028084b90800be377f8James Zern ".set pop \n\t" 2857c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3), 2867c8da7ce66017295a65ec028084b90800be377f8James Zern [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), 2877c8da7ce66017295a65ec028084b90800be377f8James Zern [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9), 2887c8da7ce66017295a65ec028084b90800be377f8James Zern [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12), 2897c8da7ce66017295a65ec028084b90800be377f8James Zern [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15), 2907c8da7ce66017295a65ec028084b90800be377f8James Zern [size]"+&r"(size), [p]"+&r"(p) 2917c8da7ce66017295a65ec028084b90800be377f8James Zern : [hstride]"r"(hstride), [thresh2]"r"(thresh2), 2927c8da7ce66017295a65ec028084b90800be377f8James Zern [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh), 2937c8da7ce66017295a65ec028084b90800be377f8James Zern [VP8kclip1]"r"(VP8kclip1) 2947c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 2957c8da7ce66017295a65ec028084b90800be377f8James Zern ); 2967c8da7ce66017295a65ec028084b90800be377f8James Zern} 2977c8da7ce66017295a65ec028084b90800be377f8James Zern 2987c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void FilterLoop24(uint8_t* p, 2997c8da7ce66017295a65ec028084b90800be377f8James Zern int hstride, int vstride, int size, 3007c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 3017c8da7ce66017295a65ec028084b90800be377f8James Zern int p0, q0, p1, q1, p2, q2, p3, q3; 3027c8da7ce66017295a65ec028084b90800be377f8James Zern int step1, step2, temp1, temp2, temp3, temp4; 3037c8da7ce66017295a65ec028084b90800be377f8James Zern uint8_t* pTemp0; 3047c8da7ce66017295a65ec028084b90800be377f8James Zern uint8_t* pTemp1; 3057c8da7ce66017295a65ec028084b90800be377f8James Zern const int thresh2 = 2 * thresh + 1; 3067c8da7ce66017295a65ec028084b90800be377f8James Zern 3077c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 3087c8da7ce66017295a65ec028084b90800be377f8James Zern ".set push \n\t" 3097c8da7ce66017295a65ec028084b90800be377f8James Zern ".set noreorder \n\t" 3107c8da7ce66017295a65ec028084b90800be377f8James Zern "bltz %[size], 3f \n\t" 3117c8da7ce66017295a65ec028084b90800be377f8James Zern " nop \n\t" 3127c8da7ce66017295a65ec028084b90800be377f8James Zern "2: \n\t" 3137c8da7ce66017295a65ec028084b90800be377f8James Zern "negu %[step1], %[hstride] \n\t" 3147c8da7ce66017295a65ec028084b90800be377f8James Zern "lbu %[q0], 0(%[p]) \n\t" 3157c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[p0], %[step1](%[p]) \n\t" 3167c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[step1], %[step1], %[hstride] \n\t" 3177c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[q1], %[hstride](%[p]) \n\t" 3187c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp1], %[p0], %[q0] \n\t" 3197c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[p1], %[step1](%[p]) \n\t" 3207c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[step2], %[hstride], %[hstride] \n\t" 3217c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp2], %[temp1] \n\t" 3227c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp3], %[p1], %[q1] \n\t" 3237c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp4], %[temp3] \n\t" 3247c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp2], %[temp2], 2 \n\t" 3257c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp2], %[temp2], %[temp4] \n\t" 3267c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp4], %[temp2], %[thresh2] \n\t" 3277c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[step1], %[step1], %[hstride] \n\t" 3287c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp4], 0f \n\t" 3297c8da7ce66017295a65ec028084b90800be377f8James Zern " lbux %[p2], %[step1](%[p]) \n\t" 3307c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[step1], %[step1], %[hstride] \n\t" 3317c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[q2], %[step2](%[p]) \n\t" 3327c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[p3], %[step1](%[p]) \n\t" 3337c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp4], %[p2], %[p1] \n\t" 3347c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[step2], %[step2], %[hstride] \n\t" 3357c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp2], %[p3], %[p2] \n\t" 3367c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp4], %[temp4] \n\t" 3377c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp2], %[temp2] \n\t" 3387c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[q3], %[step2](%[p]) \n\t" 3397c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp4], %[temp4], %[ithresh] \n\t" 3407c8da7ce66017295a65ec028084b90800be377f8James Zern "negu %[temp1], %[temp1] \n\t" 3417c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp4], 0f \n\t" 3427c8da7ce66017295a65ec028084b90800be377f8James Zern " subu %[temp2], %[temp2], %[ithresh] \n\t" 3437c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[p3], %[p1], %[p0] \n\t" 3447c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp2], 0f \n\t" 3457c8da7ce66017295a65ec028084b90800be377f8James Zern " absq_s.w %[p3], %[p3] \n\t" 3467c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp4], %[q3], %[q2] \n\t" 3477c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[pTemp0], %[p], %[hstride] \n\t" 3487c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp4], %[temp4] \n\t" 3497c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp2], %[p3], %[ithresh] \n\t" 3507c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[step1], %[temp1], 1 \n\t" 3517c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp2], 0f \n\t" 3527c8da7ce66017295a65ec028084b90800be377f8James Zern " subu %[temp4], %[temp4], %[ithresh] \n\t" 3537c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp2], %[q2], %[q1] \n\t" 3547c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp4], 0f \n\t" 3557c8da7ce66017295a65ec028084b90800be377f8James Zern " absq_s.w %[temp2], %[temp2] \n\t" 3567c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[q3], %[q1], %[q0] \n\t" 3577c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[q3], %[q3] \n\t" 3587c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp2], %[temp2], %[ithresh] \n\t" 3597c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp1], %[temp1], %[step1] \n\t" 3607c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp2], 0f \n\t" 3617c8da7ce66017295a65ec028084b90800be377f8James Zern " subu %[temp4], %[q3], %[ithresh] \n\t" 3627c8da7ce66017295a65ec028084b90800be377f8James Zern "slt %[p3], %[hev_thresh], %[p3] \n\t" 3637c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp4], 0f \n\t" 3647c8da7ce66017295a65ec028084b90800be377f8James Zern " slt %[q3], %[hev_thresh], %[q3] \n\t" 3657c8da7ce66017295a65ec028084b90800be377f8James Zern "or %[q3], %[q3], %[p3] \n\t" 3667c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[q3], 1f \n\t" 3677c8da7ce66017295a65ec028084b90800be377f8James Zern " shra_r.w %[temp2], %[temp1], 3 \n\t" 3687c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[temp1], %[temp1], 3 \n\t" 3697c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp1], %[temp1], 3 \n\t" 3707c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp2], %[temp2], 27 \n\t" 3717c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp1], %[temp1], 27 \n\t" 3727c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[pTemp1], %[p], %[hstride] \n\t" 3737c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp2], %[temp2], 27 \n\t" 3747c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp1], %[temp1], 27 \n\t" 3757c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[step1], %[temp2], 1 \n\t" 3767c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[step1], %[step1], 1 \n\t" 3777c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[p0], %[p0], %[temp1] \n\t" 3787c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[p1], %[p1], %[step1] \n\t" 3797c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[q0], %[q0], %[temp2] \n\t" 3807c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[q1], %[q1], %[step1] \n\t" 3817c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp2], %[p0](%[VP8kclip1]) \n\t" 3827c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp3], %[q0](%[VP8kclip1]) \n\t" 3837c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp4], %[q1](%[VP8kclip1]) \n\t" 3847c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp2], 0(%[pTemp0]) \n\t" 3857c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp1], %[p1](%[VP8kclip1]) \n\t" 3867c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[pTemp0], %[pTemp0], %[hstride] \n\t" 3877c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp3], 0(%[p]) \n\t" 3887c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp4], 0(%[pTemp1]) \n\t" 3897c8da7ce66017295a65ec028084b90800be377f8James Zern "j 0f \n\t" 3907c8da7ce66017295a65ec028084b90800be377f8James Zern " sb %[temp1], 0(%[pTemp0]) \n\t" 3917c8da7ce66017295a65ec028084b90800be377f8James Zern "1: \n\t" 3927c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp3], %[temp3], 24 \n\t" 3937c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp3], %[temp3], 24 \n\t" 3947c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp1], %[temp1], %[temp3] \n\t" 3957c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.w %[temp2], %[temp1], 3 \n\t" 3967c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[temp1], %[temp1], 3 \n\t" 3977c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp2], %[temp2], 27 \n\t" 3987c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp1], %[temp1], 3 \n\t" 3997c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp1], %[temp1], 27 \n\t" 4007c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp2], %[temp2], 27 \n\t" 4017c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp1], %[temp1], 27 \n\t" 4027c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[p0], %[p0], %[temp1] \n\t" 4037c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[q0], %[q0], %[temp2] \n\t" 4047c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp1], %[p0](%[VP8kclip1]) \n\t" 4057c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp2], %[q0](%[VP8kclip1]) \n\t" 4067c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp2], 0(%[p]) \n\t" 4077c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp1], 0(%[pTemp0]) \n\t" 4087c8da7ce66017295a65ec028084b90800be377f8James Zern "0: \n\t" 4097c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[size], %[size], 1 \n\t" 4107c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[size], 2b \n\t" 4117c8da7ce66017295a65ec028084b90800be377f8James Zern " addu %[p], %[p], %[vstride] \n\t" 4127c8da7ce66017295a65ec028084b90800be377f8James Zern "3: \n\t" 4137c8da7ce66017295a65ec028084b90800be377f8James Zern ".set pop \n\t" 4147c8da7ce66017295a65ec028084b90800be377f8James Zern : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1), 4157c8da7ce66017295a65ec028084b90800be377f8James Zern [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3), 4167c8da7ce66017295a65ec028084b90800be377f8James Zern [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1), 4177c8da7ce66017295a65ec028084b90800be377f8James Zern [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), 4187c8da7ce66017295a65ec028084b90800be377f8James Zern [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p), 4197c8da7ce66017295a65ec028084b90800be377f8James Zern [size]"+&r"(size) 4207c8da7ce66017295a65ec028084b90800be377f8James Zern : [vstride]"r"(vstride), [ithresh]"r"(ithresh), 4217c8da7ce66017295a65ec028084b90800be377f8James Zern [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride), 4227c8da7ce66017295a65ec028084b90800be377f8James Zern [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2) 4237c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 4247c8da7ce66017295a65ec028084b90800be377f8James Zern ); 4257c8da7ce66017295a65ec028084b90800be377f8James Zern} 4267c8da7ce66017295a65ec028084b90800be377f8James Zern 4277c8da7ce66017295a65ec028084b90800be377f8James Zern// on macroblock edges 4287c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void VFilter16(uint8_t* p, int stride, 4297c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 4307c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh); 4317c8da7ce66017295a65ec028084b90800be377f8James Zern} 4327c8da7ce66017295a65ec028084b90800be377f8James Zern 4337c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void HFilter16(uint8_t* p, int stride, 4347c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 4357c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh); 4367c8da7ce66017295a65ec028084b90800be377f8James Zern} 4377c8da7ce66017295a65ec028084b90800be377f8James Zern 4387c8da7ce66017295a65ec028084b90800be377f8James Zern// 8-pixels wide variant, for chroma filtering 4397c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void VFilter8(uint8_t* u, uint8_t* v, int stride, 4407c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 4417c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh); 4427c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh); 4437c8da7ce66017295a65ec028084b90800be377f8James Zern} 4447c8da7ce66017295a65ec028084b90800be377f8James Zern 4457c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void HFilter8(uint8_t* u, uint8_t* v, int stride, 4467c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 4477c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh); 4487c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh); 4497c8da7ce66017295a65ec028084b90800be377f8James Zern} 4507c8da7ce66017295a65ec028084b90800be377f8James Zern 4517c8da7ce66017295a65ec028084b90800be377f8James Zern// on three inner edges 4527c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void VFilter16i(uint8_t* p, int stride, 4537c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 4547c8da7ce66017295a65ec028084b90800be377f8James Zern int k; 4557c8da7ce66017295a65ec028084b90800be377f8James Zern for (k = 3; k > 0; --k) { 4567c8da7ce66017295a65ec028084b90800be377f8James Zern p += 4 * stride; 4577c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh); 4587c8da7ce66017295a65ec028084b90800be377f8James Zern } 4597c8da7ce66017295a65ec028084b90800be377f8James Zern} 4607c8da7ce66017295a65ec028084b90800be377f8James Zern 4617c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void HFilter16i(uint8_t* p, int stride, 4627c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 4637c8da7ce66017295a65ec028084b90800be377f8James Zern int k; 4647c8da7ce66017295a65ec028084b90800be377f8James Zern for (k = 3; k > 0; --k) { 4657c8da7ce66017295a65ec028084b90800be377f8James Zern p += 4; 4667c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh); 4677c8da7ce66017295a65ec028084b90800be377f8James Zern } 4687c8da7ce66017295a65ec028084b90800be377f8James Zern} 4697c8da7ce66017295a65ec028084b90800be377f8James Zern 4707c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void VFilter8i(uint8_t* u, uint8_t* v, int stride, 4717c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 4727c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); 4737c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); 4747c8da7ce66017295a65ec028084b90800be377f8James Zern} 4757c8da7ce66017295a65ec028084b90800be377f8James Zern 4767c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void HFilter8i(uint8_t* u, uint8_t* v, int stride, 4777c8da7ce66017295a65ec028084b90800be377f8James Zern int thresh, int ithresh, int hev_thresh) { 4787c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); 4797c8da7ce66017295a65ec028084b90800be377f8James Zern FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); 4807c8da7ce66017295a65ec028084b90800be377f8James Zern} 4817c8da7ce66017295a65ec028084b90800be377f8James Zern 4827c8da7ce66017295a65ec028084b90800be377f8James Zern#undef MUL 4837c8da7ce66017295a65ec028084b90800be377f8James Zern 4847c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 4857c8da7ce66017295a65ec028084b90800be377f8James Zern// Simple In-loop filtering (Paragraph 15.2) 4867c8da7ce66017295a65ec028084b90800be377f8James Zern 4877c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void SimpleVFilter16(uint8_t* p, int stride, int thresh) { 4887c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 4897c8da7ce66017295a65ec028084b90800be377f8James Zern const int thresh2 = 2 * thresh + 1; 4907c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; 4917c8da7ce66017295a65ec028084b90800be377f8James Zern uint8_t* p1 = p - stride; 4927c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 4937c8da7ce66017295a65ec028084b90800be377f8James Zern ".set push \n\t" 4947c8da7ce66017295a65ec028084b90800be377f8James Zern ".set noreorder \n\t" 4957c8da7ce66017295a65ec028084b90800be377f8James Zern "li %[i], 16 \n\t" 4967c8da7ce66017295a65ec028084b90800be377f8James Zern "0: \n\t" 4977c8da7ce66017295a65ec028084b90800be377f8James Zern "negu %[temp4], %[stride] \n\t" 4987c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp5], %[temp4], 1 \n\t" 4997c8da7ce66017295a65ec028084b90800be377f8James Zern "lbu %[temp2], 0(%[p]) \n\t" 5007c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp3], %[stride](%[p]) \n\t" 5017c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp1], %[temp4](%[p]) \n\t" 5027c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp0], %[temp5](%[p]) \n\t" 5037c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp7], %[temp1], %[temp2] \n\t" 5047c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp6], %[temp0], %[temp3] \n\t" 5057c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp4], %[temp7] \n\t" 5067c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp5], %[temp6] \n\t" 5077c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp4], %[temp4], 2 \n\t" 5087c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp5], %[temp5], %[thresh2] \n\t" 5097c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp5], %[temp4], %[temp5] \n\t" 5107c8da7ce66017295a65ec028084b90800be377f8James Zern "negu %[temp8], %[temp7] \n\t" 5117c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp5], 1f \n\t" 5127c8da7ce66017295a65ec028084b90800be377f8James Zern " addiu %[i], %[i], -1 \n\t" 5137c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp4], %[temp8], 1 \n\t" 5147c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp5], %[temp6], 24 \n\t" 5157c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp3], %[temp4], %[temp8] \n\t" 5167c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp5], %[temp5], 24 \n\t" 5177c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp3], %[temp3], %[temp5] \n\t" 5187c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[temp7], %[temp3], 3 \n\t" 5197c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp7], %[temp7], 3 \n\t" 5207c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.w %[temp8], %[temp3], 3 \n\t" 5217c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp0], %[temp7], 27 \n\t" 5227c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp4], %[temp8], 27 \n\t" 5237c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp0], %[temp0], 27 \n\t" 5247c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp4], %[temp4], 27 \n\t" 5257c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp7], %[temp1], %[temp0] \n\t" 5267c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp2], %[temp2], %[temp4] \n\t" 5277c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t" 5287c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t" 5297c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp3], 0(%[p1]) \n\t" 5307c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp4], 0(%[p]) \n\t" 5317c8da7ce66017295a65ec028084b90800be377f8James Zern "1: \n\t" 5327c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[p1], %[p1], 1 \n\t" 5337c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[i], 0b \n\t" 5347c8da7ce66017295a65ec028084b90800be377f8James Zern " addiu %[p], %[p], 1 \n\t" 5357c8da7ce66017295a65ec028084b90800be377f8James Zern " .set pop \n\t" 5367c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 5377c8da7ce66017295a65ec028084b90800be377f8James Zern [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 5387c8da7ce66017295a65ec028084b90800be377f8James Zern [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), 5397c8da7ce66017295a65ec028084b90800be377f8James Zern [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1) 5407c8da7ce66017295a65ec028084b90800be377f8James Zern : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2) 5417c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 5427c8da7ce66017295a65ec028084b90800be377f8James Zern ); 5437c8da7ce66017295a65ec028084b90800be377f8James Zern} 5447c8da7ce66017295a65ec028084b90800be377f8James Zern 5457c8da7ce66017295a65ec028084b90800be377f8James Zern// TEMP0 = SRC[A + A1 * BPS] 5467c8da7ce66017295a65ec028084b90800be377f8James Zern// TEMP1 = SRC[B + B1 * BPS] 5477c8da7ce66017295a65ec028084b90800be377f8James Zern// TEMP2 = SRC[C + C1 * BPS] 5487c8da7ce66017295a65ec028084b90800be377f8James Zern// TEMP3 = SRC[D + D1 * BPS] 5497c8da7ce66017295a65ec028084b90800be377f8James Zern#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \ 5507c8da7ce66017295a65ec028084b90800be377f8James Zern A, A1, B, B1, C, C1, D, D1, SRC) \ 5517c8da7ce66017295a65ec028084b90800be377f8James Zern "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 5527c8da7ce66017295a65ec028084b90800be377f8James Zern "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 5537c8da7ce66017295a65ec028084b90800be377f8James Zern "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 5547c8da7ce66017295a65ec028084b90800be377f8James Zern "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 5557c8da7ce66017295a65ec028084b90800be377f8James Zern 5567c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void SimpleHFilter16(uint8_t* p, int stride, int thresh) { 5577c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 5587c8da7ce66017295a65ec028084b90800be377f8James Zern const int thresh2 = 2 * thresh + 1; 5597c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; 5607c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 5617c8da7ce66017295a65ec028084b90800be377f8James Zern ".set push \n\t" 5627c8da7ce66017295a65ec028084b90800be377f8James Zern ".set noreorder \n\t" 5637c8da7ce66017295a65ec028084b90800be377f8James Zern "li %[i], 16 \n\t" 5647c8da7ce66017295a65ec028084b90800be377f8James Zern "0: \n\t" 5657c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p) 5667c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp7], %[temp1], %[temp2] \n\t" 5677c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp6], %[temp0], %[temp3] \n\t" 5687c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp4], %[temp7] \n\t" 5697c8da7ce66017295a65ec028084b90800be377f8James Zern "absq_s.w %[temp5], %[temp6] \n\t" 5707c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp4], %[temp4], 2 \n\t" 5717c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp5], %[temp4], %[temp5] \n\t" 5727c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp5], %[temp5], %[thresh2] \n\t" 5737c8da7ce66017295a65ec028084b90800be377f8James Zern "negu %[temp8], %[temp7] \n\t" 5747c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[temp5], 1f \n\t" 5757c8da7ce66017295a65ec028084b90800be377f8James Zern " addiu %[i], %[i], -1 \n\t" 5767c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp4], %[temp8], 1 \n\t" 5777c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp5], %[temp6], 24 \n\t" 5787c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp3], %[temp4], %[temp8] \n\t" 5797c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp5], %[temp5], 24 \n\t" 5807c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp3], %[temp3], %[temp5] \n\t" 5817c8da7ce66017295a65ec028084b90800be377f8James Zern "addiu %[temp7], %[temp3], 3 \n\t" 5827c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp7], %[temp7], 3 \n\t" 5837c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.w %[temp8], %[temp3], 3 \n\t" 5847c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp0], %[temp7], 27 \n\t" 5857c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.w %[temp4], %[temp8], 27 \n\t" 5867c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp0], %[temp0], 27 \n\t" 5877c8da7ce66017295a65ec028084b90800be377f8James Zern "sra %[temp4], %[temp4], 27 \n\t" 5887c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp7], %[temp1], %[temp0] \n\t" 5897c8da7ce66017295a65ec028084b90800be377f8James Zern "subu %[temp2], %[temp2], %[temp4] \n\t" 5907c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t" 5917c8da7ce66017295a65ec028084b90800be377f8James Zern "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t" 5927c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp3], -1(%[p]) \n\t" 5937c8da7ce66017295a65ec028084b90800be377f8James Zern "sb %[temp4], 0(%[p]) \n\t" 5947c8da7ce66017295a65ec028084b90800be377f8James Zern "1: \n\t" 5957c8da7ce66017295a65ec028084b90800be377f8James Zern "bgtz %[i], 0b \n\t" 5967c8da7ce66017295a65ec028084b90800be377f8James Zern " addu %[p], %[p], %[stride] \n\t" 5977c8da7ce66017295a65ec028084b90800be377f8James Zern ".set pop \n\t" 5987c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 5997c8da7ce66017295a65ec028084b90800be377f8James Zern [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 6007c8da7ce66017295a65ec028084b90800be377f8James Zern [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), 6017c8da7ce66017295a65ec028084b90800be377f8James Zern [p]"+&r"(p), [i]"=&r"(i) 6027c8da7ce66017295a65ec028084b90800be377f8James Zern : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2) 6037c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 6047c8da7ce66017295a65ec028084b90800be377f8James Zern ); 6057c8da7ce66017295a65ec028084b90800be377f8James Zern} 6067c8da7ce66017295a65ec028084b90800be377f8James Zern 6077c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { 6087c8da7ce66017295a65ec028084b90800be377f8James Zern int k; 6097c8da7ce66017295a65ec028084b90800be377f8James Zern for (k = 3; k > 0; --k) { 6107c8da7ce66017295a65ec028084b90800be377f8James Zern p += 4 * stride; 6117c8da7ce66017295a65ec028084b90800be377f8James Zern SimpleVFilter16(p, stride, thresh); 6127c8da7ce66017295a65ec028084b90800be377f8James Zern } 6137c8da7ce66017295a65ec028084b90800be377f8James Zern} 6147c8da7ce66017295a65ec028084b90800be377f8James Zern 6157c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { 6167c8da7ce66017295a65ec028084b90800be377f8James Zern int k; 6177c8da7ce66017295a65ec028084b90800be377f8James Zern for (k = 3; k > 0; --k) { 6187c8da7ce66017295a65ec028084b90800be377f8James Zern p += 4; 6197c8da7ce66017295a65ec028084b90800be377f8James Zern SimpleHFilter16(p, stride, thresh); 6207c8da7ce66017295a65ec028084b90800be377f8James Zern } 6217c8da7ce66017295a65ec028084b90800be377f8James Zern} 6227c8da7ce66017295a65ec028084b90800be377f8James Zern 6237c8da7ce66017295a65ec028084b90800be377f8James Zern// DST[A * BPS] = TEMP0 6247c8da7ce66017295a65ec028084b90800be377f8James Zern// DST[B + C * BPS] = TEMP1 6257c8da7ce66017295a65ec028084b90800be377f8James Zern#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \ 6267c8da7ce66017295a65ec028084b90800be377f8James Zern "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \ 6277c8da7ce66017295a65ec028084b90800be377f8James Zern "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t" 6287c8da7ce66017295a65ec028084b90800be377f8James Zern 6297c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void VE4(uint8_t* dst) { // vertical 6307c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top = dst - BPS; 6317c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1, temp2, temp3, temp4, temp5, temp6; 6327c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 6337c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp0], -1(%[top]) \n\t" 6347c8da7ce66017295a65ec028084b90800be377f8James Zern "ulh %[temp1], 3(%[top]) \n\t" 6357c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbr %[temp2], %[temp0] \n\t" 6367c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbl %[temp3], %[temp0] \n\t" 6377c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbr %[temp4], %[temp1] \n\t" 6387c8da7ce66017295a65ec028084b90800be377f8James Zern "packrl.ph %[temp5], %[temp3], %[temp2] \n\t" 6397c8da7ce66017295a65ec028084b90800be377f8James Zern "packrl.ph %[temp6], %[temp4], %[temp3] \n\t" 6407c8da7ce66017295a65ec028084b90800be377f8James Zern "shll.ph %[temp5], %[temp5], 1 \n\t" 6417c8da7ce66017295a65ec028084b90800be377f8James Zern "shll.ph %[temp6], %[temp6], 1 \n\t" 6427c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp2], %[temp5], %[temp2] \n\t" 6437c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp6], %[temp6], %[temp4] \n\t" 6447c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp2], %[temp2], %[temp3] \n\t" 6457c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp6], %[temp6], %[temp3] \n\t" 6467c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.ph %[temp2], %[temp2], 2 \n\t" 6477c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.ph %[temp6], %[temp6], 2 \n\t" 6487c8da7ce66017295a65ec028084b90800be377f8James Zern "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t" 6497c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst) 6507c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst) 6517c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 6527c8da7ce66017295a65ec028084b90800be377f8James Zern [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 6537c8da7ce66017295a65ec028084b90800be377f8James Zern [temp6]"=&r"(temp6) 6547c8da7ce66017295a65ec028084b90800be377f8James Zern : [top]"r"(top), [dst]"r"(dst) 6557c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 6567c8da7ce66017295a65ec028084b90800be377f8James Zern ); 6577c8da7ce66017295a65ec028084b90800be377f8James Zern} 6587c8da7ce66017295a65ec028084b90800be377f8James Zern 6597c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void DC4(uint8_t* dst) { // DC 6607c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1, temp2, temp3, temp4; 6617c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 6627c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t" 6637c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst) 6647c8da7ce66017295a65ec028084b90800be377f8James Zern "ins %[temp1], %[temp2], 8, 8 \n\t" 6657c8da7ce66017295a65ec028084b90800be377f8James Zern "ins %[temp1], %[temp3], 16, 8 \n\t" 6667c8da7ce66017295a65ec028084b90800be377f8James Zern "ins %[temp1], %[temp4], 24, 8 \n\t" 6677c8da7ce66017295a65ec028084b90800be377f8James Zern "raddu.w.qb %[temp0], %[temp0] \n\t" 6687c8da7ce66017295a65ec028084b90800be377f8James Zern "raddu.w.qb %[temp1], %[temp1] \n\t" 6697c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp0], %[temp0], %[temp1] \n\t" 6707c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.w %[temp0], %[temp0], 3 \n\t" 6717c8da7ce66017295a65ec028084b90800be377f8James Zern "replv.qb %[temp0], %[temp0] \n\t" 6727c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst) 6737c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst) 6747c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 6757c8da7ce66017295a65ec028084b90800be377f8James Zern [temp3]"=&r"(temp3), [temp4]"=&r"(temp4) 6767c8da7ce66017295a65ec028084b90800be377f8James Zern : [dst]"r"(dst) 6777c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 6787c8da7ce66017295a65ec028084b90800be377f8James Zern ); 6797c8da7ce66017295a65ec028084b90800be377f8James Zern} 6807c8da7ce66017295a65ec028084b90800be377f8James Zern 6817c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void RD4(uint8_t* dst) { // Down-right 6827c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1, temp2, temp3, temp4; 6837c8da7ce66017295a65ec028084b90800be377f8James Zern int temp5, temp6, temp7, temp8; 6847c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 6857c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst) 6867c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t" 6877c8da7ce66017295a65ec028084b90800be377f8James Zern "ins %[temp1], %[temp0], 16, 16 \n\t" 6887c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbr %[temp5], %[temp7] \n\t" 6897c8da7ce66017295a65ec028084b90800be377f8James Zern "ins %[temp2], %[temp1], 16, 16 \n\t" 6907c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbl %[temp4], %[temp7] \n\t" 6917c8da7ce66017295a65ec028084b90800be377f8James Zern "ins %[temp3], %[temp2], 16, 16 \n\t" 6927c8da7ce66017295a65ec028084b90800be377f8James Zern "shll.ph %[temp2], %[temp2], 1 \n\t" 6937c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp3], %[temp3], %[temp1] \n\t" 6947c8da7ce66017295a65ec028084b90800be377f8James Zern "packrl.ph %[temp6], %[temp5], %[temp1] \n\t" 6957c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp3], %[temp3], %[temp2] \n\t" 6967c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp1], %[temp1], %[temp5] \n\t" 6977c8da7ce66017295a65ec028084b90800be377f8James Zern "shll.ph %[temp6], %[temp6], 1 \n\t" 6987c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp1], %[temp1], %[temp6] \n\t" 6997c8da7ce66017295a65ec028084b90800be377f8James Zern "packrl.ph %[temp0], %[temp4], %[temp5] \n\t" 7007c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp8], %[temp5], %[temp4] \n\t" 7017c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.ph %[temp3], %[temp3], 2 \n\t" 7027c8da7ce66017295a65ec028084b90800be377f8James Zern "shll.ph %[temp0], %[temp0], 1 \n\t" 7037c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.ph %[temp1], %[temp1], 2 \n\t" 7047c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp8], %[temp0], %[temp8] \n\t" 7057c8da7ce66017295a65ec028084b90800be377f8James Zern "lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t" 7067c8da7ce66017295a65ec028084b90800be377f8James Zern "precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t" 7077c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.ph %[temp8], %[temp8], 2 \n\t" 7087c8da7ce66017295a65ec028084b90800be377f8James Zern "ins %[temp7], %[temp5], 0, 8 \n\t" 7097c8da7ce66017295a65ec028084b90800be377f8James Zern "precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t" 7107c8da7ce66017295a65ec028084b90800be377f8James Zern "raddu.w.qb %[temp4], %[temp7] \n\t" 7117c8da7ce66017295a65ec028084b90800be377f8James Zern "precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t" 7127c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.w %[temp4], %[temp4], 2 \n\t" 7137c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst) 7147c8da7ce66017295a65ec028084b90800be377f8James Zern "prepend %[temp2], %[temp8], 8 \n\t" 7157c8da7ce66017295a65ec028084b90800be377f8James Zern "prepend %[temp6], %[temp4], 8 \n\t" 7167c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst) 7177c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 7187c8da7ce66017295a65ec028084b90800be377f8James Zern [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 7197c8da7ce66017295a65ec028084b90800be377f8James Zern [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8) 7207c8da7ce66017295a65ec028084b90800be377f8James Zern : [dst]"r"(dst) 7217c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 7227c8da7ce66017295a65ec028084b90800be377f8James Zern ); 7237c8da7ce66017295a65ec028084b90800be377f8James Zern} 7247c8da7ce66017295a65ec028084b90800be377f8James Zern 7257c8da7ce66017295a65ec028084b90800be377f8James Zern// TEMP0 = SRC[A * BPS] 7267c8da7ce66017295a65ec028084b90800be377f8James Zern// TEMP1 = SRC[B + C * BPS] 7277c8da7ce66017295a65ec028084b90800be377f8James Zern#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \ 7287c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 7297c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t" 7307c8da7ce66017295a65ec028084b90800be377f8James Zern 7317c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void LD4(uint8_t* dst) { // Down-Left 7327c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1, temp2, temp3, temp4; 7337c8da7ce66017295a65ec028084b90800be377f8James Zern int temp5, temp6, temp7, temp8, temp9; 7347c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 7357c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst) 7367c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbl %[temp2], %[temp0] \n\t" 7377c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbr %[temp3], %[temp0] \n\t" 7387c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbr %[temp4], %[temp1] \n\t" 7397c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbl %[temp5], %[temp1] \n\t" 7407c8da7ce66017295a65ec028084b90800be377f8James Zern "packrl.ph %[temp6], %[temp2], %[temp3] \n\t" 7417c8da7ce66017295a65ec028084b90800be377f8James Zern "packrl.ph %[temp7], %[temp4], %[temp2] \n\t" 7427c8da7ce66017295a65ec028084b90800be377f8James Zern "packrl.ph %[temp8], %[temp5], %[temp4] \n\t" 7437c8da7ce66017295a65ec028084b90800be377f8James Zern "shll.ph %[temp6], %[temp6], 1 \n\t" 7447c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp9], %[temp2], %[temp6] \n\t" 7457c8da7ce66017295a65ec028084b90800be377f8James Zern "shll.ph %[temp7], %[temp7], 1 \n\t" 7467c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp9], %[temp9], %[temp3] \n\t" 7477c8da7ce66017295a65ec028084b90800be377f8James Zern "shll.ph %[temp8], %[temp8], 1 \n\t" 7487c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.ph %[temp9], %[temp9], 2 \n\t" 7497c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp3], %[temp4], %[temp7] \n\t" 7507c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp0], %[temp5], %[temp8] \n\t" 7517c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp3], %[temp3], %[temp2] \n\t" 7527c8da7ce66017295a65ec028084b90800be377f8James Zern "addq.ph %[temp0], %[temp0], %[temp4] \n\t" 7537c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.ph %[temp3], %[temp3], 2 \n\t" 7547c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.ph %[temp0], %[temp0], 2 \n\t" 7557c8da7ce66017295a65ec028084b90800be377f8James Zern "srl %[temp1], %[temp1], 24 \n\t" 7567c8da7ce66017295a65ec028084b90800be377f8James Zern "sll %[temp1], %[temp1], 1 \n\t" 7577c8da7ce66017295a65ec028084b90800be377f8James Zern "raddu.w.qb %[temp5], %[temp5] \n\t" 7587c8da7ce66017295a65ec028084b90800be377f8James Zern "precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t" 7597c8da7ce66017295a65ec028084b90800be377f8James Zern "precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t" 7607c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp1], %[temp1], %[temp5] \n\t" 7617c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.w %[temp1], %[temp1], 2 \n\t" 7627c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst) 7637c8da7ce66017295a65ec028084b90800be377f8James Zern "prepend %[temp9], %[temp0], 8 \n\t" 7647c8da7ce66017295a65ec028084b90800be377f8James Zern "prepend %[temp3], %[temp1], 8 \n\t" 7657c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst) 7667c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 7677c8da7ce66017295a65ec028084b90800be377f8James Zern [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 7687c8da7ce66017295a65ec028084b90800be377f8James Zern [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), 7697c8da7ce66017295a65ec028084b90800be377f8James Zern [temp9]"=&r"(temp9) 7707c8da7ce66017295a65ec028084b90800be377f8James Zern : [dst]"r"(dst) 7717c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 7727c8da7ce66017295a65ec028084b90800be377f8James Zern ); 7737c8da7ce66017295a65ec028084b90800be377f8James Zern} 7747c8da7ce66017295a65ec028084b90800be377f8James Zern 7757c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 7767c8da7ce66017295a65ec028084b90800be377f8James Zern// Chroma 7777c8da7ce66017295a65ec028084b90800be377f8James Zern 7787c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void DC8uv(uint8_t* dst) { // DC 7797c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1, temp2, temp3, temp4; 7807c8da7ce66017295a65ec028084b90800be377f8James Zern int temp5, temp6, temp7, temp8, temp9; 7817c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 7827c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst) 7837c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst) 7847c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst) 7857c8da7ce66017295a65ec028084b90800be377f8James Zern "raddu.w.qb %[temp0], %[temp0] \n\t" 7867c8da7ce66017295a65ec028084b90800be377f8James Zern "raddu.w.qb %[temp1], %[temp1] \n\t" 7877c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp2], %[temp2], %[temp3] \n\t" 7887c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp4], %[temp4], %[temp5] \n\t" 7897c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp6], %[temp6], %[temp7] \n\t" 7907c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp8], %[temp8], %[temp9] \n\t" 7917c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp0], %[temp0], %[temp1] \n\t" 7927c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp2], %[temp2], %[temp4] \n\t" 7937c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp6], %[temp6], %[temp8] \n\t" 7947c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp0], %[temp0], %[temp2] \n\t" 7957c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp0], %[temp0], %[temp6] \n\t" 7967c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.w %[temp0], %[temp0], 4 \n\t" 7977c8da7ce66017295a65ec028084b90800be377f8James Zern "replv.qb %[temp0], %[temp0] \n\t" 7987c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst) 7997c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst) 8007c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst) 8017c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst) 8027c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst) 8037c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst) 8047c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst) 8057c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst) 8067c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 8077c8da7ce66017295a65ec028084b90800be377f8James Zern [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 8087c8da7ce66017295a65ec028084b90800be377f8James Zern [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), 8097c8da7ce66017295a65ec028084b90800be377f8James Zern [temp9]"=&r"(temp9) 8107c8da7ce66017295a65ec028084b90800be377f8James Zern : [dst]"r"(dst) 8117c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 8127c8da7ce66017295a65ec028084b90800be377f8James Zern ); 8137c8da7ce66017295a65ec028084b90800be377f8James Zern} 8147c8da7ce66017295a65ec028084b90800be377f8James Zern 8157c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples 8167c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1; 8177c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 8187c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst) 8197c8da7ce66017295a65ec028084b90800be377f8James Zern "raddu.w.qb %[temp0], %[temp0] \n\t" 8207c8da7ce66017295a65ec028084b90800be377f8James Zern "raddu.w.qb %[temp1], %[temp1] \n\t" 8217c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp0], %[temp0], %[temp1] \n\t" 8227c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.w %[temp0], %[temp0], 3 \n\t" 8237c8da7ce66017295a65ec028084b90800be377f8James Zern "replv.qb %[temp0], %[temp0] \n\t" 8247c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst) 8257c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst) 8267c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst) 8277c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst) 8287c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst) 8297c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst) 8307c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst) 8317c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst) 8327c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1) 8337c8da7ce66017295a65ec028084b90800be377f8James Zern : [dst]"r"(dst) 8347c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 8357c8da7ce66017295a65ec028084b90800be377f8James Zern ); 8367c8da7ce66017295a65ec028084b90800be377f8James Zern} 8377c8da7ce66017295a65ec028084b90800be377f8James Zern 8387c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void DC8uvNoTop(uint8_t* dst) { // DC with no top samples 8397c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1, temp2, temp3, temp4; 8407c8da7ce66017295a65ec028084b90800be377f8James Zern int temp5, temp6, temp7, temp8; 8417c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( 8427c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst) 8437c8da7ce66017295a65ec028084b90800be377f8James Zern LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst) 8447c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp2], %[temp2], %[temp3] \n\t" 8457c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp4], %[temp4], %[temp5] \n\t" 8467c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp6], %[temp6], %[temp7] \n\t" 8477c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp8], %[temp8], %[temp1] \n\t" 8487c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp2], %[temp2], %[temp4] \n\t" 8497c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp6], %[temp6], %[temp8] \n\t" 8507c8da7ce66017295a65ec028084b90800be377f8James Zern "addu %[temp0], %[temp6], %[temp2] \n\t" 8517c8da7ce66017295a65ec028084b90800be377f8James Zern "shra_r.w %[temp0], %[temp0], 3 \n\t" 8527c8da7ce66017295a65ec028084b90800be377f8James Zern "replv.qb %[temp0], %[temp0] \n\t" 8537c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst) 8547c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst) 8557c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst) 8567c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst) 8577c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst) 8587c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst) 8597c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst) 8607c8da7ce66017295a65ec028084b90800be377f8James Zern STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst) 8617c8da7ce66017295a65ec028084b90800be377f8James Zern : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 8627c8da7ce66017295a65ec028084b90800be377f8James Zern [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 8637c8da7ce66017295a65ec028084b90800be377f8James Zern [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8) 8647c8da7ce66017295a65ec028084b90800be377f8James Zern : [dst]"r"(dst) 8657c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" 8667c8da7ce66017295a65ec028084b90800be377f8James Zern ); 8677c8da7ce66017295a65ec028084b90800be377f8James Zern} 8687c8da7ce66017295a65ec028084b90800be377f8James Zern 8697c8da7ce66017295a65ec028084b90800be377f8James Zern#undef LOAD_8_BYTES 8707c8da7ce66017295a65ec028084b90800be377f8James Zern#undef STORE_8_BYTES 8717c8da7ce66017295a65ec028084b90800be377f8James Zern#undef LOAD_4_BYTES 8727c8da7ce66017295a65ec028084b90800be377f8James Zern 8737c8da7ce66017295a65ec028084b90800be377f8James Zern#define CLIPPING(SIZE) \ 8747c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbl %[temp2], %[temp0] \n\t" \ 8757c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbr %[temp0], %[temp0] \n\t" \ 8767c8da7ce66017295a65ec028084b90800be377f8James Zern".if " #SIZE " == 8 \n\t" \ 8777c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbl %[temp3], %[temp1] \n\t" \ 8787c8da7ce66017295a65ec028084b90800be377f8James Zern "preceu.ph.qbr %[temp1], %[temp1] \n\t" \ 8797c8da7ce66017295a65ec028084b90800be377f8James Zern".endif \n\t" \ 8807c8da7ce66017295a65ec028084b90800be377f8James Zern "addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \ 8817c8da7ce66017295a65ec028084b90800be377f8James Zern "addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \ 8827c8da7ce66017295a65ec028084b90800be377f8James Zern".if " #SIZE " == 8 \n\t" \ 8837c8da7ce66017295a65ec028084b90800be377f8James Zern "addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \ 8847c8da7ce66017295a65ec028084b90800be377f8James Zern "addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \ 8857c8da7ce66017295a65ec028084b90800be377f8James Zern".endif \n\t" \ 8867c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.ph %[temp2], %[temp2], 7 \n\t" \ 8877c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.ph %[temp0], %[temp0], 7 \n\t" \ 8887c8da7ce66017295a65ec028084b90800be377f8James Zern".if " #SIZE " == 8 \n\t" \ 8897c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.ph %[temp3], %[temp3], 7 \n\t" \ 8907c8da7ce66017295a65ec028084b90800be377f8James Zern "shll_s.ph %[temp1], %[temp1], 7 \n\t" \ 8917c8da7ce66017295a65ec028084b90800be377f8James Zern".endif \n\t" \ 8927c8da7ce66017295a65ec028084b90800be377f8James Zern "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \ 8937c8da7ce66017295a65ec028084b90800be377f8James Zern".if " #SIZE " == 8 \n\t" \ 8947c8da7ce66017295a65ec028084b90800be377f8James Zern "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \ 8957c8da7ce66017295a65ec028084b90800be377f8James Zern".endif \n\t" 8967c8da7ce66017295a65ec028084b90800be377f8James Zern 8977c8da7ce66017295a65ec028084b90800be377f8James Zern 8987c8da7ce66017295a65ec028084b90800be377f8James Zern#define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \ 8997c8da7ce66017295a65ec028084b90800be377f8James Zern int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \ 9007c8da7ce66017295a65ec028084b90800be377f8James Zern int temp0, temp1, temp2, temp3; \ 9017c8da7ce66017295a65ec028084b90800be377f8James Zern __asm__ volatile ( \ 9027c8da7ce66017295a65ec028084b90800be377f8James Zern ".if " #SIZE " < 8 \n\t" \ 9037c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp0], 0(%[top]) \n\t" \ 9047c8da7ce66017295a65ec028084b90800be377f8James Zern "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \ 9057c8da7ce66017295a65ec028084b90800be377f8James Zern CLIPPING(4) \ 9067c8da7ce66017295a65ec028084b90800be377f8James Zern "usw %[temp0], 0(%[dst]) \n\t" \ 9077c8da7ce66017295a65ec028084b90800be377f8James Zern ".else \n\t" \ 9087c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp0], 0(%[top]) \n\t" \ 9097c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp1], 4(%[top]) \n\t" \ 9107c8da7ce66017295a65ec028084b90800be377f8James Zern "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \ 9117c8da7ce66017295a65ec028084b90800be377f8James Zern CLIPPING(8) \ 9127c8da7ce66017295a65ec028084b90800be377f8James Zern "usw %[temp0], 0(%[dst]) \n\t" \ 9137c8da7ce66017295a65ec028084b90800be377f8James Zern "usw %[temp1], 4(%[dst]) \n\t" \ 9147c8da7ce66017295a65ec028084b90800be377f8James Zern ".if " #SIZE " == 16 \n\t" \ 9157c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp0], 8(%[top]) \n\t" \ 9167c8da7ce66017295a65ec028084b90800be377f8James Zern "ulw %[temp1], 12(%[top]) \n\t" \ 9177c8da7ce66017295a65ec028084b90800be377f8James Zern CLIPPING(8) \ 9187c8da7ce66017295a65ec028084b90800be377f8James Zern "usw %[temp0], 8(%[dst]) \n\t" \ 9197c8da7ce66017295a65ec028084b90800be377f8James Zern "usw %[temp1], 12(%[dst]) \n\t" \ 9207c8da7ce66017295a65ec028084b90800be377f8James Zern ".endif \n\t" \ 9217c8da7ce66017295a65ec028084b90800be377f8James Zern ".endif \n\t" \ 9227c8da7ce66017295a65ec028084b90800be377f8James Zern : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \ 9237c8da7ce66017295a65ec028084b90800be377f8James Zern [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \ 9247c8da7ce66017295a65ec028084b90800be377f8James Zern : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \ 9257c8da7ce66017295a65ec028084b90800be377f8James Zern : "memory" \ 9267c8da7ce66017295a65ec028084b90800be377f8James Zern ); \ 9277c8da7ce66017295a65ec028084b90800be377f8James Zern} while (0) 9287c8da7ce66017295a65ec028084b90800be377f8James Zern 9297c8da7ce66017295a65ec028084b90800be377f8James Zern#define CLIP_TO_DST(DST, SIZE) do { \ 9307c8da7ce66017295a65ec028084b90800be377f8James Zern int y; \ 9317c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top = (DST) - BPS; \ 9327c8da7ce66017295a65ec028084b90800be377f8James Zern const int top_1 = ((int)top[-1] << 16) + top[-1]; \ 9337c8da7ce66017295a65ec028084b90800be377f8James Zern for (y = 0; y < (SIZE); ++y) { \ 9347c8da7ce66017295a65ec028084b90800be377f8James Zern CLIP_8B_TO_DST((DST), top, (SIZE)); \ 9357c8da7ce66017295a65ec028084b90800be377f8James Zern (DST) += BPS; \ 9367c8da7ce66017295a65ec028084b90800be377f8James Zern } \ 9377c8da7ce66017295a65ec028084b90800be377f8James Zern} while (0) 9387c8da7ce66017295a65ec028084b90800be377f8James Zern 9397c8da7ce66017295a65ec028084b90800be377f8James Zern#define TRUE_MOTION(DST, SIZE) \ 9407c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void TrueMotion##SIZE(uint8_t* (DST)) { \ 9417c8da7ce66017295a65ec028084b90800be377f8James Zern CLIP_TO_DST((DST), (SIZE)); \ 9427c8da7ce66017295a65ec028084b90800be377f8James Zern} 9437c8da7ce66017295a65ec028084b90800be377f8James Zern 9447c8da7ce66017295a65ec028084b90800be377f8James ZernTRUE_MOTION(dst, 4) 9457c8da7ce66017295a65ec028084b90800be377f8James ZernTRUE_MOTION(dst, 8) 9467c8da7ce66017295a65ec028084b90800be377f8James ZernTRUE_MOTION(dst, 16) 9477c8da7ce66017295a65ec028084b90800be377f8James Zern 9487c8da7ce66017295a65ec028084b90800be377f8James Zern#undef TRUE_MOTION 9497c8da7ce66017295a65ec028084b90800be377f8James Zern#undef CLIP_TO_DST 9507c8da7ce66017295a65ec028084b90800be377f8James Zern#undef CLIP_8B_TO_DST 9517c8da7ce66017295a65ec028084b90800be377f8James Zern#undef CLIPPING 9527c8da7ce66017295a65ec028084b90800be377f8James Zern 9537c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 9547c8da7ce66017295a65ec028084b90800be377f8James Zern// Entry point 9557c8da7ce66017295a65ec028084b90800be377f8James Zern 9567c8da7ce66017295a65ec028084b90800be377f8James Zernextern void VP8DspInitMIPSdspR2(void); 9577c8da7ce66017295a65ec028084b90800be377f8James Zern 9587c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) { 9597c8da7ce66017295a65ec028084b90800be377f8James Zern VP8TransformDC = TransformDC; 9607c8da7ce66017295a65ec028084b90800be377f8James Zern VP8TransformAC3 = TransformAC3; 9617c8da7ce66017295a65ec028084b90800be377f8James Zern VP8Transform = TransformTwo; 9627c8da7ce66017295a65ec028084b90800be377f8James Zern 9637c8da7ce66017295a65ec028084b90800be377f8James Zern VP8VFilter16 = VFilter16; 9647c8da7ce66017295a65ec028084b90800be377f8James Zern VP8HFilter16 = HFilter16; 9657c8da7ce66017295a65ec028084b90800be377f8James Zern VP8VFilter8 = VFilter8; 9667c8da7ce66017295a65ec028084b90800be377f8James Zern VP8HFilter8 = HFilter8; 9677c8da7ce66017295a65ec028084b90800be377f8James Zern VP8VFilter16i = VFilter16i; 9687c8da7ce66017295a65ec028084b90800be377f8James Zern VP8HFilter16i = HFilter16i; 9697c8da7ce66017295a65ec028084b90800be377f8James Zern VP8VFilter8i = VFilter8i; 9707c8da7ce66017295a65ec028084b90800be377f8James Zern VP8HFilter8i = HFilter8i; 9717c8da7ce66017295a65ec028084b90800be377f8James Zern VP8SimpleVFilter16 = SimpleVFilter16; 9727c8da7ce66017295a65ec028084b90800be377f8James Zern VP8SimpleHFilter16 = SimpleHFilter16; 9737c8da7ce66017295a65ec028084b90800be377f8James Zern VP8SimpleVFilter16i = SimpleVFilter16i; 9747c8da7ce66017295a65ec028084b90800be377f8James Zern VP8SimpleHFilter16i = SimpleHFilter16i; 9757c8da7ce66017295a65ec028084b90800be377f8James Zern 9767c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredLuma4[0] = DC4; 9777c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredLuma4[1] = TrueMotion4; 9787c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredLuma4[2] = VE4; 9797c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredLuma4[4] = RD4; 9807c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredLuma4[6] = LD4; 9817c8da7ce66017295a65ec028084b90800be377f8James Zern 9827c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredChroma8[0] = DC8uv; 9837c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredChroma8[1] = TrueMotion8; 9847c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredChroma8[4] = DC8uvNoTop; 9857c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredChroma8[5] = DC8uvNoLeft; 9867c8da7ce66017295a65ec028084b90800be377f8James Zern 9877c8da7ce66017295a65ec028084b90800be377f8James Zern VP8PredLuma16[1] = TrueMotion16; 9887c8da7ce66017295a65ec028084b90800be377f8James Zern} 9897c8da7ce66017295a65ec028084b90800be377f8James Zern 9907c8da7ce66017295a65ec028084b90800be377f8James Zern#else // !WEBP_USE_MIPS_DSP_R2 9917c8da7ce66017295a65ec028084b90800be377f8James Zern 9927c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2) 9937c8da7ce66017295a65ec028084b90800be377f8James Zern 9947c8da7ce66017295a65ec028084b90800be377f8James Zern#endif // WEBP_USE_MIPS_DSP_R2 995