12f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan/* 21b362b15af34006e6a11974088a46d42b903418eJohann * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 32f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan * 42f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan * Use of this source code is governed by a BSD-style license 52f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan * that can be found in the LICENSE file in the root of the source 62f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan * tree. An additional intellectual property rights grant can be found 72f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan * in the file PATENTS. All contributing project authors may 82f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan * be found in the AUTHORS file in the root of the source tree. 92f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan */ 102f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang#include "vp8_rtcd.h" 122f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 131b362b15af34006e6a11974088a46d42b903418eJohann#if HAVE_DSPR2 141b362b15af34006e6a11974088a46d42b903418eJohann#define CROP_WIDTH 256 151b362b15af34006e6a11974088a46d42b903418eJohann 161b362b15af34006e6a11974088a46d42b903418eJohann/****************************************************************************** 171b362b15af34006e6a11974088a46d42b903418eJohann * Notes: 181b362b15af34006e6a11974088a46d42b903418eJohann * 191b362b15af34006e6a11974088a46d42b903418eJohann * This implementation makes use of 16 bit fixed point version of two multiply 201b362b15af34006e6a11974088a46d42b903418eJohann * constants: 211b362b15af34006e6a11974088a46d42b903418eJohann * 1. sqrt(2) * cos (pi/8) 221b362b15af34006e6a11974088a46d42b903418eJohann * 2. sqrt(2) * sin (pi/8) 231b362b15af34006e6a11974088a46d42b903418eJohann * Since the first constant is bigger than 1, to maintain the same 16 bit 241b362b15af34006e6a11974088a46d42b903418eJohann * fixed point precision as the second one, we use a trick of 251b362b15af34006e6a11974088a46d42b903418eJohann * x * a = x + x*(a-1) 261b362b15af34006e6a11974088a46d42b903418eJohann * so 271b362b15af34006e6a11974088a46d42b903418eJohann * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). 281b362b15af34006e6a11974088a46d42b903418eJohann ****************************************************************************/ 291b362b15af34006e6a11974088a46d42b903418eJohannextern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH]; 302f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjanstatic const int cospi8sqrt2minus1 = 20091; 312f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjanstatic const int sinpi8sqrt2 = 35468; 322f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 331b362b15af34006e6a11974088a46d42b903418eJohanninline void prefetch_load_short(short *src) 341b362b15af34006e6a11974088a46d42b903418eJohann{ 352f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan __asm__ __volatile__ ( 362f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "pref 0, 0(%[src]) \n\t" 372f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : 382f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : [src] "r" (src) 392f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ); 402f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan} 412f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 421b362b15af34006e6a11974088a46d42b903418eJohannvoid vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr, 431b362b15af34006e6a11974088a46d42b903418eJohann int pred_stride, unsigned char *dst_ptr, 441b362b15af34006e6a11974088a46d42b903418eJohann int dst_stride) 452f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan{ 461b362b15af34006e6a11974088a46d42b903418eJohann int r, c; 471b362b15af34006e6a11974088a46d42b903418eJohann int a1, b1, c1, d1; 481b362b15af34006e6a11974088a46d42b903418eJohann short output[16]; 492f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan short *ip = input; 502f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan short *op = output; 511b362b15af34006e6a11974088a46d42b903418eJohann int temp1, temp2; 521b362b15af34006e6a11974088a46d42b903418eJohann int shortpitch = 4; 531b362b15af34006e6a11974088a46d42b903418eJohann 541b362b15af34006e6a11974088a46d42b903418eJohann int c2, d2; 551b362b15af34006e6a11974088a46d42b903418eJohann int temp3, temp4; 561b362b15af34006e6a11974088a46d42b903418eJohann unsigned char *cm = ff_cropTbl + CROP_WIDTH; 572f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 582f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan /* prepare data for load */ 592f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan prefetch_load_short(ip + 8); 602f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 612f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan /* first loop is unrolled */ 622f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[0] + ip[8]; 632f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[0] - ip[8]; 642f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 652f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp1 = (ip[4] * sinpi8sqrt2) >> 16; 662f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); 672f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c1 = temp1 - temp2; 682f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 692f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); 702f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp2 = (ip[12] * sinpi8sqrt2) >> 16; 712f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d1 = temp1 + temp2; 722f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 732f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp3 = (ip[5] * sinpi8sqrt2) >> 16; 742f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16); 752f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c2 = temp3 - temp4; 762f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 772f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16); 782f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp4 = (ip[13] * sinpi8sqrt2) >> 16; 792f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d2 = temp3 + temp4; 802f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 812f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[0] = a1 + d1; 822f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[12] = a1 - d1; 832f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[4] = b1 + c1; 842f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[8] = b1 - c1; 852f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 862f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[1] + ip[9]; 872f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[1] - ip[9]; 882f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 892f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[1] = a1 + d2; 902f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[13] = a1 - d2; 912f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[5] = b1 + c2; 922f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[9] = b1 - c2; 932f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 942f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[2] + ip[10]; 952f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[2] - ip[10]; 962f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 972f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp1 = (ip[6] * sinpi8sqrt2) >> 16; 982f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16); 992f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c1 = temp1 - temp2; 1002f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1012f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16); 1022f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp2 = (ip[14] * sinpi8sqrt2) >> 16; 1032f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d1 = temp1 + temp2; 1042f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1052f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp3 = (ip[7] * sinpi8sqrt2) >> 16; 1062f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16); 1072f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c2 = temp3 - temp4; 1082f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1092f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16); 1102f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp4 = (ip[15] * sinpi8sqrt2) >> 16; 1112f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d2 = temp3 + temp4; 1122f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1132f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[2] = a1 + d1; 1142f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[14] = a1 - d1; 1152f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[6] = b1 + c1; 1162f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[10] = b1 - c1; 1172f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1182f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[3] + ip[11]; 1192f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[3] - ip[11]; 1202f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1212f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[3] = a1 + d2; 1222f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[15] = a1 - d2; 1232f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[7] = b1 + c2; 1242f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[11] = b1 - c2; 1252f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1262f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ip = output; 1272f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1282f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan /* prepare data for load */ 1291b362b15af34006e6a11974088a46d42b903418eJohann prefetch_load_short(ip + shortpitch); 1302f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1312f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan /* second loop is unrolled */ 1322f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[0] + ip[2]; 1332f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[0] - ip[2]; 1342f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1352f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp1 = (ip[1] * sinpi8sqrt2) >> 16; 1362f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); 1372f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c1 = temp1 - temp2; 1382f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1392f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); 1402f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp2 = (ip[3] * sinpi8sqrt2) >> 16; 1412f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d1 = temp1 + temp2; 1422f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1432f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp3 = (ip[5] * sinpi8sqrt2) >> 16; 1442f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16); 1452f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c2 = temp3 - temp4; 1462f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1472f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16); 1482f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp4 = (ip[7] * sinpi8sqrt2) >> 16; 1492f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d2 = temp3 + temp4; 1502f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1512f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[0] = (a1 + d1 + 4) >> 3; 1522f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[3] = (a1 - d1 + 4) >> 3; 1532f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[1] = (b1 + c1 + 4) >> 3; 1542f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[2] = (b1 - c1 + 4) >> 3; 1552f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1562f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[4] + ip[6]; 1572f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[4] - ip[6]; 1582f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1592f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[4] = (a1 + d2 + 4) >> 3; 1602f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[7] = (a1 - d2 + 4) >> 3; 1612f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[5] = (b1 + c2 + 4) >> 3; 1622f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[6] = (b1 - c2 + 4) >> 3; 1632f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1642f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[8] + ip[10]; 1652f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[8] - ip[10]; 1662f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1672f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp1 = (ip[9] * sinpi8sqrt2) >> 16; 1682f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16); 1692f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c1 = temp1 - temp2; 1702f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1712f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16); 1722f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp2 = (ip[11] * sinpi8sqrt2) >> 16; 1732f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d1 = temp1 + temp2; 1742f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1752f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp3 = (ip[13] * sinpi8sqrt2) >> 16; 1762f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16); 1772f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c2 = temp3 - temp4; 1782f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1792f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16); 1802f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan temp4 = (ip[15] * sinpi8sqrt2) >> 16; 1812f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d2 = temp3 + temp4; 1822f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1832f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[8] = (a1 + d1 + 4) >> 3; 1842f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[11] = (a1 - d1 + 4) >> 3; 1852f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[9] = (b1 + c1 + 4) >> 3; 1862f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[10] = (b1 - c1 + 4) >> 3; 1872f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1882f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[12] + ip[14]; 1892f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[12] - ip[14]; 1902f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1912f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[12] = (a1 + d2 + 4) >> 3; 1922f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[15] = (a1 - d2 + 4) >> 3; 1932f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[13] = (b1 + c2 + 4) >> 3; 1942f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[14] = (b1 - c2 + 4) >> 3; 1952f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 1961b362b15af34006e6a11974088a46d42b903418eJohann ip = output; 1971b362b15af34006e6a11974088a46d42b903418eJohann 1981b362b15af34006e6a11974088a46d42b903418eJohann for (r = 0; r < 4; r++) 1991b362b15af34006e6a11974088a46d42b903418eJohann { 2001b362b15af34006e6a11974088a46d42b903418eJohann for (c = 0; c < 4; c++) 2011b362b15af34006e6a11974088a46d42b903418eJohann { 2021b362b15af34006e6a11974088a46d42b903418eJohann short a = ip[c] + pred_ptr[c] ; 2031b362b15af34006e6a11974088a46d42b903418eJohann dst_ptr[c] = cm[a] ; 2041b362b15af34006e6a11974088a46d42b903418eJohann } 2051b362b15af34006e6a11974088a46d42b903418eJohann 2061b362b15af34006e6a11974088a46d42b903418eJohann ip += 4; 2071b362b15af34006e6a11974088a46d42b903418eJohann dst_ptr += dst_stride; 2081b362b15af34006e6a11974088a46d42b903418eJohann pred_ptr += pred_stride; 2091b362b15af34006e6a11974088a46d42b903418eJohann } 2101b362b15af34006e6a11974088a46d42b903418eJohann} 2112f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2121b362b15af34006e6a11974088a46d42b903418eJohannvoid vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) 2132f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan{ 2141b362b15af34006e6a11974088a46d42b903418eJohann int a1; 2151b362b15af34006e6a11974088a46d42b903418eJohann int i, absa1; 2162f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan int t2, vector_a1, vector_a; 2172f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2182f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan /* a1 = ((input_dc + 4) >> 3); */ 2192f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan __asm__ __volatile__ ( 2202f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "addi %[a1], %[input_dc], 4 \n\t" 2212f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "sra %[a1], %[a1], 3 \n\t" 2222f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : [a1] "=r" (a1) 2232f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : [input_dc] "r" (input_dc) 2242f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ); 2252f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2261b362b15af34006e6a11974088a46d42b903418eJohann if (a1 < 0) 2271b362b15af34006e6a11974088a46d42b903418eJohann { 2282f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan /* use quad-byte 2292f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan * input and output memory are four byte aligned 2302f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan */ 2312f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan __asm__ __volatile__ ( 2322f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "abs %[absa1], %[a1] \n\t" 2332f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "replv.qb %[vector_a1], %[absa1] \n\t" 2342f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) 2352f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : [a1] "r" (a1) 2362f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ); 2372f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2382f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan /* use (a1 - predptr[c]) instead a1 + predptr[c] */ 2392f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan for (i = 4; i--;) 2402f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan { 2412f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan __asm__ __volatile__ ( 2422f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "lw %[t2], 0(%[pred_ptr]) \n\t" 2431b362b15af34006e6a11974088a46d42b903418eJohann "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t" 2442f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" 2452f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "sw %[vector_a], 0(%[dst_ptr]) \n\t" 2461b362b15af34006e6a11974088a46d42b903418eJohann "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 2472f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), 2482f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr) 2491b362b15af34006e6a11974088a46d42b903418eJohann : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1) 2502f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ); 2512f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan } 2522f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan } 2531b362b15af34006e6a11974088a46d42b903418eJohann else 2541b362b15af34006e6a11974088a46d42b903418eJohann { 2552f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan /* use quad-byte 2562f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan * input and output memory are four byte aligned 2572f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan */ 2582f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan __asm__ __volatile__ ( 2592f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "replv.qb %[vector_a1], %[a1] \n\t" 2602f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : [vector_a1] "=r" (vector_a1) 2612f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : [a1] "r" (a1) 2622f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ); 2632f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2642f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan for (i = 4; i--;) 2652f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan { 2662f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan __asm__ __volatile__ ( 2672f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "lw %[t2], 0(%[pred_ptr]) \n\t" 2681b362b15af34006e6a11974088a46d42b903418eJohann "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t" 2692f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "addu_s.qb %[vector_a], %[vector_a1], %[t2] \n\t" 2702f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan "sw %[vector_a], 0(%[dst_ptr]) \n\t" 2711b362b15af34006e6a11974088a46d42b903418eJohann "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 2722f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), 2732f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr) 2741b362b15af34006e6a11974088a46d42b903418eJohann : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1) 2752f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ); 2762f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan } 2772f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan } 2782f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2791b362b15af34006e6a11974088a46d42b903418eJohann} 2802f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2811b362b15af34006e6a11974088a46d42b903418eJohannvoid vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff) 2822f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan{ 2831b362b15af34006e6a11974088a46d42b903418eJohann short output[16]; 2842f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan int i; 2852f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan int a1, b1, c1, d1; 2862f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan int a2, b2, c2, d2; 2872f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan short *ip = input; 2882f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan short *op = output; 2892f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2902f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan prefetch_load_short(ip); 2912f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2922f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan for (i = 4; i--;) 2932f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan { 2942f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[0] + ip[12]; 2952f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[4] + ip[8]; 2962f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c1 = ip[4] - ip[8]; 2972f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d1 = ip[0] - ip[12]; 2982f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 2992f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[0] = a1 + b1; 3002f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[4] = c1 + d1; 3012f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[8] = a1 - b1; 3022f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[12] = d1 - c1; 3032f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3042f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ip++; 3052f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op++; 3062f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan } 3072f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3082f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ip = output; 3092f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op = output; 3102f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3112f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan prefetch_load_short(ip); 3122f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3132f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan for (i = 4; i--;) 3142f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan { 3152f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ip[0] + ip[3] + 3; 3162f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b1 = ip[1] + ip[2]; 3172f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c1 = ip[1] - ip[2]; 3182f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d1 = ip[0] - ip[3] + 3; 3192f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3202f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a2 = a1 + b1; 3212f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan b2 = d1 + c1; 3222f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan c2 = a1 - b1; 3232f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan d2 = d1 - c1; 3242f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3252f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[0] = a2 >> 3; 3262f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[1] = b2 >> 3; 3272f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[2] = c2 >> 3; 3282f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op[3] = d2 >> 3; 3292f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3302f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan ip += 4; 3312f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan op += 4; 3322f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan } 3332f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3341b362b15af34006e6a11974088a46d42b903418eJohann for (i = 0; i < 16; i++) 3351b362b15af34006e6a11974088a46d42b903418eJohann { 3361b362b15af34006e6a11974088a46d42b903418eJohann mb_dqcoeff[i * 16] = output[i]; 3371b362b15af34006e6a11974088a46d42b903418eJohann } 3381b362b15af34006e6a11974088a46d42b903418eJohann} 3392f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3401b362b15af34006e6a11974088a46d42b903418eJohannvoid vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff) 3412f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan{ 3422f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan int a1; 3432f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3442f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan a1 = ((input[0] + 3) >> 3); 3452f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3462f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan __asm__ __volatile__ ( 3471b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 0(%[mb_dqcoeff]) \n\t" 3481b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 32(%[mb_dqcoeff]) \n\t" 3491b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 64(%[mb_dqcoeff]) \n\t" 3501b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 96(%[mb_dqcoeff]) \n\t" 3511b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 128(%[mb_dqcoeff]) \n\t" 3521b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 160(%[mb_dqcoeff]) \n\t" 3531b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 192(%[mb_dqcoeff]) \n\t" 3541b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 224(%[mb_dqcoeff]) \n\t" 3551b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 256(%[mb_dqcoeff]) \n\t" 3561b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 288(%[mb_dqcoeff]) \n\t" 3571b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 320(%[mb_dqcoeff]) \n\t" 3581b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 352(%[mb_dqcoeff]) \n\t" 3591b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 384(%[mb_dqcoeff]) \n\t" 3601b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 416(%[mb_dqcoeff]) \n\t" 3611b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 448(%[mb_dqcoeff]) \n\t" 3621b362b15af34006e6a11974088a46d42b903418eJohann "sh %[a1], 480(%[mb_dqcoeff]) \n\t" 3632f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan 3641b362b15af34006e6a11974088a46d42b903418eJohann : 3651b362b15af34006e6a11974088a46d42b903418eJohann : [a1] "r" (a1), [mb_dqcoeff] "r" (mb_dqcoeff) 3661b362b15af34006e6a11974088a46d42b903418eJohann ); 3672f01f9a5c363613e7389fb28c250edcd4509f815Dragan Mrdjan} 3681b362b15af34006e6a11974088a46d42b903418eJohann 3691b362b15af34006e6a11974088a46d42b903418eJohann#endif 370