/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vp8_rtcd.h"

#if HAVE_DSPR2
#define CROP_WIDTH 256

/******************************************************************************
 * Notes:
 *
 * This implementation makes use of 16 bit fixed point versions of two
 * multiply constants:
 * 1. sqrt(2) * cos(pi/8)
 * 2. sqrt(2) * sin(pi/8)
 * Since the first constant is bigger than 1, to maintain the same 16 bit
 * fixed point precision as the second one, we use the identity
 *   x * a = x + x * (a - 1)
 * so
 *   x * sqrt(2) * cos(pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
 ****************************************************************************/
extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
static const int cospi8sqrt2minus1 = 20091;
static const int sinpi8sqrt2 = 35468;
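
/*
 * Worked example of the Q16 constants above (values rounded to the
 * nearest integer):
 *   sqrt(2) * cos(pi/8) - 1 = 0.30656...  ->  0.30656 * 65536 = 20091
 *   sqrt(2) * sin(pi/8)     = 0.54120...  ->  0.54120 * 65536 = 35468
 * With the identity from the notes, a product such as
 * x * sqrt(2) * cos(pi/8) becomes x + ((x * cospi8sqrt2minus1) >> 16),
 * which is exactly the form used throughout the transforms below.
 */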

inline void prefetch_load_short(short *src) {
  __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
}

void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
                                int pred_stride, unsigned char *dst_ptr,
                                int dst_stride) {
  int r, c;
  int a1, b1, c1, d1;
  short output[16];
  short *ip = input;
  short *op = output;
  int temp1, temp2;
  int shortpitch = 4;

  int c2, d2;
  int temp3, temp4;
  unsigned char *cm = ff_cropTbl + CROP_WIDTH;

  /* prepare data for load */
  prefetch_load_short(ip + 8);

  /* first loop is unrolled */
  a1 = ip[0] + ip[8];
  b1 = ip[0] - ip[8];

  temp1 = (ip[4] * sinpi8sqrt2) >> 16;
  temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
  c1 = temp1 - temp2;

  temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
  temp2 = (ip[12] * sinpi8sqrt2) >> 16;
  d1 = temp1 + temp2;

  temp3 = (ip[5] * sinpi8sqrt2) >> 16;
  temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
  c2 = temp3 - temp4;

  temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
  temp4 = (ip[13] * sinpi8sqrt2) >> 16;
  d2 = temp3 + temp4;

  op[0] = a1 + d1;
  op[12] = a1 - d1;
  op[4] = b1 + c1;
  op[8] = b1 - c1;

  a1 = ip[1] + ip[9];
  b1 = ip[1] - ip[9];

  op[1] = a1 + d2;
  op[13] = a1 - d2;
  op[5] = b1 + c2;
  op[9] = b1 - c2;

  a1 = ip[2] + ip[10];
  b1 = ip[2] - ip[10];

  temp1 = (ip[6] * sinpi8sqrt2) >> 16;
  temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
  c1 = temp1 - temp2;

  temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
  temp2 = (ip[14] * sinpi8sqrt2) >> 16;
  d1 = temp1 + temp2;

  temp3 = (ip[7] * sinpi8sqrt2) >> 16;
  temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
  c2 = temp3 - temp4;

  temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
  temp4 = (ip[15] * sinpi8sqrt2) >> 16;
  d2 = temp3 + temp4;

  op[2] = a1 + d1;
  op[14] = a1 - d1;
  op[6] = b1 + c1;
  op[10] = b1 - c1;

  a1 = ip[3] + ip[11];
  b1 = ip[3] - ip[11];

  op[3] = a1 + d2;
  op[15] = a1 - d2;
  op[7] = b1 + c2;
  op[11] = b1 - c2;

  ip = output;

  /* prepare data for load */
  prefetch_load_short(ip + shortpitch);

  /* second loop is unrolled */
  a1 = ip[0] + ip[2];
  b1 = ip[0] - ip[2];

  temp1 = (ip[1] * sinpi8sqrt2) >> 16;
  temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
  c1 = temp1 - temp2;

  temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
  temp2 = (ip[3] * sinpi8sqrt2) >> 16;
  d1 = temp1 + temp2;

  temp3 = (ip[5] * sinpi8sqrt2) >> 16;
  temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
  c2 = temp3 - temp4;

  temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
  temp4 = (ip[7] * sinpi8sqrt2) >> 16;
  d2 = temp3 + temp4;

  op[0] = (a1 + d1 + 4) >> 3;
  op[3] = (a1 - d1 + 4) >> 3;
  op[1] = (b1 + c1 + 4) >> 3;
  op[2] = (b1 - c1 + 4) >> 3;

  a1 = ip[4] + ip[6];
  b1 = ip[4] - ip[6];

  op[4] = (a1 + d2 + 4) >> 3;
  op[7] = (a1 - d2 + 4) >> 3;
  op[5] = (b1 + c2 + 4) >> 3;
  op[6] = (b1 - c2 + 4) >> 3;

  a1 = ip[8] + ip[10];
  b1 = ip[8] - ip[10];

  temp1 = (ip[9] * sinpi8sqrt2) >> 16;
  temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16);
  c1 = temp1 - temp2;

  temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16);
  temp2 = (ip[11] * sinpi8sqrt2) >> 16;
  d1 = temp1 + temp2;

  temp3 = (ip[13] * sinpi8sqrt2) >> 16;
  temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
  c2 = temp3 - temp4;

  temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
  temp4 = (ip[15] * sinpi8sqrt2) >> 16;
  d2 = temp3 + temp4;

  op[8] = (a1 + d1 + 4) >> 3;
  op[11] = (a1 - d1 + 4) >> 3;
  op[9] = (b1 + c1 + 4) >> 3;
  op[10] = (b1 - c1 + 4) >> 3;

  a1 = ip[12] + ip[14];
  b1 = ip[12] - ip[14];

  op[12] = (a1 + d2 + 4) >> 3;
  op[15] = (a1 - d2 + 4) >> 3;
  op[13] = (b1 + c2 + 4) >> 3;
  op[14] = (b1 - c2 + 4) >> 3;

  ip = output;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      short a = ip[c] + pred_ptr[c];
      dst_ptr[c] = cm[a];
    }

    ip += 4;
    dst_ptr += dst_stride;
    pred_ptr += pred_stride;
  }
}
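
/*
 * Adds the rounded DC coefficient to a 4x4 prediction block. The MIPS DSP
 * ASE replv.qb instruction replicates the (absolute) DC value into all four
 * byte lanes of a register, so each saturating quad-byte add/subtract
 * (addu_s.qb / subu_s.qb) below processes four prediction pixels per
 * iteration; the unsigned saturation stands in for an explicit clamp to
 * [0, 255].
 */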
void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr,
                                int pred_stride, unsigned char *dst_ptr,
                                int dst_stride) {
  int a1;
  int i, absa1;
  int t2, vector_a1, vector_a;

  /* a1 = ((input_dc + 4) >> 3); */
  __asm__ __volatile__(
      "addi  %[a1], %[input_dc], 4 \n\t"
      "sra   %[a1], %[a1], 3 \n\t"
      : [a1] "=r"(a1)
      : [input_dc] "r"(input_dc));

  if (a1 < 0) {
    /* use quad-byte operations;
     * input and output memory are four byte aligned
     */
    __asm__ __volatile__(
        "abs       %[absa1], %[a1] \n\t"
        "replv.qb  %[vector_a1], %[absa1] \n\t"
        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
        : [a1] "r"(a1));

    /* compute (pred_ptr[c] - |a1|) instead of (a1 + pred_ptr[c]) */
    for (i = 4; i--;) {
      __asm__ __volatile__(
          "lw         %[t2], 0(%[pred_ptr]) \n\t"
          "add        %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t"
          "subu_s.qb  %[vector_a], %[t2], %[vector_a1] \n\t"
          "sw         %[vector_a], 0(%[dst_ptr]) \n\t"
          "add        %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a),
            [dst_ptr] "+&r"(dst_ptr), [pred_ptr] "+&r"(pred_ptr)
          : [dst_stride] "r"(dst_stride), [pred_stride] "r"(pred_stride),
            [vector_a1] "r"(vector_a1));
    }
  } else {
    /* use quad-byte operations;
     * input and output memory are four byte aligned
     */
    __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t"
                         : [vector_a1] "=r"(vector_a1)
                         : [a1] "r"(a1));

    for (i = 4; i--;) {
      __asm__ __volatile__(
          "lw         %[t2], 0(%[pred_ptr]) \n\t"
          "add        %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t"
          "addu_s.qb  %[vector_a], %[vector_a1], %[t2] \n\t"
          "sw         %[vector_a], 0(%[dst_ptr]) \n\t"
          "add        %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a),
            [dst_ptr] "+&r"(dst_ptr), [pred_ptr] "+&r"(pred_ptr)
          : [dst_stride] "r"(dst_stride), [pred_stride] "r"(pred_stride),
            [vector_a1] "r"(vector_a1));
    }
  }
}

void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff) {
  short output[16];
  int i;
  int a1, b1, c1, d1;
  int a2, b2, c2, d2;
  short *ip = input;
  short *op = output;

  prefetch_load_short(ip);

  for (i = 4; i--;) {
    a1 = ip[0] + ip[12];
    b1 = ip[4] + ip[8];
    c1 = ip[4] - ip[8];
    d1 = ip[0] - ip[12];

    op[0] = a1 + b1;
    op[4] = c1 + d1;
    op[8] = a1 - b1;
    op[12] = d1 - c1;

    ip++;
    op++;
  }

  ip = output;
  op = output;

  prefetch_load_short(ip);

  for (i = 4; i--;) {
    /* the +3 rounding bias is folded into a1 and d1, so each of the four
     * outputs below carries it exactly once before the >> 3
     */
    a1 = ip[0] + ip[3] + 3;
    b1 = ip[1] + ip[2];
    c1 = ip[1] - ip[2];
    d1 = ip[0] - ip[3] + 3;

    a2 = a1 + b1;
    b2 = d1 + c1;
    c2 = a1 - b1;
    d2 = d1 - c1;

    op[0] = a2 >> 3;
    op[1] = b2 >> 3;
    op[2] = c2 >> 3;
    op[3] = d2 >> 3;

    ip += 4;
    op += 4;
  }

  for (i = 0; i < 16; ++i) {
    mb_dqcoeff[i * 16] = output[i];
  }
}

void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff) {
  int a1;

  a1 = ((input[0] + 3) >> 3);

  __asm__ __volatile__(
      "sh %[a1], 0(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 32(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 64(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 96(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 128(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 160(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 192(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 224(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 256(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 288(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 320(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 352(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 384(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 416(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 448(%[mb_dqcoeff]) \n\t"
      "sh %[a1], 480(%[mb_dqcoeff]) \n\t"

      :
      : [a1] "r"(a1), [mb_dqcoeff] "r"(mb_dqcoeff));
}

#endif