1/* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "vpx_ports/config.h" 12#include "onyxc_int.h" 13 14static const int cospi8sqrt2minus1 = 20091; 15static const int sinpi8sqrt2 = 35468; 16 17 18inline void prefetch_load_short(short* src) { 19 __asm__ __volatile__ ( 20 "pref 0, 0(%[src]) \n\t" 21 : 22 : [src] "r" (src) 23 ); 24} 25 26 27void vp8_short_idct4x4llm_mips(short *input, short *output, int pitch) 28{ 29 int i; 30 int a1, b1, c1, d1, c2, d2; 31 32 /* pitch has always value 4 */ 33 34 short *ip = input; 35 short *op = output; 36 int temp1, temp2, temp3, temp4; 37 38 /* prepare data for load */ 39 prefetch_load_short(ip + 8); 40 41 /* first loop is unrolled */ 42 a1 = ip[0] + ip[8]; 43 b1 = ip[0] - ip[8]; 44 45 temp1 = (ip[4] * sinpi8sqrt2) >> 16; 46 temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); 47 c1 = temp1 - temp2; 48 49 temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); 50 temp2 = (ip[12] * sinpi8sqrt2) >> 16; 51 d1 = temp1 + temp2; 52 53 temp3 = (ip[5] * sinpi8sqrt2) >> 16; 54 temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16); 55 c2 = temp3 - temp4; 56 57 temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16); 58 temp4 = (ip[13] * sinpi8sqrt2) >> 16; 59 d2 = temp3 + temp4; 60 61 op[0] = a1 + d1; 62 op[12] = a1 - d1; 63 op[4] = b1 + c1; 64 op[8] = b1 - c1; 65 66 a1 = ip[1] + ip[9]; 67 b1 = ip[1] - ip[9]; 68 69 op[1] = a1 + d2; 70 op[13] = a1 - d2; 71 op[5] = b1 + c2; 72 op[9] = b1 - c2; 73 74 a1 = ip[2] + ip[10]; 75 b1 = ip[2] - ip[10]; 76 77 temp1 = (ip[6] * sinpi8sqrt2) >> 16; 78 temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16); 79 c1 = temp1 - temp2; 80 81 temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16); 82 temp2 = (ip[14] * sinpi8sqrt2) >> 16; 83 d1 = temp1 + temp2; 84 85 temp3 = (ip[7] * sinpi8sqrt2) >> 16; 86 temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16); 87 c2 = temp3 - temp4; 88 89 temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16); 90 temp4 = (ip[15] * sinpi8sqrt2) >> 16; 91 d2 = temp3 + temp4; 92 93 op[2] = a1 + d1; 94 op[14] = a1 - d1; 95 op[6] = b1 + c1; 96 op[10] = b1 - c1; 97 98 a1 = ip[3] + ip[11]; 99 b1 = ip[3] - ip[11]; 100 101 op[3] = a1 + d2; 102 op[15] = a1 - d2; 103 op[7] = b1 + c2; 104 op[11] = b1 - c2; 105 106 ip = output; 107 108 /* prepare data for load */ 109 prefetch_load_short(ip + pitch); 110 111 /* second loop is unrolled */ 112 a1 = ip[0] + ip[2]; 113 b1 = ip[0] - ip[2]; 114 115 temp1 = (ip[1] * sinpi8sqrt2) >> 16; 116 temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); 117 c1 = temp1 - temp2; 118 119 temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); 120 temp2 = (ip[3] * sinpi8sqrt2) >> 16; 121 d1 = temp1 + temp2; 122 123 temp3 = (ip[5] * sinpi8sqrt2) >> 16; 124 temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16); 125 c2 = temp3 - temp4; 126 127 temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16); 128 temp4 = (ip[7] * sinpi8sqrt2) >> 16; 129 d2 = temp3 + temp4; 130 131 op[0] = (a1 + d1 + 4) >> 3; 132 op[3] = (a1 - d1 + 4) >> 3; 133 op[1] = (b1 + c1 + 4) >> 3; 134 op[2] = (b1 - c1 + 4) >> 3; 135 136 a1 = ip[4] + ip[6]; 137 b1 = ip[4] - ip[6]; 138 139 op[4] = (a1 + d2 + 4) >> 3; 140 op[7] = (a1 - d2 + 4) >> 3; 141 op[5] = (b1 + c2 + 4) >> 3; 142 op[6] = (b1 - c2 + 4) >> 3; 143 144 a1 = ip[8] + ip[10]; 145 b1 = ip[8] - ip[10]; 146 147 temp1 = (ip[9] * sinpi8sqrt2) >> 16; 148 temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16); 149 c1 = temp1 - temp2; 150 151 temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16); 152 temp2 = (ip[11] * sinpi8sqrt2) >> 16; 153 d1 = temp1 + temp2; 154 155 temp3 = (ip[13] * sinpi8sqrt2) >> 16; 156 temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16); 157 c2 = temp3 - temp4; 158 159 temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16); 160 temp4 = (ip[15] * sinpi8sqrt2) >> 16; 161 d2 = temp3 + temp4; 162 163 op[8] = (a1 + d1 + 4) >> 3; 164 op[11] = (a1 - d1 + 4) >> 3; 165 op[9] = (b1 + c1 + 4) >> 3; 166 op[10] = (b1 - c1 + 4) >> 3; 167 168 a1 = ip[12] + ip[14]; 169 b1 = ip[12] - ip[14]; 170 171 op[12] = (a1 + d2 + 4) >> 3; 172 op[15] = (a1 - d2 + 4) >> 3; 173 op[13] = (b1 + c2 + 4) >> 3; 174 op[14] = (b1 - c2 + 4) >> 3; 175} 176 177 178void vp8_dc_only_idct_add_mips 179( 180 short input_dc, 181 unsigned char *pred_ptr, 182 unsigned char *dst_ptr, 183 int pitch, 184 int stride 185) 186{ 187 int i, a1, absa1; 188 int t2, vector_a1, vector_a; 189 190 /* a1 = ((input_dc + 4) >> 3); */ 191 __asm__ __volatile__ ( 192 "addi %[a1], %[input_dc], 4 \n\t" 193 "sra %[a1], %[a1], 3 \n\t" 194 : [a1] "=r" (a1) 195 : [input_dc] "r" (input_dc) 196 ); 197 198 /* first for loop is unrolled 199 * if (a1 < 0) then always (a1 + pred_ptr[c]) < 255 200 */ 201 if (a1 < 0) { 202 /* use quad-byte 203 * input and output memory are four byte aligned 204 */ 205 __asm__ __volatile__ ( 206 "abs %[absa1], %[a1] \n\t" 207 "replv.qb %[vector_a1], %[absa1] \n\t" 208 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) 209 : [a1] "r" (a1) 210 ); 211 212 /* use (a1 - predptr[c]) instead a1 + predptr[c] */ 213 for (i = 4; i--;) 214 { 215 __asm__ __volatile__ ( 216 "lw %[t2], 0(%[pred_ptr]) \n\t" 217 "add %[pred_ptr], %[pred_ptr], %[pitch] \n\t" 218 "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" 219 "sw %[vector_a], 0(%[dst_ptr]) \n\t" 220 "add %[dst_ptr], %[dst_ptr], %[stride] \n\t" 221 : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), 222 [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr) 223 : [stride] "r" (stride), [pitch] "r" (pitch), [vector_a1] "r" (vector_a1) 224 ); 225 } 226 } 227 else { 228 /* use quad-byte 229 * input and output memory are four byte aligned 230 */ 231 __asm__ __volatile__ ( 232 "replv.qb %[vector_a1], %[a1] \n\t" 233 : [vector_a1] "=r" (vector_a1) 234 : [a1] "r" (a1) 235 ); 236 237 for (i = 4; i--;) 238 { 239 __asm__ __volatile__ ( 240 "lw %[t2], 0(%[pred_ptr]) \n\t" 241 "add %[pred_ptr], %[pred_ptr], %[pitch] \n\t" 242 "addu_s.qb %[vector_a], %[vector_a1], %[t2] \n\t" 243 "sw %[vector_a], 0(%[dst_ptr]) \n\t" 244 "add %[dst_ptr], %[dst_ptr], %[stride] \n\t" 245 : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), 246 [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr) 247 : [stride] "r" (stride), [pitch] "r" (pitch), [vector_a1] "r" (vector_a1) 248 ); 249 } 250 } 251} 252 253 254void vp8_short_inv_walsh4x4_mips(short *input, short *output) 255{ 256 int i; 257 int a1, b1, c1, d1; 258 int a2, b2, c2, d2; 259 260 short *ip = input; 261 short *op = output; 262 263 prefetch_load_short(ip); 264 265 for (i = 4; i--;) 266 { 267 a1 = ip[0] + ip[12]; 268 b1 = ip[4] + ip[8]; 269 c1 = ip[4] - ip[8]; 270 d1 = ip[0] - ip[12]; 271 272 op[0] = a1 + b1; 273 op[4] = c1 + d1; 274 op[8] = a1 - b1; 275 op[12] = d1 - c1; 276 277 ip++; 278 op++; 279 } 280 281 ip = output; 282 op = output; 283 284 prefetch_load_short(ip); 285 286 for (i = 4; i--;) 287 { 288 a1 = ip[0] + ip[3] + 3; 289 b1 = ip[1] + ip[2]; 290 c1 = ip[1] - ip[2]; 291 d1 = ip[0] - ip[3] + 3; 292 293 a2 = a1 + b1; 294 b2 = d1 + c1; 295 c2 = a1 - b1; 296 d2 = d1 - c1; 297 298 op[0] = a2 >> 3; 299 op[1] = b2 >> 3; 300 op[2] = c2 >> 3; 301 op[3] = d2 >> 3; 302 303 ip += 4; 304 op += 4; 305 } 306} 307 308 309void vp8_short_inv_walsh4x4_1_mips(short *input, short *output) 310{ 311 int a1; 312 int vect_a; 313 unsigned int *op = (unsigned int *)output; 314 315 a1 = ((input[0] + 3) >> 3); 316 317 __asm__ __volatile__ ( 318 "replv.ph %[vect_a], %[a1] \n\t" 319 : [vect_a] "=r" (vect_a) 320 : [a1] "r" (a1) 321 ); 322 323 /* output is 4 byte aligned */ 324 op[0] = vect_a; 325 op[1] = vect_a; 326 op[2] = vect_a; 327 op[3] = vect_a; 328 op[4] = vect_a; 329 op[5] = vect_a; 330 op[6] = vect_a; 331 op[7] = vect_a; 332} 333