1/* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vp8_rtcd.h" 12#include "vpx_ports/mem.h" 13#include "vpx_ports/asmdefs_mmi.h" 14 15/* clang-format off */ 16/* TRANSPOSE_4H: transpose 4x4 matrix. 17 Input: ftmp1,ftmp2,ftmp3,ftmp4 18 Output: ftmp1,ftmp2,ftmp3,ftmp4 19 Note: ftmp0 always be 0, ftmp5~9 used for temporary value. 20 */ 21#define TRANSPOSE_4H \ 22 MMI_LI(%[tmp0], 0x93) \ 23 "mtc1 %[tmp0], %[ftmp10] \n\t" \ 24 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ 25 "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ 26 "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ 27 "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ 28 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ 29 "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ 30 "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ 31 "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ 32 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ 33 "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ 34 "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ 35 "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ 36 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ 37 "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ 38 "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ 39 "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ 40 "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ 41 "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ 42 "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ 43 "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" 44/* clang-format on */ 45 46void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { 47 uint64_t tmp[1]; 48 int16_t *ip = input; 49 50#if _MIPS_SIM == _ABIO32 51 register double ftmp0 asm("$f0"); 52 register double ftmp1 asm("$f2"); 53 register double ftmp2 asm("$f4"); 54 register double ftmp3 asm("$f6"); 55 register double ftmp4 asm("$f8"); 56 register double ftmp5 asm("$f10"); 57 register double ftmp6 asm("$f12"); 58 register double ftmp7 asm("$f14"); 59 register double ftmp8 asm("$f16"); 60 register double ftmp9 asm("$f18"); 61 register double ftmp10 asm("$f20"); 62 register double ftmp11 asm("$f22"); 63 register double ftmp12 asm("$f24"); 64#else 65 register double ftmp0 asm("$f0"); 66 register double ftmp1 asm("$f1"); 67 register double ftmp2 asm("$f2"); 68 register double ftmp3 asm("$f3"); 69 register double ftmp4 asm("$f4"); 70 register double ftmp5 asm("$f5"); 71 register double ftmp6 asm("$f6"); 72 register double ftmp7 asm("$f7"); 73 register double ftmp8 asm("$f8"); 74 register double ftmp9 asm("$f9"); 75 register double ftmp10 asm("$f10"); 76 register double ftmp11 asm("$f11"); 77 register double ftmp12 asm("$f12"); 78#endif // _MIPS_SIM == _ABIO32 79 80 DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; 81 DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL }; 82 DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL }; 83 DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL }; 84 DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL }; 85 DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; 86 DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL }; 87 DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL }; 88 DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; 89 DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; 90 DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; 91 92 __asm__ volatile ( 93 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 94 "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" 95 "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" 96 MMI_ADDU(%[ip], %[ip], %[pitch]) 97 "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" 98 "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" 99 MMI_ADDU(%[ip], %[ip], %[pitch]) 100 "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" 101 "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" 102 MMI_ADDU(%[ip], %[ip], %[pitch]) 103 "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" 104 "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" 105 MMI_ADDU(%[ip], %[ip], %[pitch]) 106 TRANSPOSE_4H 107 108 "ldc1 %[ftmp11], %[ff_ph_8] \n\t" 109 // f1 + f4 110 "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" 111 // a1 112 "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t" 113 // f2 + f3 114 "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" 115 // b1 116 "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t" 117 // f2 - f3 118 "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" 119 // c1 120 "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" 121 // f1 - f4 122 "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" 123 // d1 124 "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" 125 // op[0] = a1 + b1 126 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" 127 // op[2] = a1 - b1 128 "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" 129 130 // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 131 MMI_LI(%[tmp0], 0x0c) 132 "mtc1 %[tmp0], %[ftmp11] \n\t" 133 "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" 134 "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 135 "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" 136 "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 137 "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t" 138 "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" 139 "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" 140 "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" 141 "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" 142 "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" 143 144 // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12 145 "ldc1 %[ftmp12], %[ff_pw_7500] \n\t" 146 "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" 147 "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t" 148 "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" 149 "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t" 150 "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" 151 "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" 152 "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" 153 "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" 154 "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" 155 TRANSPOSE_4H 156 157 "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" 158 "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" 159 "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" 160 "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" 161 162 "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t" 163 "ldc1 %[ftmp9], %[ff_ph_01] \n\t" 164 "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" 165 166 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" 167 "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" 168 "ldc1 %[ftmp9], %[ff_ph_07] \n\t" 169 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" 170 "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" 171 MMI_LI(%[tmp0], 0x04) 172 "mtc1 %[tmp0], %[ftmp9] \n\t" 173 "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" 174 "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" 175 176 MMI_LI(%[tmp0], 0x10) 177 "mtc1 %[tmp0], %[ftmp9] \n\t" 178 "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" 179 "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" 180 "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t" 181 "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" 182 "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t" 183 "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" 184 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" 185 "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" 186 "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" 187 "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t" 188 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 189 190 "ldc1 %[ftmp12], %[ff_pw_51000] \n\t" 191 "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" 192 "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t" 193 "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" 194 "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t" 195 "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" 196 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" 197 "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" 198 "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" 199 "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t" 200 201 "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" 202 "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t" 203 "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" 204 "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" 205 "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" 206 "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" 207 "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" 208 "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" 209 210 : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2), 211 [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), 212 [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8), 213 [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), 214 [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip) 215 : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), 216 [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3), 217 [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), 218 [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), 219 [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), 220 [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) 221 : "memory" 222 ); 223} 224 225void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { 226 vp8_short_fdct4x4_mmi(input, output, pitch); 227 vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch); 228} 229 230void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { 231 double ftmp[13]; 232 uint32_t tmp[1]; 233 DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; 234 DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL }; 235 DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL }; 236 DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL }; 237 238 __asm__ volatile ( 239 MMI_LI(%[tmp0], 0x02) 240 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 241 "mtc1 %[tmp0], %[ftmp11] \n\t" 242 243 "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" 244 "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" 245 MMI_ADDU(%[ip], %[ip], %[pitch]) 246 "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" 247 "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" 248 MMI_ADDU(%[ip], %[ip], %[pitch]) 249 "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" 250 "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" 251 MMI_ADDU(%[ip], %[ip], %[pitch]) 252 "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" 253 "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" 254 TRANSPOSE_4H 255 256 "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 257 "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" 258 "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" 259 "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" 260 // a 261 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" 262 // d 263 "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t" 264 // c 265 "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t" 266 // b 267 "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t" 268 269 // a + d 270 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" 271 // b + c 272 "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t" 273 // b - c 274 "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t" 275 // a - d 276 "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" 277 278 "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" 279 "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" 280 "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" 281 TRANSPOSE_4H 282 283 // op[2], op[0] 284 "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t" 285 // op[3], op[1] 286 "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t" 287 288 // op[6], op[4] 289 "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t" 290 // op[7], op[5] 291 "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t" 292 293 // op[10], op[8] 294 "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t" 295 // op[11], op[9] 296 "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t" 297 298 // op[14], op[12] 299 "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t" 300 // op[15], op[13] 301 "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t" 302 303 // a1, a3 304 "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t" 305 // d1, d3 306 "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t" 307 // c1, c3 308 "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t" 309 // b1, b3 310 "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t" 311 312 // a1 + d1, a3 + d3 313 "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t" 314 // b1 + c1, b3 + c3 315 "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t" 316 // b1 - c1, b3 - c3 317 "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t" 318 // a1 - d1, a3 - d3 319 "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 320 321 // a2, a4 322 "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t" 323 // d2, d4 324 "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t" 325 // c2, c4 326 "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t" 327 // b2, b4 328 "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t" 329 330 // a2 + d2, a4 + d4 331 "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 332 // b2 + c2, b4 + c4 333 "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t" 334 // b2 - c2, b4 - c4 335 "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t" 336 // a2 - d2, a4 - d4 337 "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t" 338 339 MMI_LI(%[tmp0], 0x03) 340 "mtc1 %[tmp0], %[ftmp11] \n\t" 341 342 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" 343 "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" 344 "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t" 345 "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t" 346 "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 347 348 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" 349 "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" 350 "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t" 351 "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t" 352 "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t" 353 354 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t" 355 "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" 356 "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t" 357 "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t" 358 "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t" 359 360 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t" 361 "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" 362 "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t" 363 "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t" 364 "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t" 365 366 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t" 367 "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" 368 "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" 369 "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t" 370 "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" 371 372 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t" 373 "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" 374 "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t" 375 "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t" 376 "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" 377 378 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t" 379 "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" 380 "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t" 381 "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t" 382 "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t" 383 384 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t" 385 "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" 386 "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" 387 "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t" 388 "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t" 389 390 "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 391 "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" 392 "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" 393 "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" 394 395 MMI_LI(%[tmp0], 0x72) 396 "mtc1 %[tmp0], %[ftmp11] \n\t" 397 "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 398 "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" 399 "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" 400 "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" 401 402 "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" 403 "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" 404 "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" 405 "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" 406 "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" 407 "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" 408 "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" 409 "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" 410 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), 411 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), 412 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), 413 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), 414 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), 415 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), 416 [ftmp12]"=&f"(ftmp[12]), 417 [tmp0]"=&r"(tmp[0]), 418 [ip]"+&r"(input) 419 : [op]"r"(output), 420 [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch), 421 [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask), 422 [ff_ph_01]"f"(ff_ph_01) 423 : "memory" 424 ); 425} 426