/* The gcc-provided loongson intrinsic functions are way too fucking broken
 * to be of any use, otherwise I'd use them.
 *
 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
 *   close enough that they could have implemented the _mm_*-style intrinsic
 *   interface and had a ton of optimized code available to them. Instead they
 *   implemented something much, much worse.
 *
 * - pshuf takes a dead first argument, causing extra instructions to be
 *   generated.
 *
 * - There are no 64-bit shift or logical intrinsics, which means you have
 *   to implement them with inline assembly, but this is a nightmare because
 *   gcc doesn't understand that the integer vector datatypes are actually in
 *   floating-point registers, so you end up with braindead code like
 *
 *       punpcklwd  $f9,$f9,$f5
 *           dmtc1  v0,$f8
 *       punpcklwd  $f19,$f19,$f5
 *           dmfc1  t9,$f9
 *           dmtc1  v0,$f9
 *           dmtc1  t9,$f20
 *           dmfc1  s0,$f19
 *       punpcklbh  $f20,$f20,$f2
 *
 *   where crap just gets copied back and forth between integer and floating-
 *   point registers ad nauseam.
 *
 * Instead of trying to work around the problems from these crap intrinsics, I
 * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
 * assembly.
 *
 * Implementation notes:
 *
 * - `__asm__` is used rather than `asm` so the header also builds in strict
 *   ISO modes (-std=c99, -std=c11), matching the `__inline`/`__attribute__`
 *   spellings used throughout.
 *
 * - Integer bit patterns are moved into vector types through a union instead
 *   of a pointer cast; reading an integer object through a `double *` or
 *   `float *` violates the strict-aliasing rules.
 */

#include <stdint.h>

/* vectors are stored in 64-bit floating-point registers */
typedef double __m64;
/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
typedef float __m32;

/* Reinterpret the 64 bits of __bits as an __m64. No numeric conversion takes
 * place: the value is stored to one union member and read back from the other. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_m64_from_bits (uint64_t __bits)
{
    union { uint64_t i; __m64 f; } pun;

    pun.i = __bits;
    return pun.f;
}

/* _mm_setzero_si64: returns 0.0, whose representation is all-zero bits. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
    return 0.0;
}

/* _mm_add_pi16, implemented with the paddh instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("paddh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_add_pi32, implemented with the paddw instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("paddw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_adds_pu16, implemented with the paddush instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("paddush %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_adds_pu8, implemented with the paddusb instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("paddusb %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_and_si64, implemented with the and instruction on FP-register operands. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("and %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_cmpeq_pi32, implemented with the pcmpeqw instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("pcmpeqw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_empty: intentionally a no-op here; it exists so code written against
 * the MMX API compiles unchanged. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}

/* _mm_madd_pi16, implemented with the pmaddhw instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("pmaddhw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_mulhi_pu16, implemented with the pmulhuh instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("pmulhuh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_mullo_pi16, implemented with the pmullh instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("pmullh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_or_si64, implemented with the or instruction on FP-register operands. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("or %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_packs_pu16, implemented with the packushb instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("packushb %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_packs_pi32, implemented with the packsswh instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("packsswh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* Builds a pshufh selector: two bits per destination halfword, fp3 highest. */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))

/* _mm_set_pi16: builds a vector from four 16-bit values, __w3 in the most
 * significant halfword and __w0 in the least significant.
 *
 * - All arguments compile-time constants: return the packed constant.
 * - Otherwise, all four values equal: broadcast halfword 0 with pshufh.
 * - Otherwise: pack with integer shifts and ORs. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
{
    const uint64_t packed = ((uint64_t)__w3 << 48)
                          | ((uint64_t)__w2 << 32)
                          | ((uint64_t)__w1 << 16)
                          | ((uint64_t)__w0 <<  0);

    if (__builtin_constant_p (__w3) &&
        __builtin_constant_p (__w2) &&
        __builtin_constant_p (__w1) &&
        __builtin_constant_p (__w0))
    {
        return loongson_m64_from_bits (packed);
    }

    if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
    {
        /* TODO: handle other cases */
        __m64 ret;
        __asm__("pshufh %0, %1, %2\n\t"
            : "=f" (ret)
            : "f" (loongson_m64_from_bits (__w3)),
              "f" (loongson_m64_from_bits (_MM_SHUFFLE (0, 0, 0, 0)))
        );
        return ret;
    }

    return loongson_m64_from_bits (packed);
}

/* _mm_set_pi32: builds a vector from two 32-bit values, __i1 in the upper
 * half and __i0 in the lower half.
 *
 * - Both arguments compile-time constants: return the packed constant.
 * - Otherwise, both values equal: replicate the low 32 bits with pshufh.
 * - Otherwise: pack with an integer shift and OR. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (unsigned __i1, unsigned __i0)
{
    const uint64_t packed = ((uint64_t)__i1 << 32)
                          | ((uint64_t)__i0 <<  0);

    if (__builtin_constant_p (__i1) &&
        __builtin_constant_p (__i0))
    {
        return loongson_m64_from_bits (packed);
    }

    if (__i1 == __i0)
    {
        /* Only the low two halfwords are selected by the shuffle, so a
         * 32-bit operand is sufficient as the source. */
        union { unsigned u; __m32 f; } low;
        __m64 ret;

        low.u = __i1;
        __asm__("pshufh %0, %1, %2\n\t"
            : "=f" (ret)
            : "f" (low.f),
              "f" (loongson_m64_from_bits (_MM_SHUFFLE (1, 0, 1, 0)))
        );
        return ret;
    }

    return loongson_m64_from_bits (packed);
}
#undef _MM_SHUFFLE

/* _mm_shuffle_pi16, implemented with pshufh; the selector __n is passed to
 * the instruction in a floating-point register. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
    __m64 ret;
    __asm__("pshufh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (loongson_m64_from_bits ((uint64_t)__n))
    );
    return ret;
}

/* _mm_slli_pi16, implemented with psllh; the count is passed in an FP register. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    __asm__("psllh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (loongson_m64_from_bits ((uint64_t)__count))
    );
    return ret;
}

/* _mm_slli_si64, implemented with dsll on FP-register operands. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    __asm__("dsll %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (loongson_m64_from_bits ((uint64_t)__count))
    );
    return ret;
}

/* _mm_srli_pi16, implemented with psrlh; the count is passed in an FP register. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    __asm__("psrlh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (loongson_m64_from_bits ((uint64_t)__count))
    );
    return ret;
}

/* _mm_srli_pi32, implemented with psrlw; the count is passed in an FP register. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int64_t __count)
{
    __m64 ret;
    __asm__("psrlw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (loongson_m64_from_bits ((uint64_t)__count))
    );
    return ret;
}

/* _mm_srli_si64, implemented with dsrl on FP-register operands. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    __asm__("dsrl %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (loongson_m64_from_bits ((uint64_t)__count))
    );
    return ret;
}

/* _mm_sub_pi16, implemented with the psubh instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("psubh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_unpackhi_pi8, implemented with the punpckhbh instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("punpckhbh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_unpackhi_pi16, implemented with the punpckhhw instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("punpckhhw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_unpacklo_pi8, implemented with the punpcklbh instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("punpcklbh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which
 * allows load8888 to use 32-bit loads */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("punpcklbh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_unpacklo_pi16, implemented with the punpcklhw instruction. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("punpcklhw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* _mm_xor_si64, implemented with the xor instruction on FP-register operands. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    __asm__("xor %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* Wraps the pextrh instruction; the position __pos is passed to the
 * instruction in a floating-point register. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_extract_pi16 (__m64 __m, int64_t __pos)
{
    __m64 ret;
    __asm__("pextrh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (loongson_m64_from_bits ((uint64_t)__pos))
    );
    return ret;
}

/* Wraps the pinsrh_<n> instruction family. __pos is pasted into the mnemonic
 * through an "i" (immediate) constraint, so it must be a compile-time
 * constant at every call site. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
{
    __m64 ret;
    __asm__("pinsrh_%3 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
    );
    return ret;
}