/* The gcc-provided loongson intrinsic functions are way too fucking broken
 * to be of any use, otherwise I'd use them.
 *
 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
 *   close enough that they could have implemented the _mm_*-style intrinsic
 *   interface and had a ton of optimized code available to them. Instead they
 *   implemented something much, much worse.
 *
 * - pshuf takes a dead first argument, causing extra instructions to be
 *   generated.
 *
 * - There are no 64-bit shift or logical intrinsics, which means you have
 *   to implement them with inline assembly, but this is a nightmare because
 *   gcc doesn't understand that the integer vector datatypes are actually in
 *   floating-point registers, so you end up with braindead code like
 *
 *	punpcklwd	$f9,$f9,$f5
 *	    dmtc1	v0,$f8
 *	punpcklwd	$f19,$f19,$f5
 *	    dmfc1	t9,$f9
 *	    dmtc1	v0,$f9
 *	    dmtc1	t9,$f20
 *	    dmfc1	s0,$f19
 *	punpcklbh	$f20,$f20,$f2
 *
 *   where values just get copied back and forth between integer and floating-
 *   point registers ad nauseam.
 *
 * Instead of trying to work around the problems from these crap intrinsics, I
 * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
 * assembly.
 */

#include <stdint.h>

/* vectors are stored in 64-bit floating-point registers */
typedef double __m64;
/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
typedef float  __m32;

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
	return 0.0;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddush %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddusb %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("and %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pcmpeqw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
	/* Loongson's MMI state does not alias the x87 stack the way x86 MMX
	 * does, so there is nothing to reset; this exists only so that
	 * pixman-mmx.c can keep calling _mm_empty (). */
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pmaddhw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pmulhuh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pmullh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("or %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("packushb %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("packsswh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
{
	if (__builtin_constant_p (__w3) &&
	    __builtin_constant_p (__w2) &&
	    __builtin_constant_p (__w1) &&
	    __builtin_constant_p (__w0))
	{
		uint64_t val = ((uint64_t)__w3 << 48)
			     | ((uint64_t)__w2 << 32)
			     | ((uint64_t)__w1 << 16)
			     | ((uint64_t)__w0 <<  0);
		return *(__m64 *)&val;
	}
	else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
	{
		/* TODO: handle other cases */
		uint64_t val = __w3;
		uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
		__m64 ret;
		asm("pshufh %0, %1, %2\n\t"
		    : "=f" (ret)
		    : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
		);
		return ret;
	}
	uint64_t val = ((uint64_t)__w3 << 48)
		     | ((uint64_t)__w2 << 32)
		     | ((uint64_t)__w1 << 16)
		     | ((uint64_t)__w0 <<  0);
	return *(__m64 *)&val;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (unsigned __i1, unsigned __i0)
{
	if (__builtin_constant_p (__i1) &&
	    __builtin_constant_p (__i0))
	{
		uint64_t val = ((uint64_t)__i1 << 32)
			     | ((uint64_t)__i0 <<  0);
		return *(__m64 *)&val;
	}
	else if (__i1 == __i0)
	{
		uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
		__m64 ret;
		asm("pshufh %0, %1, %2\n\t"
		    : "=f" (ret)
		    : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
		);
		return ret;
	}
	uint64_t val = ((uint64_t)__i1 << 32)
		     | ((uint64_t)__i0 <<  0);
	return *(__m64 *)&val;
}
#undef _MM_SHUFFLE
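
/* Hedged usage sketch (not part of pixman's interface): with compile-time
 * constant lanes _mm_set_pi16 folds to a single 64-bit literal, and when the
 * four lanes are merely equal at run time it broadcasts the low halfword with
 * a single pshufh.  The helper name below is hypothetical and only
 * illustrates the intended call pattern. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
example_splat_alpha (uint16_t __a)
{
	/* all four lanes equal, so this should take the pshufh broadcast path */
	return _mm_set_pi16 (__a, __a, __a, __a);
}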

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
	__m64 ret;
	asm("pshufh %0, %1, %2\n\t"
	    : "=f" (ret)
	    : "f" (__m), "f" (*(__m64 *)&__n)
	);
	return ret;
}
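
/* Hedged example: unlike the x86 intrinsic, the selector here is an ordinary
 * 64-bit integer that the wrapper moves into a floating-point register rather
 * than an instruction immediate.  0x1b encodes _MM_SHUFFLE(0, 1, 2, 3), i.e.
 * the four halfword lanes reversed.  example_reverse_lanes is a hypothetical
 * helper, not something pixman-mmx.c uses. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
example_reverse_lanes (__m64 __m)
{
	return _mm_shuffle_pi16 (__m, 0x1b);
}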

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("psllh  %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("dsll  %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("psrlh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("psrlw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("dsrl  %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("psubh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpckhbh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpckhhw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpcklbh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32 datatype,
 * which allows load8888 to use 32-bit loads */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpcklbh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}
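
/* Hedged sketch of the 32-bit load path mentioned above: a load8888-style
 * helper (the real one lives in pixman-mmx.c) can read one packed 8888 pixel
 * through __m32 and widen it to four 16-bit lanes without a 64-bit load.
 * example_load8888 and its argument are illustrative assumptions only. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
example_load8888 (const uint32_t *__p)
{
	return _mm_unpacklo_pi8_f (*(const __m32 *)__p, _mm_setzero_si64 ());
}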

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpcklhw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("xor %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_extract_pi16 (__m64 __m, int64_t __pos)
{
	__m64 ret;
	asm("pextrh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__pos)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
{
	__m64 ret;
	asm("pinsrh_%3 %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2), "i" (__pos)
	);
	return ret;
}

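/* Hedged example of the extract/insert pair: loongson_extract_pi16 returns
 * the selected halfword zero-extended in a floating-point register, while
 * loongson_insert_pi16 needs a compile-time constant position because
 * pinsrh_%3 pastes it into the mnemonic.  example_copy_low_lane_to_top is a
 * hypothetical helper, not pixman API. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
example_copy_low_lane_to_top (__m64 __m)
{
	/* read lane 0, then reinsert its value as lane 3 of the same vector */
	__m64 lane = loongson_extract_pi16 (__m, 0);
	return loongson_insert_pi16 (__m, lane, 3);
}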