1/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
12 *
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
20 *
21 *===-----------------------------------------------------------------------===
22 */
23
24#ifndef __TMMINTRIN_H
25#define __TMMINTRIN_H
26
27#include <pmmintrin.h>
28
/* Attributes shared by every intrinsic defined in this header: always inline
 * the wrapper, emit no debug info for it, and compile its body with the
 * "ssse3" target feature enabled. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
31
32/// \brief Computes the absolute value of each of the packed 8-bit signed
33///    integers in the source operand and stores the 8-bit unsigned integer
34///    results in the destination.
35///
36/// \headerfile <x86intrin.h>
37///
38/// This intrinsic corresponds to the \c PABSB instruction.
39///
40/// \param __a
41///    A 64-bit vector of [8 x i8].
42/// \returns A 64-bit integer vector containing the absolute values of the
43///    elements in the operand.
44static __inline__ __m64 __DEFAULT_FN_ATTRS
45_mm_abs_pi8(__m64 __a)
46{
47    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
48}
49
50/// \brief Computes the absolute value of each of the packed 8-bit signed
51///    integers in the source operand and stores the 8-bit unsigned integer
52///    results in the destination.
53///
54/// \headerfile <x86intrin.h>
55///
56/// This intrinsic corresponds to the \c VPABSB instruction.
57///
58/// \param __a
59///    A 128-bit vector of [16 x i8].
60/// \returns A 128-bit integer vector containing the absolute values of the
61///    elements in the operand.
62static __inline__ __m128i __DEFAULT_FN_ATTRS
63_mm_abs_epi8(__m128i __a)
64{
65    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
66}
67
68/// \brief Computes the absolute value of each of the packed 16-bit signed
69///    integers in the source operand and stores the 16-bit unsigned integer
70///    results in the destination.
71///
72/// \headerfile <x86intrin.h>
73///
74/// This intrinsic corresponds to the \c PABSW instruction.
75///
76/// \param __a
77///    A 64-bit vector of [4 x i16].
78/// \returns A 64-bit integer vector containing the absolute values of the
79///    elements in the operand.
80static __inline__ __m64 __DEFAULT_FN_ATTRS
81_mm_abs_pi16(__m64 __a)
82{
83    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
84}
85
86/// \brief Computes the absolute value of each of the packed 16-bit signed
87///    integers in the source operand and stores the 16-bit unsigned integer
88///    results in the destination.
89///
90/// \headerfile <x86intrin.h>
91///
92/// This intrinsic corresponds to the \c VPABSW instruction.
93///
94/// \param __a
95///    A 128-bit vector of [8 x i16].
96/// \returns A 128-bit integer vector containing the absolute values of the
97///    elements in the operand.
98static __inline__ __m128i __DEFAULT_FN_ATTRS
99_mm_abs_epi16(__m128i __a)
100{
101    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
102}
103
104/// \brief Computes the absolute value of each of the packed 32-bit signed
105///    integers in the source operand and stores the 32-bit unsigned integer
106///    results in the destination.
107///
108/// \headerfile <x86intrin.h>
109///
110/// This intrinsic corresponds to the \c PABSD instruction.
111///
112/// \param __a
113///    A 64-bit vector of [2 x i32].
114/// \returns A 64-bit integer vector containing the absolute values of the
115///    elements in the operand.
116static __inline__ __m64 __DEFAULT_FN_ATTRS
117_mm_abs_pi32(__m64 __a)
118{
119    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
120}
121
122/// \brief Computes the absolute value of each of the packed 32-bit signed
123///    integers in the source operand and stores the 32-bit unsigned integer
124///    results in the destination.
125///
126/// \headerfile <x86intrin.h>
127///
128/// This intrinsic corresponds to the \c VPABSD instruction.
129///
130/// \param __a
131///    A 128-bit vector of [4 x i32].
132/// \returns A 128-bit integer vector containing the absolute values of the
133///    elements in the operand.
134static __inline__ __m128i __DEFAULT_FN_ATTRS
135_mm_abs_epi32(__m128i __a)
136{
137    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
138}
139
/// \brief Concatenates the two 128-bit integer vector operands, and
///    right-shifts the result by the number of bytes specified in the immediate
///    operand.
///
///    This is a macro rather than a function because \a n is passed to the
///    builtin as an immediate. The expansion is a single parenthesized
///    expression (no statement expression), so it can appear anywhere an
///    ordinary expression is allowed.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param b
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_epi8(a, b, n) \
  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n)))
163
/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
///    the result by the number of bytes specified in the immediate operand.
///
///    This is a macro rather than a function because \a n is passed to the
///    builtin as an immediate. The expansion is a single parenthesized
///    expression (no statement expression), so it can appear anywhere an
///    ordinary expression is allowed.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param b
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
185
186/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
187///    128-bit vectors of [8 x i16].
188///
189/// \headerfile <x86intrin.h>
190///
191/// This intrinsic corresponds to the \c VPHADDW instruction.
192///
193/// \param __a
194///    A 128-bit vector of [8 x i16] containing one of the source operands. The
195///    horizontal sums of the values are stored in the lower bits of the
196///    destination.
197/// \param __b
198///    A 128-bit vector of [8 x i16] containing one of the source operands. The
199///    horizontal sums of the values are stored in the upper bits of the
200///    destination.
201/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
202///    both operands.
203static __inline__ __m128i __DEFAULT_FN_ATTRS
204_mm_hadd_epi16(__m128i __a, __m128i __b)
205{
206    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
207}
208
209/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
210///    128-bit vectors of [4 x i32].
211///
212/// \headerfile <x86intrin.h>
213///
214/// This intrinsic corresponds to the \c VPHADDD instruction.
215///
216/// \param __a
217///    A 128-bit vector of [4 x i32] containing one of the source operands. The
218///    horizontal sums of the values are stored in the lower bits of the
219///    destination.
220/// \param __b
221///    A 128-bit vector of [4 x i32] containing one of the source operands. The
222///    horizontal sums of the values are stored in the upper bits of the
223///    destination.
224/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
225///    both operands.
226static __inline__ __m128i __DEFAULT_FN_ATTRS
227_mm_hadd_epi32(__m128i __a, __m128i __b)
228{
229    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
230}
231
232/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
233///    64-bit vectors of [4 x i16].
234///
235/// \headerfile <x86intrin.h>
236///
237/// This intrinsic corresponds to the \c PHADDW instruction.
238///
239/// \param __a
240///    A 64-bit vector of [4 x i16] containing one of the source operands. The
241///    horizontal sums of the values are stored in the lower bits of the
242///    destination.
243/// \param __b
244///    A 64-bit vector of [4 x i16] containing one of the source operands. The
245///    horizontal sums of the values are stored in the upper bits of the
246///    destination.
247/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
248///    operands.
249static __inline__ __m64 __DEFAULT_FN_ATTRS
250_mm_hadd_pi16(__m64 __a, __m64 __b)
251{
252    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
253}
254
255/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
256///    64-bit vectors of [2 x i32].
257///
258/// \headerfile <x86intrin.h>
259///
260/// This intrinsic corresponds to the \c PHADDD instruction.
261///
262/// \param __a
263///    A 64-bit vector of [2 x i32] containing one of the source operands. The
264///    horizontal sums of the values are stored in the lower bits of the
265///    destination.
266/// \param __b
267///    A 64-bit vector of [2 x i32] containing one of the source operands. The
268///    horizontal sums of the values are stored in the upper bits of the
269///    destination.
270/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
271///    operands.
272static __inline__ __m64 __DEFAULT_FN_ATTRS
273_mm_hadd_pi32(__m64 __a, __m64 __b)
274{
275    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
276}
277
278/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
279///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
280///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
281///
282/// \headerfile <x86intrin.h>
283///
284/// This intrinsic corresponds to the \c VPHADDSW instruction.
285///
286/// \param __a
287///    A 128-bit vector of [8 x i16] containing one of the source operands. The
288///    horizontal sums of the values are stored in the lower bits of the
289///    destination.
290/// \param __b
291///    A 128-bit vector of [8 x i16] containing one of the source operands. The
292///    horizontal sums of the values are stored in the upper bits of the
293///    destination.
294/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
295///    sums of both operands.
296static __inline__ __m128i __DEFAULT_FN_ATTRS
297_mm_hadds_epi16(__m128i __a, __m128i __b)
298{
299    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
300}
301
302/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
303///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
304///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
305///
306/// \headerfile <x86intrin.h>
307///
308/// This intrinsic corresponds to the \c PHADDSW instruction.
309///
310/// \param __a
311///    A 64-bit vector of [4 x i16] containing one of the source operands. The
312///    horizontal sums of the values are stored in the lower bits of the
313///    destination.
314/// \param __b
315///    A 64-bit vector of [4 x i16] containing one of the source operands. The
316///    horizontal sums of the values are stored in the upper bits of the
317///    destination.
318/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
319///    sums of both operands.
320static __inline__ __m64 __DEFAULT_FN_ATTRS
321_mm_hadds_pi16(__m64 __a, __m64 __b)
322{
323    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
324}
325
326/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
327///    packed 128-bit vectors of [8 x i16].
328///
329/// \headerfile <x86intrin.h>
330///
331/// This intrinsic corresponds to the \c VPHSUBW instruction.
332///
333/// \param __a
334///    A 128-bit vector of [8 x i16] containing one of the source operands. The
335///    horizontal differences between the values are stored in the lower bits of
336///    the destination.
337/// \param __b
338///    A 128-bit vector of [8 x i16] containing one of the source operands. The
339///    horizontal differences between the values are stored in the upper bits of
340///    the destination.
341/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
342///    of both operands.
343static __inline__ __m128i __DEFAULT_FN_ATTRS
344_mm_hsub_epi16(__m128i __a, __m128i __b)
345{
346    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
347}
348
349/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
350///    packed 128-bit vectors of [4 x i32].
351///
352/// \headerfile <x86intrin.h>
353///
354/// This intrinsic corresponds to the \c VPHSUBD instruction.
355///
356/// \param __a
357///    A 128-bit vector of [4 x i32] containing one of the source operands. The
358///    horizontal differences between the values are stored in the lower bits of
359///    the destination.
360/// \param __b
361///    A 128-bit vector of [4 x i32] containing one of the source operands. The
362///    horizontal differences between the values are stored in the upper bits of
363///    the destination.
364/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
365///    of both operands.
366static __inline__ __m128i __DEFAULT_FN_ATTRS
367_mm_hsub_epi32(__m128i __a, __m128i __b)
368{
369    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
370}
371
372/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
373///    packed 64-bit vectors of [4 x i16].
374///
375/// \headerfile <x86intrin.h>
376///
377/// This intrinsic corresponds to the \c PHSUBW instruction.
378///
379/// \param __a
380///    A 64-bit vector of [4 x i16] containing one of the source operands. The
381///    horizontal differences between the values are stored in the lower bits of
382///    the destination.
383/// \param __b
384///    A 64-bit vector of [4 x i16] containing one of the source operands. The
385///    horizontal differences between the values are stored in the upper bits of
386///    the destination.
387/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
388///    of both operands.
389static __inline__ __m64 __DEFAULT_FN_ATTRS
390_mm_hsub_pi16(__m64 __a, __m64 __b)
391{
392    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
393}
394
395/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
396///    packed 64-bit vectors of [2 x i32].
397///
398/// \headerfile <x86intrin.h>
399///
400/// This intrinsic corresponds to the \c PHSUBD instruction.
401///
402/// \param __a
403///    A 64-bit vector of [2 x i32] containing one of the source operands. The
404///    horizontal differences between the values are stored in the lower bits of
405///    the destination.
406/// \param __b
407///    A 64-bit vector of [2 x i32] containing one of the source operands. The
408///    horizontal differences between the values are stored in the upper bits of
409///    the destination.
410/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
411///    of both operands.
412static __inline__ __m64 __DEFAULT_FN_ATTRS
413_mm_hsub_pi32(__m64 __a, __m64 __b)
414{
415    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
416}
417
418/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
419///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
420///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
421///    saturated to 8000h.
422///
423/// \headerfile <x86intrin.h>
424///
425/// This intrinsic corresponds to the \c VPHSUBSW instruction.
426///
427/// \param __a
428///    A 128-bit vector of [8 x i16] containing one of the source operands. The
429///    horizontal differences between the values are stored in the lower bits of
430///    the destination.
431/// \param __b
432///    A 128-bit vector of [8 x i16] containing one of the source operands. The
433///    horizontal differences between the values are stored in the upper bits of
434///    the destination.
435/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
436///    differences of both operands.
437static __inline__ __m128i __DEFAULT_FN_ATTRS
438_mm_hsubs_epi16(__m128i __a, __m128i __b)
439{
440    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
441}
442
443/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
444///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
445///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
446///    saturated to 8000h.
447///
448/// \headerfile <x86intrin.h>
449///
450/// This intrinsic corresponds to the \c PHSUBSW instruction.
451///
452/// \param __a
453///    A 64-bit vector of [4 x i16] containing one of the source operands. The
454///    horizontal differences between the values are stored in the lower bits of
455///    the destination.
456/// \param __b
457///    A 64-bit vector of [4 x i16] containing one of the source operands. The
458///    horizontal differences between the values are stored in the upper bits of
459///    the destination.
460/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
461///    differences of both operands.
462static __inline__ __m64 __DEFAULT_FN_ATTRS
463_mm_hsubs_pi16(__m64 __a, __m64 __b)
464{
465    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
466}
467
468/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
469///    values contained in the first source operand and packed 8-bit signed
470///    integer values contained in the second source operand, adds pairs of
471///    contiguous products with signed saturation, and writes the 16-bit sums to
472///    the corresponding bits in the destination. For example, bits [7:0] of
473///    both operands are multiplied, bits [15:8] of both operands are
474///    multiplied, and the sum of both results is written to bits [15:0] of the
475///    destination.
476///
477/// \headerfile <x86intrin.h>
478///
479/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
480///
481/// \param __a
482///    A 128-bit integer vector containing the first source operand.
483/// \param __b
484///    A 128-bit integer vector containing the second source operand.
485/// \returns A 128-bit integer vector containing the sums of products of both
486///    operands:
487///    R0 := (__a0 * __b0) + (__a1 * __b1)
488///    R1 := (__a2 * __b2) + (__a3 * __b3)
489///    R2 := (__a4 * __b4) + (__a5 * __b5)
490///    R3 := (__a6 * __b6) + (__a7 * __b7)
491///    R4 := (__a8 * __b8) + (__a9 * __b9)
492///    R5 := (__a10 * __b10) + (__a11 * __b11)
493///    R6 := (__a12 * __b12) + (__a13 * __b13)
494///    R7 := (__a14 * __b14) + (__a15 * __b15)
495static __inline__ __m128i __DEFAULT_FN_ATTRS
496_mm_maddubs_epi16(__m128i __a, __m128i __b)
497{
498    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
499}
500
501/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
502///    values contained in the first source operand and packed 8-bit signed
503///    integer values contained in the second source operand, adds pairs of
504///    contiguous products with signed saturation, and writes the 16-bit sums to
505///    the corresponding bits in the destination. For example, bits [7:0] of
506///    both operands are multiplied, bits [15:8] of both operands are
507///    multiplied, and the sum of both results is written to bits [15:0] of the
508///    destination.
509///
510/// \headerfile <x86intrin.h>
511///
512/// This intrinsic corresponds to the \c PMADDUBSW instruction.
513///
514/// \param __a
515///    A 64-bit integer vector containing the first source operand.
516/// \param __b
517///    A 64-bit integer vector containing the second source operand.
518/// \returns A 64-bit integer vector containing the sums of products of both
519///    operands:
520///    R0 := (__a0 * __b0) + (__a1 * __b1)
521///    R1 := (__a2 * __b2) + (__a3 * __b3)
522///    R2 := (__a4 * __b4) + (__a5 * __b5)
523///    R3 := (__a6 * __b6) + (__a7 * __b7)
524static __inline__ __m64 __DEFAULT_FN_ATTRS
525_mm_maddubs_pi16(__m64 __a, __m64 __b)
526{
527    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
528}
529
530/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
531///    products to the 18 most significant bits by right-shifting, rounds the
532///    truncated value by adding 1, and writes bits [16:1] to the destination.
533///
534/// \headerfile <x86intrin.h>
535///
536/// This intrinsic corresponds to the \c VPMULHRSW instruction.
537///
538/// \param __a
539///    A 128-bit vector of [8 x i16] containing one of the source operands.
540/// \param __b
541///    A 128-bit vector of [8 x i16] containing one of the source operands.
542/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
543///    products of both operands.
544static __inline__ __m128i __DEFAULT_FN_ATTRS
545_mm_mulhrs_epi16(__m128i __a, __m128i __b)
546{
547    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
548}
549
550/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
551///    products to the 18 most significant bits by right-shifting, rounds the
552///    truncated value by adding 1, and writes bits [16:1] to the destination.
553///
554/// \headerfile <x86intrin.h>
555///
556/// This intrinsic corresponds to the \c PMULHRSW instruction.
557///
558/// \param __a
559///    A 64-bit vector of [4 x i16] containing one of the source operands.
560/// \param __b
561///    A 64-bit vector of [4 x i16] containing one of the source operands.
562/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
563///    products of both operands.
564static __inline__ __m64 __DEFAULT_FN_ATTRS
565_mm_mulhrs_pi16(__m64 __a, __m64 __b)
566{
567    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
568}
569
570/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
571///    destination or clears 8-bit values in the destination, as specified by
572///    the second source operand.
573///
574/// \headerfile <x86intrin.h>
575///
576/// This intrinsic corresponds to the \c VPSHUFB instruction.
577///
578/// \param __a
579///    A 128-bit integer vector containing the values to be copied.
580/// \param __b
581///    A 128-bit integer vector containing control bytes corresponding to
582///    positions in the destination:
583///    Bit 7:
584///    1: Clear the corresponding byte in the destination.
585///    0: Copy the selected source byte to the corresponding byte in the
586///    destination.
587///    Bits [6:4] Reserved.
588///    Bits [3:0] select the source byte to be copied.
589/// \returns A 128-bit integer vector containing the copied or cleared values.
590static __inline__ __m128i __DEFAULT_FN_ATTRS
591_mm_shuffle_epi8(__m128i __a, __m128i __b)
592{
593    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
594}
595
596/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
597///    destination or clears 8-bit values in the destination, as specified by
598///    the second source operand.
599///
600/// \headerfile <x86intrin.h>
601///
602/// This intrinsic corresponds to the \c PSHUFB instruction.
603///
604/// \param __a
605///    A 64-bit integer vector containing the values to be copied.
606/// \param __b
607///    A 64-bit integer vector containing control bytes corresponding to
608///    positions in the destination:
609///    Bit 7:
610///    1: Clear the corresponding byte in the destination.
611///    0: Copy the selected source byte to the corresponding byte in the
612///    destination.
613///    Bits [3:0] select the source byte to be copied.
614/// \returns A 64-bit integer vector containing the copied or cleared values.
615static __inline__ __m64 __DEFAULT_FN_ATTRS
616_mm_shuffle_pi8(__m64 __a, __m64 __b)
617{
618    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
619}
620
621/// \brief For each 8-bit integer in the first source operand, perform one of
622///    the following actions as specified by the second source operand: If the
623///    byte in the second source is negative, calculate the two's complement of
624///    the corresponding byte in the first source, and write that value to the
625///    destination. If the byte in the second source is positive, copy the
626///    corresponding byte from the first source to the destination. If the byte
627///    in the second source is zero, clear the corresponding byte in the
628///    destination.
629///
630/// \headerfile <x86intrin.h>
631///
632/// This intrinsic corresponds to the \c VPSIGNB instruction.
633///
634/// \param __a
635///    A 128-bit integer vector containing the values to be copied.
636/// \param __b
637///    A 128-bit integer vector containing control bytes corresponding to
638///    positions in the destination.
639/// \returns A 128-bit integer vector containing the resultant values.
640static __inline__ __m128i __DEFAULT_FN_ATTRS
641_mm_sign_epi8(__m128i __a, __m128i __b)
642{
643    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
644}
645
646/// \brief For each 16-bit integer in the first source operand, perform one of
647///    the following actions as specified by the second source operand: If the
648///    word in the second source is negative, calculate the two's complement of
649///    the corresponding word in the first source, and write that value to the
650///    destination. If the word in the second source is positive, copy the
651///    corresponding word from the first source to the destination. If the word
652///    in the second source is zero, clear the corresponding word in the
653///    destination.
654///
655/// \headerfile <x86intrin.h>
656///
657/// This intrinsic corresponds to the \c VPSIGNW instruction.
658///
659/// \param __a
660///    A 128-bit integer vector containing the values to be copied.
661/// \param __b
662///    A 128-bit integer vector containing control words corresponding to
663///    positions in the destination.
664/// \returns A 128-bit integer vector containing the resultant values.
665static __inline__ __m128i __DEFAULT_FN_ATTRS
666_mm_sign_epi16(__m128i __a, __m128i __b)
667{
668    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
669}
670
671/// \brief For each 32-bit integer in the first source operand, perform one of
672///    the following actions as specified by the second source operand: If the
673///    doubleword in the second source is negative, calculate the two's
674///    complement of the corresponding word in the first source, and write that
675///    value to the destination. If the doubleword in the second source is
676///    positive, copy the corresponding word from the first source to the
677///    destination. If the doubleword in the second source is zero, clear the
678///    corresponding word in the destination.
679///
680/// \headerfile <x86intrin.h>
681///
682/// This intrinsic corresponds to the \c VPSIGND instruction.
683///
684/// \param __a
685///    A 128-bit integer vector containing the values to be copied.
686/// \param __b
687///    A 128-bit integer vector containing control doublewords corresponding to
688///    positions in the destination.
689/// \returns A 128-bit integer vector containing the resultant values.
690static __inline__ __m128i __DEFAULT_FN_ATTRS
691_mm_sign_epi32(__m128i __a, __m128i __b)
692{
693    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
694}
695
696/// \brief For each 8-bit integer in the first source operand, perform one of
697///    the following actions as specified by the second source operand: If the
698///    byte in the second source is negative, calculate the two's complement of
699///    the corresponding byte in the first source, and write that value to the
700///    destination. If the byte in the second source is positive, copy the
701///    corresponding byte from the first source to the destination. If the byte
702///    in the second source is zero, clear the corresponding byte in the
703///    destination.
704///
705/// \headerfile <x86intrin.h>
706///
707/// This intrinsic corresponds to the \c PSIGNB instruction.
708///
709/// \param __a
710///    A 64-bit integer vector containing the values to be copied.
711/// \param __b
712///    A 64-bit integer vector containing control bytes corresponding to
713///    positions in the destination.
714/// \returns A 64-bit integer vector containing the resultant values.
715static __inline__ __m64 __DEFAULT_FN_ATTRS
716_mm_sign_pi8(__m64 __a, __m64 __b)
717{
718    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
719}
720
721/// \brief For each 16-bit integer in the first source operand, perform one of
722///    the following actions as specified by the second source operand: If the
723///    word in the second source is negative, calculate the two's complement of
724///    the corresponding word in the first source, and write that value to the
725///    destination. If the word in the second source is positive, copy the
726///    corresponding word from the first source to the destination. If the word
727///    in the second source is zero, clear the corresponding word in the
728///    destination.
729///
730/// \headerfile <x86intrin.h>
731///
732/// This intrinsic corresponds to the \c PSIGNW instruction.
733///
734/// \param __a
735///    A 64-bit integer vector containing the values to be copied.
736/// \param __b
737///    A 64-bit integer vector containing control words corresponding to
738///    positions in the destination.
739/// \returns A 64-bit integer vector containing the resultant values.
740static __inline__ __m64 __DEFAULT_FN_ATTRS
741_mm_sign_pi16(__m64 __a, __m64 __b)
742{
743    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
744}
745
746/// \brief For each 32-bit integer in the first source operand, perform one of
747///    the following actions as specified by the second source operand: If the
748///    doubleword in the second source is negative, calculate the two's
749///    complement of the corresponding doubleword in the first source, and
750///    write that value to the destination. If the doubleword in the second
751///    source is positive, copy the corresponding doubleword from the first
752///    source to the destination. If the doubleword in the second source is
753///    zero, clear the corresponding doubleword in the destination.
754///
755/// \headerfile <x86intrin.h>
756///
757/// This intrinsic corresponds to the \c PSIGND instruction.
758///
759/// \param __a
760///    A 64-bit integer vector containing the values to be copied.
761/// \param __b
762///    A 64-bit integer vector containing two control doublewords corresponding
763///    to positions in the destination.
764/// \returns A 64-bit integer vector containing the resultant values.
765static __inline__ __m64 __DEFAULT_FN_ATTRS
766_mm_sign_pi32(__m64 __a, __m64 __b)
767{
768    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
769}
770
771#undef __DEFAULT_FN_ATTRS
772
773#endif /* __TMMINTRIN_H */
774