1
/* A program to test SSE4.1/SSE4.2 instructions.
   Revisions:  Nov.2008    - wrote this file
               Apr.10.2010 - added PEXTR* tests
               Apr.16.2010 - added PINS*  tests
*/
7
8/* HOW TO COMPILE:
9   gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c
10*/
11
12#include <stdio.h>
13#include <stdlib.h>
14#include <assert.h>
15#include "tests/malloc.h"
16#include <string.h>
17
18
/* Basic scalar aliases used throughout this file. */
typedef  unsigned char           UChar;
typedef  signed int              Int;
typedef  unsigned int            UInt;
typedef  unsigned long long int  ULong;

/* A 128-bit vector value, held as 16 raw bytes. */
typedef  UChar                   V128[16];

/* Minimal boolean type and its two values. */
typedef  UChar                   Bool;
#define True  ((Bool)1)
#define False ((Bool)0)
28
29
30typedef
31   struct {
32      V128 arg1;
33      V128 arg2;
34      V128 res;
35   }
36   RRArgs;
37
38typedef
39   struct {
40      V128 arg1;
41      V128 res;
42   }
43   RMArgs;
44
45static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo )
46{
47   // try to sidestep strict-aliasing snafus by memcpying explicitly
48   UChar* p = (UChar*)res;
49   memcpy(&p[8], (UChar*)&wHi, 8);
50   memcpy(&p[0], (UChar*)&wLo, 8);
51}
52
53static UChar randUChar ( void )
54{
55   static UInt seed = 80021;
56   seed = 1103515245 * seed + 12345;
57   return (seed >> 17) & 0xFF;
58}
59
60static ULong randULong ( void )
61{
62   Int i;
63   ULong r = 0;
64   for (i = 0; i < 8; i++) {
65      r = (r << 8) | (ULong)(0xFF & randUChar());
66   }
67   return r;
68}
69
70static void randV128 ( V128* v )
71{
72   Int i;
73   for (i = 0; i < 16; i++)
74      (*v)[i] = randUChar();
75}
76
77static void showV128 ( V128* v )
78{
79   Int i;
80   for (i = 15; i >= 0; i--)
81      printf("%02x", (Int)(*v)[i]);
82}
83
84static void showMaskedV128 ( V128* v, V128* mask )
85{
86   Int i;
87   for (i = 15; i >= 0; i--)
88      printf("%02x", (Int)( ((*v)[i]) & ((*mask)[i]) ));
89}
90
91static void showIGVV( char* rOrM, char* op, Int imm,
92                      ULong src64, V128* dst, V128* res )
93{
94   printf("%s %10s $%d ", rOrM, op, imm);
95   printf("%016llx", src64);
96   printf(" ");
97   showV128(dst);
98   printf(" ");
99   showV128(res);
100   printf("\n");
101}
102
103static void showIAG ( char* rOrM, char* op, Int imm,
104                      V128* argL, ULong argR, ULong res )
105{
106   printf("%s %10s $%d ", rOrM, op, imm);
107   showV128(argL);
108   printf(" ");
109   printf("%016llx", argR);
110   printf(" ");
111   printf("%016llx", res);
112   printf("\n");
113}
114
115static void showIAA ( char* rOrM, char* op, Int imm, RRArgs* rra, V128* rmask )
116{
117   printf("%s %10s $%d ", rOrM, op, imm);
118   showV128(&rra->arg1);
119   printf(" ");
120   showV128(&rra->arg2);
121   printf(" ");
122   showMaskedV128(&rra->res, rmask);
123   printf("\n");
124}
125
126static void showAA ( char* rOrM, char* op, RRArgs* rra, V128* rmask )
127{
128   printf("%s %10s ", rOrM, op);
129   showV128(&rra->arg1);
130   printf(" ");
131   showV128(&rra->arg2);
132   printf(" ");
133   showMaskedV128(&rra->res, rmask);
134   printf("\n");
135}
136
137/* Note: these are little endian.  Hence first byte is the least
138   significant byte of lane zero. */
139
140/* Mask for insns where all result bits are non-approximated. */
141static V128 AllMask  = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
142                         0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
143
144/* Mark for insns which produce approximated vector short results. */
145__attribute__((unused))
146static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF,
147                         0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF };
148
149/* Mark for insns which produce approximated scalar short results. */
150__attribute__((unused))
151static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF,
152                         0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
153
154static V128 fives    = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55,
155                         0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 };
156
157static V128 zeroes   = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
158                         0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
159
/* Returns the result of 1.0 / 0.0, i.e. positive infinity. */
double mkPosInf ( void )
{
   return 1.0 / 0.0;
}

/* Returns the negation of mkPosInf(). */
double mkNegInf ( void )
{
   double posInf = mkPosInf();
   return -posInf;
}

/* Returns the NaN produced by evaluating 0.0 / 0.0. */
double mkPosNan ( void )
{
   return 0.0 / 0.0;
}

/* Returns mkPosNan() with unary minus applied. */
double mkNegNan ( void )
{
   double nan = mkPosNan();
   return -nan;
}
164
165__attribute__((noinline))
166UInt get_mxcsr ( void )
167{
168   ULong w64;
169   __asm__ __volatile__(
170      "subq    $8, %%rsp"    "\n\t"
171      "stmxcsr (%%rsp)"      "\n\t"
172      "movq    (%%rsp), %0"  "\n"
173      "addq    $8, %%rsp"
174      : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc"
175   );
176   if (0) printf("get %08x\n", (UInt)w64);
177   return (UInt)w64;
178}
179
180__attribute__((noinline))
181void set_mxcsr ( UInt w32 )
182{
183   if (0) printf("set %08x\n", w32);
184   ULong w64 = (ULong)w32;
185   __asm__ __volatile__(
186      "subq    $8, %%rsp"    "\n\t"
187      "movq    %0, (%%rsp)"  "\n\t"
188      "ldmxcsr (%%rsp)"      "\n\t"
189      "addq    $8, %%rsp"
190      : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc"
191   );
192}
193
194UInt get_sse_roundingmode ( void )
195{
196   UInt w = get_mxcsr();
197   return (w >> 13) & 3;
198}
199
200void set_sse_roundingmode ( UInt m )
201{
202   UInt w;
203   assert(0 == (m & ~3));
204   w = get_mxcsr();
205   w &= ~(3 << 13);
206   w |= (m << 13);
207   set_mxcsr(w);
208}
209
210
/* Test "_opname $_imm, %xmm2, %xmm11" (register source form).
   _src is loaded into xmm2 and _dst into xmm11, the instruction is
   executed, and xmm11 is stored back as the result.  Source, original
   destination and result are then printed via showIAA with tag "r"
   and no result bits masked.  _imm must be a literal, since it is
   pasted into the asm text. */
#define DO_imm_r_r(_opname, _imm, _src, _dst)  \
   {  \
      RRArgs _rr;  \
      memcpy(&_rr.arg1, &(_src), sizeof(V128));  \
      memcpy(&_rr.arg2, &(_dst), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " $" #_imm ", %%xmm2, %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/  \
         : /*in*/ "r"(&_rr.arg1), "r"(&_rr.arg2), "r"(&_rr.res)  \
         : "cc", "memory", "xmm2", "xmm11"  \
      );  \
      showIAA("r", (_opname), (_imm), &_rr, &AllMask);  \
   }
228
/* Test "_opname $_imm, (mem), %xmm11" (memory source form).
   _src is first copied into a buffer obtained from memalign16, which
   then serves as the memory operand; _dst is loaded into xmm11.
   After the instruction, xmm11 is stored back as the result and the
   line is printed via showIAA with tag "m".  The buffer is freed
   before leaving.  _imm must be a literal, since it is pasted into
   the asm text. */
#define DO_imm_m_r(_opname, _imm, _src, _dst)  \
   {  \
      RRArgs _rr;  \
      V128*  _memSrc = memalign16(sizeof(V128));  \
      memcpy(_memSrc,   &(_src), sizeof(V128));  \
      memcpy(&_rr.arg1, &(_src), sizeof(V128));  \
      memcpy(&_rr.arg2, &(_dst), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " $" #_imm ", (%0), %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/  \
         : /*in*/ "r"(_memSrc), "r"(&_rr.arg2), "r"(&_rr.res)  \
         : "cc", "memory", "xmm11"  \
      );  \
      showIAA("m", (_opname), (_imm), &_rr, &AllMask);  \
      free(_memSrc);  \
   }
248
/* Run an immediate-operand test in both forms: first with a register
   source (DO_imm_r_r), then with a memory source (DO_imm_m_r).
   The two halves are enclosed in a single compound statement so the
   macro acts as one statement; without the outer braces only the
   first half would be governed by an unbraced if/for/while. */
#define DO_imm_mandr_r(_opname, _imm, _src, _dst)  \
   {  \
      DO_imm_r_r( _opname, _imm, _src, _dst )  \
      DO_imm_m_r( _opname, _imm, _src, _dst )  \
   }
252
253
254
255
256
/* Test "_opname %xmm2, %xmm11" (register source form, no immediate).
   _src is loaded into xmm2 and _dst into xmm11, the instruction is
   executed, and xmm11 is stored back as the result.  Source, original
   destination and result are printed via showAA with tag "r" and no
   result bits masked. */
#define DO_r_r(_opname, _src, _dst)  \
   {  \
      RRArgs _rr;  \
      memcpy(&_rr.arg1, &(_src), sizeof(V128));  \
      memcpy(&_rr.arg2, &(_dst), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " %%xmm2, %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/  \
         : /*in*/ "r"(&_rr.arg1), "r"(&_rr.arg2), "r"(&_rr.res)  \
         : "cc", "memory", "xmm2", "xmm11"  \
      );  \
      showAA("r", (_opname), &_rr, &AllMask);  \
   }
274
/* Test "_opname (mem), %xmm11" (memory source form, no immediate).
   _src is first copied into a buffer obtained from memalign16, which
   then serves as the memory operand; _dst is loaded into xmm11.
   After the instruction, xmm11 is stored back as the result and the
   line is printed via showAA with tag "m".  The buffer is freed
   before leaving. */
#define DO_m_r(_opname, _src, _dst)  \
   {  \
      RRArgs _rr;  \
      V128*  _memSrc = memalign16(sizeof(V128));  \
      memcpy(_memSrc,   &(_src), sizeof(V128));  \
      memcpy(&_rr.arg1, &(_src), sizeof(V128));  \
      memcpy(&_rr.arg2, &(_dst), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " (%0), %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/  \
         : /*in*/ "r"(_memSrc), "r"(&_rr.arg2), "r"(&_rr.res)  \
         : "cc", "memory", "xmm11"  \
      );  \
      showAA("m", (_opname), &_rr, &AllMask);  \
      free(_memSrc);  \
   }
294
/* Run a no-immediate test in both forms: first with a register
   source (DO_r_r), then with a memory source (DO_m_r).
   The two halves are enclosed in a single compound statement so the
   macro acts as one statement; without the outer braces only the
   first half would be governed by an unbraced if/for/while. */
#define DO_mandr_r(_opname, _src, _dst)  \
   {  \
      DO_r_r(_opname, _src, _dst)  \
      DO_m_r(_opname, _src, _dst)  \
   }
298
299
300
301
/* Test "_opname $_imm, %xmm2, %r11<_dstsuffix>": vector source,
   integer-register destination.  _src is loaded into xmm2 and r11 is
   preloaded with 0x5555555555555555, so bits the instruction leaves
   alone remain recognisable.  r11 is written out afterwards and the
   line is printed via showIAG with tag "r".  _dstsuffix is appended
   to the register name (e.g. to select a narrower view of r11);
   _imm must be a literal. */
#define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix)  \
   {  \
      ULong _gprIn  = 0x5555555555555555ULL;  \
      ULong _gprOut = 0xAAAAAAAAAAAAAAAAULL;  \
      /* r11 is named in the clobber list, which should keep the */  \
      /* compiler from choosing it for any of %0, %1, %2. */  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         "movq   (%1), %%r11"   "\n\t"  \
         _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix  "\n\t"  \
         "movq   %%r11, (%2)" "\n"  \
         : /*out*/  \
         : /*in*/ "r"(&(_src)), "r"(&(_gprIn)), "r"(&(_gprOut))  \
         : "cc", "memory", "xmm2", "r11"  \
      );  \
      showIAG("r", (_opname), (_imm), &(_src), (_gprIn), (_gprOut));  \
   }
320
/* Test "_opname $_imm, %xmm2, (mem)": vector source, memory
   destination.  The 64-bit destination slot starts out as
   0x5555555555555555, so bytes the instruction does not write remain
   recognisable.  The line is printed via showIAG with tag "m",
   showing the slot's initial and final values.  _imm must be a
   literal. */
#define DO_imm_r_to_mscalar(_opname, _imm, _src)  \
   {  \
      ULong _memIn  = 0x5555555555555555ULL;  \
      ULong _memOut = _memIn;  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         _opname " $" #_imm ", %%xmm2, (%1)"  "\n\t"  \
         : /*out*/  \
         : /*in*/ "r"(&(_src)), "r"(&(_memOut))  \
         : "cc", "memory", "xmm2"  \
      );  \
      showIAG("m", (_opname), (_imm), &(_src), (_memIn), (_memOut));  \
   }
334
/* Run a vector-to-scalar immediate test in both forms: first with an
   integer-register destination (DO_imm_r_to_rscalar, which takes the
   register-name suffix), then with a memory destination
   (DO_imm_r_to_mscalar).
   The two halves are enclosed in a single compound statement so the
   macro acts as one statement; without the outer braces only the
   first half would be governed by an unbraced if/for/while. */
#define DO_imm_r_to_mandrscalar(_opname, _imm, _src, _dstsuffix)  \
   {  \
      DO_imm_r_to_rscalar( _opname, _imm, _src, _dstsuffix )  \
      DO_imm_r_to_mscalar( _opname, _imm, _src )  \
   }
338
339
340
341
342
343
344
345
/* Test "_opname $_imm, %r11<_srcsuffix>, %xmm2": integer-register
   source, vector destination.  The destination vector starts out as
   the 0x55 pattern ('fives') and the result buffer as all zeroes.
   _src is converted to a 64-bit value and loaded into r11.  After
   the instruction, xmm2 is stored as the result and the line is
   printed via showIGVV with tag "r".  _srcsuffix is appended to the
   register name; _imm must be a literal. */
#define DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix)  \
   {  \
      ULong _gprSrc = (ULong)(_src);  \
      V128  _vecBefore;  \
      V128  _vecAfter;  \
      memcpy(_vecBefore, fives,  sizeof(_vecBefore));  \
      memcpy(_vecAfter,  zeroes, sizeof(_vecAfter));  \
      /* r11 is named in the clobber list, which should keep the */  \
      /* compiler from choosing it for any of %0, %1, %2. */  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"   /* _vecBefore */  \
         "movq   (%1), %%r11"     "\n\t"   /* _gprSrc */  \
         _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2"   "\n\t"  \
         "movupd  %%xmm2, (%2)" "\n"       /* _vecAfter */  \
         : /*out*/  \
         : /*in*/ "r"(&_vecBefore), "r"(&_gprSrc), "r"(&_vecAfter)  \
         : "cc", "memory", "xmm2", "r11"  \
      );  \
      showIGVV("r", (_opname), (_imm), _gprSrc, &_vecBefore, &_vecAfter);  \
   }
/* Test "_opname $_imm, (mem), %xmm2": memory scalar source, vector
   destination.  The destination vector starts out as the 0x55
   pattern ('fives') and the result buffer as all zeroes.  _src is
   converted to a 64-bit value whose storage is the memory operand.
   After the instruction, xmm2 is stored as the result and the line is
   printed via showIGVV with tag "m".  _imm must be a literal. */
#define DO_imm_mscalar_to_r(_opname, _imm, _src)  \
   {  \
      ULong _memSrc = (ULong)(_src);  \
      V128  _vecBefore;  \
      V128  _vecAfter;  \
      memcpy(_vecBefore, fives,  sizeof(_vecBefore));  \
      memcpy(_vecAfter,  zeroes, sizeof(_vecAfter));  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"   /* _vecBefore */  \
         _opname " $" #_imm ", (%1), %%xmm2"   "\n\t"  \
         "movupd  %%xmm2, (%2)" "\n"       /* _vecAfter */  \
         : /*out*/  \
         : /*in*/ "r"(&_vecBefore), "r"(&_memSrc), "r"(&_vecAfter)  \
         : "cc", "memory", "xmm2"  \
      );  \
      showIGVV("m", (_opname), (_imm), _memSrc, &_vecBefore, &_vecAfter);  \
   }
384
/* Run a scalar-to-vector immediate test in both forms: first with an
   integer-register source (DO_imm_rscalar_to_r, which takes the
   source register-name suffix), then with a memory source
   (DO_imm_mscalar_to_r).
   The two halves are enclosed in a single compound statement so the
   macro acts as one statement; without the outer braces only the
   first half would be governed by an unbraced if/for/while. */
#define DO_imm_mandrscalar_to_r(_opname, _imm, _src, _srcsuffix)  \
   {  \
      DO_imm_rscalar_to_r( _opname, _imm, _src, _srcsuffix )  \
      DO_imm_mscalar_to_r( _opname, _imm, _src )  \
   }
388
389
390
391
392
393void test_BLENDPD ( void )
394{
395   V128 src, dst;
396   Int i;
397   for (i = 0; i < 10; i++) {
398      randV128(&src);
399      randV128(&dst);
400      DO_imm_mandr_r("blendpd", 0, src, dst);
401      DO_imm_mandr_r("blendpd", 1, src, dst);
402      DO_imm_mandr_r("blendpd", 2, src, dst);
403      DO_imm_mandr_r("blendpd", 3, src, dst);
404   }
405}
406
407void test_BLENDPS ( void )
408{
409   V128 src, dst;
410   Int i;
411   for (i = 0; i < 10; i++) {
412      randV128(&src);
413      randV128(&dst);
414      DO_imm_mandr_r("blendps", 0, src, dst);
415      DO_imm_mandr_r("blendps", 1, src, dst);
416      DO_imm_mandr_r("blendps", 2, src, dst);
417      DO_imm_mandr_r("blendps", 3, src, dst);
418      DO_imm_mandr_r("blendps", 4, src, dst);
419      DO_imm_mandr_r("blendps", 5, src, dst);
420      DO_imm_mandr_r("blendps", 6, src, dst);
421      DO_imm_mandr_r("blendps", 7, src, dst);
422      DO_imm_mandr_r("blendps", 8, src, dst);
423      DO_imm_mandr_r("blendps", 9, src, dst);
424      DO_imm_mandr_r("blendps", 10, src, dst);
425      DO_imm_mandr_r("blendps", 11, src, dst);
426      DO_imm_mandr_r("blendps", 12, src, dst);
427      DO_imm_mandr_r("blendps", 13, src, dst);
428      DO_imm_mandr_r("blendps", 14, src, dst);
429      DO_imm_mandr_r("blendps", 15, src, dst);
430   }
431}
432
433void test_DPPD ( void )
434{
435   V128 src, dst;
436   {
437      *(double*)(&src[0]) =  1.2345;
438      *(double*)(&src[8]) = -6.78910;
439      *(double*)(&dst[0]) = -11.121314;
440      *(double*)(&dst[8]) =  15.161718;
441      DO_imm_mandr_r("dppd", 0, src, dst);
442      DO_imm_mandr_r("dppd", 1, src, dst);
443      DO_imm_mandr_r("dppd", 2, src, dst);
444      DO_imm_mandr_r("dppd", 3, src, dst);
445      DO_imm_mandr_r("dppd", 4, src, dst);
446      DO_imm_mandr_r("dppd", 5, src, dst);
447      DO_imm_mandr_r("dppd", 6, src, dst);
448      DO_imm_mandr_r("dppd", 7, src, dst);
449      DO_imm_mandr_r("dppd", 8, src, dst);
450      DO_imm_mandr_r("dppd", 9, src, dst);
451      DO_imm_mandr_r("dppd", 10, src, dst);
452      DO_imm_mandr_r("dppd", 11, src, dst);
453      DO_imm_mandr_r("dppd", 12, src, dst);
454      DO_imm_mandr_r("dppd", 13, src, dst);
455      DO_imm_mandr_r("dppd", 14, src, dst);
456      DO_imm_mandr_r("dppd", 15, src, dst);
457      DO_imm_mandr_r("dppd", 16, src, dst);
458      DO_imm_mandr_r("dppd", 17, src, dst);
459      DO_imm_mandr_r("dppd", 18, src, dst);
460      DO_imm_mandr_r("dppd", 19, src, dst);
461      DO_imm_mandr_r("dppd", 20, src, dst);
462      DO_imm_mandr_r("dppd", 21, src, dst);
463      DO_imm_mandr_r("dppd", 22, src, dst);
464      DO_imm_mandr_r("dppd", 23, src, dst);
465      DO_imm_mandr_r("dppd", 24, src, dst);
466      DO_imm_mandr_r("dppd", 25, src, dst);
467      DO_imm_mandr_r("dppd", 26, src, dst);
468      DO_imm_mandr_r("dppd", 27, src, dst);
469      DO_imm_mandr_r("dppd", 28, src, dst);
470      DO_imm_mandr_r("dppd", 29, src, dst);
471      DO_imm_mandr_r("dppd", 30, src, dst);
472      DO_imm_mandr_r("dppd", 31, src, dst);
473      DO_imm_mandr_r("dppd", 32, src, dst);
474      DO_imm_mandr_r("dppd", 33, src, dst);
475      DO_imm_mandr_r("dppd", 34, src, dst);
476      DO_imm_mandr_r("dppd", 35, src, dst);
477      DO_imm_mandr_r("dppd", 36, src, dst);
478      DO_imm_mandr_r("dppd", 37, src, dst);
479      DO_imm_mandr_r("dppd", 38, src, dst);
480      DO_imm_mandr_r("dppd", 39, src, dst);
481      DO_imm_mandr_r("dppd", 40, src, dst);
482      DO_imm_mandr_r("dppd", 41, src, dst);
483      DO_imm_mandr_r("dppd", 42, src, dst);
484      DO_imm_mandr_r("dppd", 43, src, dst);
485      DO_imm_mandr_r("dppd", 44, src, dst);
486      DO_imm_mandr_r("dppd", 45, src, dst);
487      DO_imm_mandr_r("dppd", 46, src, dst);
488      DO_imm_mandr_r("dppd", 47, src, dst);
489      DO_imm_mandr_r("dppd", 48, src, dst);
490      DO_imm_mandr_r("dppd", 49, src, dst);
491      DO_imm_mandr_r("dppd", 50, src, dst);
492      DO_imm_mandr_r("dppd", 51, src, dst);
493      DO_imm_mandr_r("dppd", 52, src, dst);
494      DO_imm_mandr_r("dppd", 53, src, dst);
495      DO_imm_mandr_r("dppd", 54, src, dst);
496      DO_imm_mandr_r("dppd", 55, src, dst);
497      DO_imm_mandr_r("dppd", 56, src, dst);
498      DO_imm_mandr_r("dppd", 57, src, dst);
499      DO_imm_mandr_r("dppd", 58, src, dst);
500      DO_imm_mandr_r("dppd", 59, src, dst);
501      DO_imm_mandr_r("dppd", 60, src, dst);
502      DO_imm_mandr_r("dppd", 61, src, dst);
503      DO_imm_mandr_r("dppd", 62, src, dst);
504      DO_imm_mandr_r("dppd", 63, src, dst);
505      DO_imm_mandr_r("dppd", 64, src, dst);
506      DO_imm_mandr_r("dppd", 65, src, dst);
507      DO_imm_mandr_r("dppd", 66, src, dst);
508      DO_imm_mandr_r("dppd", 67, src, dst);
509      DO_imm_mandr_r("dppd", 68, src, dst);
510      DO_imm_mandr_r("dppd", 69, src, dst);
511      DO_imm_mandr_r("dppd", 70, src, dst);
512      DO_imm_mandr_r("dppd", 71, src, dst);
513      DO_imm_mandr_r("dppd", 72, src, dst);
514      DO_imm_mandr_r("dppd", 73, src, dst);
515      DO_imm_mandr_r("dppd", 74, src, dst);
516      DO_imm_mandr_r("dppd", 75, src, dst);
517      DO_imm_mandr_r("dppd", 76, src, dst);
518      DO_imm_mandr_r("dppd", 77, src, dst);
519      DO_imm_mandr_r("dppd", 78, src, dst);
520      DO_imm_mandr_r("dppd", 79, src, dst);
521      DO_imm_mandr_r("dppd", 80, src, dst);
522      DO_imm_mandr_r("dppd", 81, src, dst);
523      DO_imm_mandr_r("dppd", 82, src, dst);
524      DO_imm_mandr_r("dppd", 83, src, dst);
525      DO_imm_mandr_r("dppd", 84, src, dst);
526      DO_imm_mandr_r("dppd", 85, src, dst);
527      DO_imm_mandr_r("dppd", 86, src, dst);
528      DO_imm_mandr_r("dppd", 87, src, dst);
529      DO_imm_mandr_r("dppd", 88, src, dst);
530      DO_imm_mandr_r("dppd", 89, src, dst);
531      DO_imm_mandr_r("dppd", 90, src, dst);
532      DO_imm_mandr_r("dppd", 91, src, dst);
533      DO_imm_mandr_r("dppd", 92, src, dst);
534      DO_imm_mandr_r("dppd", 93, src, dst);
535      DO_imm_mandr_r("dppd", 94, src, dst);
536      DO_imm_mandr_r("dppd", 95, src, dst);
537      DO_imm_mandr_r("dppd", 96, src, dst);
538      DO_imm_mandr_r("dppd", 97, src, dst);
539      DO_imm_mandr_r("dppd", 98, src, dst);
540      DO_imm_mandr_r("dppd", 99, src, dst);
541      DO_imm_mandr_r("dppd", 100, src, dst);
542      DO_imm_mandr_r("dppd", 101, src, dst);
543      DO_imm_mandr_r("dppd", 102, src, dst);
544      DO_imm_mandr_r("dppd", 103, src, dst);
545      DO_imm_mandr_r("dppd", 104, src, dst);
546      DO_imm_mandr_r("dppd", 105, src, dst);
547      DO_imm_mandr_r("dppd", 106, src, dst);
548      DO_imm_mandr_r("dppd", 107, src, dst);
549      DO_imm_mandr_r("dppd", 108, src, dst);
550      DO_imm_mandr_r("dppd", 109, src, dst);
551      DO_imm_mandr_r("dppd", 110, src, dst);
552      DO_imm_mandr_r("dppd", 111, src, dst);
553      DO_imm_mandr_r("dppd", 112, src, dst);
554      DO_imm_mandr_r("dppd", 113, src, dst);
555      DO_imm_mandr_r("dppd", 114, src, dst);
556      DO_imm_mandr_r("dppd", 115, src, dst);
557      DO_imm_mandr_r("dppd", 116, src, dst);
558      DO_imm_mandr_r("dppd", 117, src, dst);
559      DO_imm_mandr_r("dppd", 118, src, dst);
560      DO_imm_mandr_r("dppd", 119, src, dst);
561      DO_imm_mandr_r("dppd", 120, src, dst);
562      DO_imm_mandr_r("dppd", 121, src, dst);
563      DO_imm_mandr_r("dppd", 122, src, dst);
564      DO_imm_mandr_r("dppd", 123, src, dst);
565      DO_imm_mandr_r("dppd", 124, src, dst);
566      DO_imm_mandr_r("dppd", 125, src, dst);
567      DO_imm_mandr_r("dppd", 126, src, dst);
568      DO_imm_mandr_r("dppd", 127, src, dst);
569      DO_imm_mandr_r("dppd", 128, src, dst);
570      DO_imm_mandr_r("dppd", 129, src, dst);
571      DO_imm_mandr_r("dppd", 130, src, dst);
572      DO_imm_mandr_r("dppd", 131, src, dst);
573      DO_imm_mandr_r("dppd", 132, src, dst);
574      DO_imm_mandr_r("dppd", 133, src, dst);
575      DO_imm_mandr_r("dppd", 134, src, dst);
576      DO_imm_mandr_r("dppd", 135, src, dst);
577      DO_imm_mandr_r("dppd", 136, src, dst);
578      DO_imm_mandr_r("dppd", 137, src, dst);
579      DO_imm_mandr_r("dppd", 138, src, dst);
580      DO_imm_mandr_r("dppd", 139, src, dst);
581      DO_imm_mandr_r("dppd", 140, src, dst);
582      DO_imm_mandr_r("dppd", 141, src, dst);
583      DO_imm_mandr_r("dppd", 142, src, dst);
584      DO_imm_mandr_r("dppd", 143, src, dst);
585      DO_imm_mandr_r("dppd", 144, src, dst);
586      DO_imm_mandr_r("dppd", 145, src, dst);
587      DO_imm_mandr_r("dppd", 146, src, dst);
588      DO_imm_mandr_r("dppd", 147, src, dst);
589      DO_imm_mandr_r("dppd", 148, src, dst);
590      DO_imm_mandr_r("dppd", 149, src, dst);
591      DO_imm_mandr_r("dppd", 150, src, dst);
592      DO_imm_mandr_r("dppd", 151, src, dst);
593      DO_imm_mandr_r("dppd", 152, src, dst);
594      DO_imm_mandr_r("dppd", 153, src, dst);
595      DO_imm_mandr_r("dppd", 154, src, dst);
596      DO_imm_mandr_r("dppd", 155, src, dst);
597      DO_imm_mandr_r("dppd", 156, src, dst);
598      DO_imm_mandr_r("dppd", 157, src, dst);
599      DO_imm_mandr_r("dppd", 158, src, dst);
600      DO_imm_mandr_r("dppd", 159, src, dst);
601      DO_imm_mandr_r("dppd", 160, src, dst);
602      DO_imm_mandr_r("dppd", 161, src, dst);
603      DO_imm_mandr_r("dppd", 162, src, dst);
604      DO_imm_mandr_r("dppd", 163, src, dst);
605      DO_imm_mandr_r("dppd", 164, src, dst);
606      DO_imm_mandr_r("dppd", 165, src, dst);
607      DO_imm_mandr_r("dppd", 166, src, dst);
608      DO_imm_mandr_r("dppd", 167, src, dst);
609      DO_imm_mandr_r("dppd", 168, src, dst);
610      DO_imm_mandr_r("dppd", 169, src, dst);
611      DO_imm_mandr_r("dppd", 170, src, dst);
612      DO_imm_mandr_r("dppd", 171, src, dst);
613      DO_imm_mandr_r("dppd", 172, src, dst);
614      DO_imm_mandr_r("dppd", 173, src, dst);
615      DO_imm_mandr_r("dppd", 174, src, dst);
616      DO_imm_mandr_r("dppd", 175, src, dst);
617      DO_imm_mandr_r("dppd", 176, src, dst);
618      DO_imm_mandr_r("dppd", 177, src, dst);
619      DO_imm_mandr_r("dppd", 178, src, dst);
620      DO_imm_mandr_r("dppd", 179, src, dst);
621      DO_imm_mandr_r("dppd", 180, src, dst);
622      DO_imm_mandr_r("dppd", 181, src, dst);
623      DO_imm_mandr_r("dppd", 182, src, dst);
624      DO_imm_mandr_r("dppd", 183, src, dst);
625      DO_imm_mandr_r("dppd", 184, src, dst);
626      DO_imm_mandr_r("dppd", 185, src, dst);
627      DO_imm_mandr_r("dppd", 186, src, dst);
628      DO_imm_mandr_r("dppd", 187, src, dst);
629      DO_imm_mandr_r("dppd", 188, src, dst);
630      DO_imm_mandr_r("dppd", 189, src, dst);
631      DO_imm_mandr_r("dppd", 190, src, dst);
632      DO_imm_mandr_r("dppd", 191, src, dst);
633      DO_imm_mandr_r("dppd", 192, src, dst);
634      DO_imm_mandr_r("dppd", 193, src, dst);
635      DO_imm_mandr_r("dppd", 194, src, dst);
636      DO_imm_mandr_r("dppd", 195, src, dst);
637      DO_imm_mandr_r("dppd", 196, src, dst);
638      DO_imm_mandr_r("dppd", 197, src, dst);
639      DO_imm_mandr_r("dppd", 198, src, dst);
640      DO_imm_mandr_r("dppd", 199, src, dst);
641      DO_imm_mandr_r("dppd", 200, src, dst);
642      DO_imm_mandr_r("dppd", 201, src, dst);
643      DO_imm_mandr_r("dppd", 202, src, dst);
644      DO_imm_mandr_r("dppd", 203, src, dst);
645      DO_imm_mandr_r("dppd", 204, src, dst);
646      DO_imm_mandr_r("dppd", 205, src, dst);
647      DO_imm_mandr_r("dppd", 206, src, dst);
648      DO_imm_mandr_r("dppd", 207, src, dst);
649      DO_imm_mandr_r("dppd", 208, src, dst);
650      DO_imm_mandr_r("dppd", 209, src, dst);
651      DO_imm_mandr_r("dppd", 210, src, dst);
652      DO_imm_mandr_r("dppd", 211, src, dst);
653      DO_imm_mandr_r("dppd", 212, src, dst);
654      DO_imm_mandr_r("dppd", 213, src, dst);
655      DO_imm_mandr_r("dppd", 214, src, dst);
656      DO_imm_mandr_r("dppd", 215, src, dst);
657      DO_imm_mandr_r("dppd", 216, src, dst);
658      DO_imm_mandr_r("dppd", 217, src, dst);
659      DO_imm_mandr_r("dppd", 218, src, dst);
660      DO_imm_mandr_r("dppd", 219, src, dst);
661      DO_imm_mandr_r("dppd", 220, src, dst);
662      DO_imm_mandr_r("dppd", 221, src, dst);
663      DO_imm_mandr_r("dppd", 222, src, dst);
664      DO_imm_mandr_r("dppd", 223, src, dst);
665      DO_imm_mandr_r("dppd", 224, src, dst);
666      DO_imm_mandr_r("dppd", 225, src, dst);
667      DO_imm_mandr_r("dppd", 226, src, dst);
668      DO_imm_mandr_r("dppd", 227, src, dst);
669      DO_imm_mandr_r("dppd", 228, src, dst);
670      DO_imm_mandr_r("dppd", 229, src, dst);
671      DO_imm_mandr_r("dppd", 230, src, dst);
672      DO_imm_mandr_r("dppd", 231, src, dst);
673      DO_imm_mandr_r("dppd", 232, src, dst);
674      DO_imm_mandr_r("dppd", 233, src, dst);
675      DO_imm_mandr_r("dppd", 234, src, dst);
676      DO_imm_mandr_r("dppd", 235, src, dst);
677      DO_imm_mandr_r("dppd", 236, src, dst);
678      DO_imm_mandr_r("dppd", 237, src, dst);
679      DO_imm_mandr_r("dppd", 238, src, dst);
680      DO_imm_mandr_r("dppd", 239, src, dst);
681      DO_imm_mandr_r("dppd", 240, src, dst);
682      DO_imm_mandr_r("dppd", 241, src, dst);
683      DO_imm_mandr_r("dppd", 242, src, dst);
684      DO_imm_mandr_r("dppd", 243, src, dst);
685      DO_imm_mandr_r("dppd", 244, src, dst);
686      DO_imm_mandr_r("dppd", 245, src, dst);
687      DO_imm_mandr_r("dppd", 246, src, dst);
688      DO_imm_mandr_r("dppd", 247, src, dst);
689      DO_imm_mandr_r("dppd", 248, src, dst);
690      DO_imm_mandr_r("dppd", 249, src, dst);
691      DO_imm_mandr_r("dppd", 250, src, dst);
692      DO_imm_mandr_r("dppd", 251, src, dst);
693      DO_imm_mandr_r("dppd", 252, src, dst);
694      DO_imm_mandr_r("dppd", 253, src, dst);
695      DO_imm_mandr_r("dppd", 254, src, dst);
696      DO_imm_mandr_r("dppd", 255, src, dst);
697   }
698}
699
700void test_DPPS ( void )
701{
702   V128 src, dst;
703   {
704      *(float*)(&src[0])  =   1.2;
705      *(float*)(&src[4])  =  -3.4;
706      *(float*)(&src[8])  =  -6.7;
707      *(float*)(&src[12]) =   8.9;
708      *(float*)(&dst[0])  = -10.11;
709      *(float*)(&dst[4])  =  12.13;
710      *(float*)(&dst[8])  =  14.15;
711      *(float*)(&dst[12]) = -16.17;
712      DO_imm_mandr_r("dpps", 0, src, dst);
713      DO_imm_mandr_r("dpps", 1, src, dst);
714      DO_imm_mandr_r("dpps", 2, src, dst);
715      DO_imm_mandr_r("dpps", 3, src, dst);
716      DO_imm_mandr_r("dpps", 4, src, dst);
717      DO_imm_mandr_r("dpps", 5, src, dst);
718      DO_imm_mandr_r("dpps", 6, src, dst);
719      DO_imm_mandr_r("dpps", 7, src, dst);
720      DO_imm_mandr_r("dpps", 8, src, dst);
721      DO_imm_mandr_r("dpps", 9, src, dst);
722      DO_imm_mandr_r("dpps", 10, src, dst);
723      DO_imm_mandr_r("dpps", 11, src, dst);
724      DO_imm_mandr_r("dpps", 12, src, dst);
725      DO_imm_mandr_r("dpps", 13, src, dst);
726      DO_imm_mandr_r("dpps", 14, src, dst);
727      DO_imm_mandr_r("dpps", 15, src, dst);
728      DO_imm_mandr_r("dpps", 16, src, dst);
729      DO_imm_mandr_r("dpps", 17, src, dst);
730      DO_imm_mandr_r("dpps", 18, src, dst);
731      DO_imm_mandr_r("dpps", 19, src, dst);
732      DO_imm_mandr_r("dpps", 20, src, dst);
733      DO_imm_mandr_r("dpps", 21, src, dst);
734      DO_imm_mandr_r("dpps", 22, src, dst);
735      DO_imm_mandr_r("dpps", 23, src, dst);
736      DO_imm_mandr_r("dpps", 24, src, dst);
737      DO_imm_mandr_r("dpps", 25, src, dst);
738      DO_imm_mandr_r("dpps", 26, src, dst);
739      DO_imm_mandr_r("dpps", 27, src, dst);
740      DO_imm_mandr_r("dpps", 28, src, dst);
741      DO_imm_mandr_r("dpps", 29, src, dst);
742      DO_imm_mandr_r("dpps", 30, src, dst);
743      DO_imm_mandr_r("dpps", 31, src, dst);
744      DO_imm_mandr_r("dpps", 32, src, dst);
745      DO_imm_mandr_r("dpps", 33, src, dst);
746      DO_imm_mandr_r("dpps", 34, src, dst);
747      DO_imm_mandr_r("dpps", 35, src, dst);
748      DO_imm_mandr_r("dpps", 36, src, dst);
749      DO_imm_mandr_r("dpps", 37, src, dst);
750      DO_imm_mandr_r("dpps", 38, src, dst);
751      DO_imm_mandr_r("dpps", 39, src, dst);
752      DO_imm_mandr_r("dpps", 40, src, dst);
753      DO_imm_mandr_r("dpps", 41, src, dst);
754      DO_imm_mandr_r("dpps", 42, src, dst);
755      DO_imm_mandr_r("dpps", 43, src, dst);
756      DO_imm_mandr_r("dpps", 44, src, dst);
757      DO_imm_mandr_r("dpps", 45, src, dst);
758      DO_imm_mandr_r("dpps", 46, src, dst);
759      DO_imm_mandr_r("dpps", 47, src, dst);
760      DO_imm_mandr_r("dpps", 48, src, dst);
761      DO_imm_mandr_r("dpps", 49, src, dst);
762      DO_imm_mandr_r("dpps", 50, src, dst);
763      DO_imm_mandr_r("dpps", 51, src, dst);
764      DO_imm_mandr_r("dpps", 52, src, dst);
765      DO_imm_mandr_r("dpps", 53, src, dst);
766      DO_imm_mandr_r("dpps", 54, src, dst);
767      DO_imm_mandr_r("dpps", 55, src, dst);
768      DO_imm_mandr_r("dpps", 56, src, dst);
769      DO_imm_mandr_r("dpps", 57, src, dst);
770      DO_imm_mandr_r("dpps", 58, src, dst);
771      DO_imm_mandr_r("dpps", 59, src, dst);
772      DO_imm_mandr_r("dpps", 60, src, dst);
773      DO_imm_mandr_r("dpps", 61, src, dst);
774      DO_imm_mandr_r("dpps", 62, src, dst);
775      DO_imm_mandr_r("dpps", 63, src, dst);
776      DO_imm_mandr_r("dpps", 64, src, dst);
777      DO_imm_mandr_r("dpps", 65, src, dst);
778      DO_imm_mandr_r("dpps", 66, src, dst);
779      DO_imm_mandr_r("dpps", 67, src, dst);
780      DO_imm_mandr_r("dpps", 68, src, dst);
781      DO_imm_mandr_r("dpps", 69, src, dst);
782      DO_imm_mandr_r("dpps", 70, src, dst);
783      DO_imm_mandr_r("dpps", 71, src, dst);
784      DO_imm_mandr_r("dpps", 72, src, dst);
785      DO_imm_mandr_r("dpps", 73, src, dst);
786      DO_imm_mandr_r("dpps", 74, src, dst);
787      DO_imm_mandr_r("dpps", 75, src, dst);
788      DO_imm_mandr_r("dpps", 76, src, dst);
789      DO_imm_mandr_r("dpps", 77, src, dst);
790      DO_imm_mandr_r("dpps", 78, src, dst);
791      DO_imm_mandr_r("dpps", 79, src, dst);
792      DO_imm_mandr_r("dpps", 80, src, dst);
793      DO_imm_mandr_r("dpps", 81, src, dst);
794      DO_imm_mandr_r("dpps", 82, src, dst);
795      DO_imm_mandr_r("dpps", 83, src, dst);
796      DO_imm_mandr_r("dpps", 84, src, dst);
797      DO_imm_mandr_r("dpps", 85, src, dst);
798      DO_imm_mandr_r("dpps", 86, src, dst);
799      DO_imm_mandr_r("dpps", 87, src, dst);
800      DO_imm_mandr_r("dpps", 88, src, dst);
801      DO_imm_mandr_r("dpps", 89, src, dst);
802      DO_imm_mandr_r("dpps", 90, src, dst);
803      DO_imm_mandr_r("dpps", 91, src, dst);
804      DO_imm_mandr_r("dpps", 92, src, dst);
805      DO_imm_mandr_r("dpps", 93, src, dst);
806      DO_imm_mandr_r("dpps", 94, src, dst);
807      DO_imm_mandr_r("dpps", 95, src, dst);
808      DO_imm_mandr_r("dpps", 96, src, dst);
809      DO_imm_mandr_r("dpps", 97, src, dst);
810      DO_imm_mandr_r("dpps", 98, src, dst);
811      DO_imm_mandr_r("dpps", 99, src, dst);
812      DO_imm_mandr_r("dpps", 100, src, dst);
813      DO_imm_mandr_r("dpps", 101, src, dst);
814      DO_imm_mandr_r("dpps", 102, src, dst);
815      DO_imm_mandr_r("dpps", 103, src, dst);
816      DO_imm_mandr_r("dpps", 104, src, dst);
817      DO_imm_mandr_r("dpps", 105, src, dst);
818      DO_imm_mandr_r("dpps", 106, src, dst);
819      DO_imm_mandr_r("dpps", 107, src, dst);
820      DO_imm_mandr_r("dpps", 108, src, dst);
821      DO_imm_mandr_r("dpps", 109, src, dst);
822      DO_imm_mandr_r("dpps", 110, src, dst);
823      DO_imm_mandr_r("dpps", 111, src, dst);
824      DO_imm_mandr_r("dpps", 112, src, dst);
825      DO_imm_mandr_r("dpps", 113, src, dst);
826      DO_imm_mandr_r("dpps", 114, src, dst);
827      DO_imm_mandr_r("dpps", 115, src, dst);
828      DO_imm_mandr_r("dpps", 116, src, dst);
829      DO_imm_mandr_r("dpps", 117, src, dst);
830      DO_imm_mandr_r("dpps", 118, src, dst);
831      DO_imm_mandr_r("dpps", 119, src, dst);
832      DO_imm_mandr_r("dpps", 120, src, dst);
833      DO_imm_mandr_r("dpps", 121, src, dst);
834      DO_imm_mandr_r("dpps", 122, src, dst);
835      DO_imm_mandr_r("dpps", 123, src, dst);
836      DO_imm_mandr_r("dpps", 124, src, dst);
837      DO_imm_mandr_r("dpps", 125, src, dst);
838      DO_imm_mandr_r("dpps", 126, src, dst);
839      DO_imm_mandr_r("dpps", 127, src, dst);
840      DO_imm_mandr_r("dpps", 128, src, dst);
841      DO_imm_mandr_r("dpps", 129, src, dst);
842      DO_imm_mandr_r("dpps", 130, src, dst);
843      DO_imm_mandr_r("dpps", 131, src, dst);
844      DO_imm_mandr_r("dpps", 132, src, dst);
845      DO_imm_mandr_r("dpps", 133, src, dst);
846      DO_imm_mandr_r("dpps", 134, src, dst);
847      DO_imm_mandr_r("dpps", 135, src, dst);
848      DO_imm_mandr_r("dpps", 136, src, dst);
849      DO_imm_mandr_r("dpps", 137, src, dst);
850      DO_imm_mandr_r("dpps", 138, src, dst);
851      DO_imm_mandr_r("dpps", 139, src, dst);
852      DO_imm_mandr_r("dpps", 140, src, dst);
853      DO_imm_mandr_r("dpps", 141, src, dst);
854      DO_imm_mandr_r("dpps", 142, src, dst);
855      DO_imm_mandr_r("dpps", 143, src, dst);
856      DO_imm_mandr_r("dpps", 144, src, dst);
857      DO_imm_mandr_r("dpps", 145, src, dst);
858      DO_imm_mandr_r("dpps", 146, src, dst);
859      DO_imm_mandr_r("dpps", 147, src, dst);
860      DO_imm_mandr_r("dpps", 148, src, dst);
861      DO_imm_mandr_r("dpps", 149, src, dst);
862      DO_imm_mandr_r("dpps", 150, src, dst);
863      DO_imm_mandr_r("dpps", 151, src, dst);
864      DO_imm_mandr_r("dpps", 152, src, dst);
865      DO_imm_mandr_r("dpps", 153, src, dst);
866      DO_imm_mandr_r("dpps", 154, src, dst);
867      DO_imm_mandr_r("dpps", 155, src, dst);
868      DO_imm_mandr_r("dpps", 156, src, dst);
869      DO_imm_mandr_r("dpps", 157, src, dst);
870      DO_imm_mandr_r("dpps", 158, src, dst);
871      DO_imm_mandr_r("dpps", 159, src, dst);
872      DO_imm_mandr_r("dpps", 160, src, dst);
873      DO_imm_mandr_r("dpps", 161, src, dst);
874      DO_imm_mandr_r("dpps", 162, src, dst);
875      DO_imm_mandr_r("dpps", 163, src, dst);
876      DO_imm_mandr_r("dpps", 164, src, dst);
877      DO_imm_mandr_r("dpps", 165, src, dst);
878      DO_imm_mandr_r("dpps", 166, src, dst);
879      DO_imm_mandr_r("dpps", 167, src, dst);
880      DO_imm_mandr_r("dpps", 168, src, dst);
881      DO_imm_mandr_r("dpps", 169, src, dst);
882      DO_imm_mandr_r("dpps", 170, src, dst);
883      DO_imm_mandr_r("dpps", 171, src, dst);
884      DO_imm_mandr_r("dpps", 172, src, dst);
885      DO_imm_mandr_r("dpps", 173, src, dst);
886      DO_imm_mandr_r("dpps", 174, src, dst);
887      DO_imm_mandr_r("dpps", 175, src, dst);
888      DO_imm_mandr_r("dpps", 176, src, dst);
889      DO_imm_mandr_r("dpps", 177, src, dst);
890      DO_imm_mandr_r("dpps", 178, src, dst);
891      DO_imm_mandr_r("dpps", 179, src, dst);
892      DO_imm_mandr_r("dpps", 180, src, dst);
893      DO_imm_mandr_r("dpps", 181, src, dst);
894      DO_imm_mandr_r("dpps", 182, src, dst);
895      DO_imm_mandr_r("dpps", 183, src, dst);
896      DO_imm_mandr_r("dpps", 184, src, dst);
897      DO_imm_mandr_r("dpps", 185, src, dst);
898      DO_imm_mandr_r("dpps", 186, src, dst);
899      DO_imm_mandr_r("dpps", 187, src, dst);
900      DO_imm_mandr_r("dpps", 188, src, dst);
901      DO_imm_mandr_r("dpps", 189, src, dst);
902      DO_imm_mandr_r("dpps", 190, src, dst);
903      DO_imm_mandr_r("dpps", 191, src, dst);
904      DO_imm_mandr_r("dpps", 192, src, dst);
905      DO_imm_mandr_r("dpps", 193, src, dst);
906      DO_imm_mandr_r("dpps", 194, src, dst);
907      DO_imm_mandr_r("dpps", 195, src, dst);
908      DO_imm_mandr_r("dpps", 196, src, dst);
909      DO_imm_mandr_r("dpps", 197, src, dst);
910      DO_imm_mandr_r("dpps", 198, src, dst);
911      DO_imm_mandr_r("dpps", 199, src, dst);
912      DO_imm_mandr_r("dpps", 200, src, dst);
913      DO_imm_mandr_r("dpps", 201, src, dst);
914      DO_imm_mandr_r("dpps", 202, src, dst);
915      DO_imm_mandr_r("dpps", 203, src, dst);
916      DO_imm_mandr_r("dpps", 204, src, dst);
917      DO_imm_mandr_r("dpps", 205, src, dst);
918      DO_imm_mandr_r("dpps", 206, src, dst);
919      DO_imm_mandr_r("dpps", 207, src, dst);
920      DO_imm_mandr_r("dpps", 208, src, dst);
921      DO_imm_mandr_r("dpps", 209, src, dst);
922      DO_imm_mandr_r("dpps", 210, src, dst);
923      DO_imm_mandr_r("dpps", 211, src, dst);
924      DO_imm_mandr_r("dpps", 212, src, dst);
925      DO_imm_mandr_r("dpps", 213, src, dst);
926      DO_imm_mandr_r("dpps", 214, src, dst);
927      DO_imm_mandr_r("dpps", 215, src, dst);
928      DO_imm_mandr_r("dpps", 216, src, dst);
929      DO_imm_mandr_r("dpps", 217, src, dst);
930      DO_imm_mandr_r("dpps", 218, src, dst);
931      DO_imm_mandr_r("dpps", 219, src, dst);
932      DO_imm_mandr_r("dpps", 220, src, dst);
933      DO_imm_mandr_r("dpps", 221, src, dst);
934      DO_imm_mandr_r("dpps", 222, src, dst);
935      DO_imm_mandr_r("dpps", 223, src, dst);
936      DO_imm_mandr_r("dpps", 224, src, dst);
937      DO_imm_mandr_r("dpps", 225, src, dst);
938      DO_imm_mandr_r("dpps", 226, src, dst);
939      DO_imm_mandr_r("dpps", 227, src, dst);
940      DO_imm_mandr_r("dpps", 228, src, dst);
941      DO_imm_mandr_r("dpps", 229, src, dst);
942      DO_imm_mandr_r("dpps", 230, src, dst);
943      DO_imm_mandr_r("dpps", 231, src, dst);
944      DO_imm_mandr_r("dpps", 232, src, dst);
945      DO_imm_mandr_r("dpps", 233, src, dst);
946      DO_imm_mandr_r("dpps", 234, src, dst);
947      DO_imm_mandr_r("dpps", 235, src, dst);
948      DO_imm_mandr_r("dpps", 236, src, dst);
949      DO_imm_mandr_r("dpps", 237, src, dst);
950      DO_imm_mandr_r("dpps", 238, src, dst);
951      DO_imm_mandr_r("dpps", 239, src, dst);
952      DO_imm_mandr_r("dpps", 240, src, dst);
953      DO_imm_mandr_r("dpps", 241, src, dst);
954      DO_imm_mandr_r("dpps", 242, src, dst);
955      DO_imm_mandr_r("dpps", 243, src, dst);
956      DO_imm_mandr_r("dpps", 244, src, dst);
957      DO_imm_mandr_r("dpps", 245, src, dst);
958      DO_imm_mandr_r("dpps", 246, src, dst);
959      DO_imm_mandr_r("dpps", 247, src, dst);
960      DO_imm_mandr_r("dpps", 248, src, dst);
961      DO_imm_mandr_r("dpps", 249, src, dst);
962      DO_imm_mandr_r("dpps", 250, src, dst);
963      DO_imm_mandr_r("dpps", 251, src, dst);
964      DO_imm_mandr_r("dpps", 252, src, dst);
965      DO_imm_mandr_r("dpps", 253, src, dst);
966      DO_imm_mandr_r("dpps", 254, src, dst);
967      DO_imm_mandr_r("dpps", 255, src, dst);
968   }
969}
970
971void test_INSERTPS ( void )
972{
973   V128 src, dst;
974   {
975      *(float*)(&src[0])  =   1.2;
976      *(float*)(&src[4])  =  -3.4;
977      *(float*)(&src[8])  =  -6.7;
978      *(float*)(&src[12]) =   8.9;
979      *(float*)(&dst[0])  = -10.11;
980      *(float*)(&dst[4])  =  12.13;
981      *(float*)(&dst[8])  =  14.15;
982      *(float*)(&dst[12]) = -16.17;
983      DO_imm_mandr_r("insertps", 0, src, dst);
984      DO_imm_mandr_r("insertps", 1, src, dst);
985      DO_imm_mandr_r("insertps", 2, src, dst);
986      DO_imm_mandr_r("insertps", 3, src, dst);
987      DO_imm_mandr_r("insertps", 4, src, dst);
988      DO_imm_mandr_r("insertps", 5, src, dst);
989      DO_imm_mandr_r("insertps", 6, src, dst);
990      DO_imm_mandr_r("insertps", 7, src, dst);
991      DO_imm_mandr_r("insertps", 8, src, dst);
992      DO_imm_mandr_r("insertps", 9, src, dst);
993      DO_imm_mandr_r("insertps", 10, src, dst);
994      DO_imm_mandr_r("insertps", 11, src, dst);
995      DO_imm_mandr_r("insertps", 12, src, dst);
996      DO_imm_mandr_r("insertps", 13, src, dst);
997      DO_imm_mandr_r("insertps", 14, src, dst);
998      DO_imm_mandr_r("insertps", 15, src, dst);
999      DO_imm_mandr_r("insertps", 16, src, dst);
1000      DO_imm_mandr_r("insertps", 17, src, dst);
1001      DO_imm_mandr_r("insertps", 18, src, dst);
1002      DO_imm_mandr_r("insertps", 19, src, dst);
1003      DO_imm_mandr_r("insertps", 20, src, dst);
1004      DO_imm_mandr_r("insertps", 21, src, dst);
1005      DO_imm_mandr_r("insertps", 22, src, dst);
1006      DO_imm_mandr_r("insertps", 23, src, dst);
1007      DO_imm_mandr_r("insertps", 24, src, dst);
1008      DO_imm_mandr_r("insertps", 25, src, dst);
1009      DO_imm_mandr_r("insertps", 26, src, dst);
1010      DO_imm_mandr_r("insertps", 27, src, dst);
1011      DO_imm_mandr_r("insertps", 28, src, dst);
1012      DO_imm_mandr_r("insertps", 29, src, dst);
1013      DO_imm_mandr_r("insertps", 30, src, dst);
1014      DO_imm_mandr_r("insertps", 31, src, dst);
1015      DO_imm_mandr_r("insertps", 32, src, dst);
1016      DO_imm_mandr_r("insertps", 33, src, dst);
1017      DO_imm_mandr_r("insertps", 34, src, dst);
1018      DO_imm_mandr_r("insertps", 35, src, dst);
1019      DO_imm_mandr_r("insertps", 36, src, dst);
1020      DO_imm_mandr_r("insertps", 37, src, dst);
1021      DO_imm_mandr_r("insertps", 38, src, dst);
1022      DO_imm_mandr_r("insertps", 39, src, dst);
1023      DO_imm_mandr_r("insertps", 40, src, dst);
1024      DO_imm_mandr_r("insertps", 41, src, dst);
1025      DO_imm_mandr_r("insertps", 42, src, dst);
1026      DO_imm_mandr_r("insertps", 43, src, dst);
1027      DO_imm_mandr_r("insertps", 44, src, dst);
1028      DO_imm_mandr_r("insertps", 45, src, dst);
1029      DO_imm_mandr_r("insertps", 46, src, dst);
1030      DO_imm_mandr_r("insertps", 47, src, dst);
1031      DO_imm_mandr_r("insertps", 48, src, dst);
1032      DO_imm_mandr_r("insertps", 49, src, dst);
1033      DO_imm_mandr_r("insertps", 50, src, dst);
1034      DO_imm_mandr_r("insertps", 51, src, dst);
1035      DO_imm_mandr_r("insertps", 52, src, dst);
1036      DO_imm_mandr_r("insertps", 53, src, dst);
1037      DO_imm_mandr_r("insertps", 54, src, dst);
1038      DO_imm_mandr_r("insertps", 55, src, dst);
1039      DO_imm_mandr_r("insertps", 56, src, dst);
1040      DO_imm_mandr_r("insertps", 57, src, dst);
1041      DO_imm_mandr_r("insertps", 58, src, dst);
1042      DO_imm_mandr_r("insertps", 59, src, dst);
1043      DO_imm_mandr_r("insertps", 60, src, dst);
1044      DO_imm_mandr_r("insertps", 61, src, dst);
1045      DO_imm_mandr_r("insertps", 62, src, dst);
1046      DO_imm_mandr_r("insertps", 63, src, dst);
1047      DO_imm_mandr_r("insertps", 64, src, dst);
1048      DO_imm_mandr_r("insertps", 65, src, dst);
1049      DO_imm_mandr_r("insertps", 66, src, dst);
1050      DO_imm_mandr_r("insertps", 67, src, dst);
1051      DO_imm_mandr_r("insertps", 68, src, dst);
1052      DO_imm_mandr_r("insertps", 69, src, dst);
1053      DO_imm_mandr_r("insertps", 70, src, dst);
1054      DO_imm_mandr_r("insertps", 71, src, dst);
1055      DO_imm_mandr_r("insertps", 72, src, dst);
1056      DO_imm_mandr_r("insertps", 73, src, dst);
1057      DO_imm_mandr_r("insertps", 74, src, dst);
1058      DO_imm_mandr_r("insertps", 75, src, dst);
1059      DO_imm_mandr_r("insertps", 76, src, dst);
1060      DO_imm_mandr_r("insertps", 77, src, dst);
1061      DO_imm_mandr_r("insertps", 78, src, dst);
1062      DO_imm_mandr_r("insertps", 79, src, dst);
1063      DO_imm_mandr_r("insertps", 80, src, dst);
1064      DO_imm_mandr_r("insertps", 81, src, dst);
1065      DO_imm_mandr_r("insertps", 82, src, dst);
1066      DO_imm_mandr_r("insertps", 83, src, dst);
1067      DO_imm_mandr_r("insertps", 84, src, dst);
1068      DO_imm_mandr_r("insertps", 85, src, dst);
1069      DO_imm_mandr_r("insertps", 86, src, dst);
1070      DO_imm_mandr_r("insertps", 87, src, dst);
1071      DO_imm_mandr_r("insertps", 88, src, dst);
1072      DO_imm_mandr_r("insertps", 89, src, dst);
1073      DO_imm_mandr_r("insertps", 90, src, dst);
1074      DO_imm_mandr_r("insertps", 91, src, dst);
1075      DO_imm_mandr_r("insertps", 92, src, dst);
1076      DO_imm_mandr_r("insertps", 93, src, dst);
1077      DO_imm_mandr_r("insertps", 94, src, dst);
1078      DO_imm_mandr_r("insertps", 95, src, dst);
1079      DO_imm_mandr_r("insertps", 96, src, dst);
1080      DO_imm_mandr_r("insertps", 97, src, dst);
1081      DO_imm_mandr_r("insertps", 98, src, dst);
1082      DO_imm_mandr_r("insertps", 99, src, dst);
1083      DO_imm_mandr_r("insertps", 100, src, dst);
1084      DO_imm_mandr_r("insertps", 101, src, dst);
1085      DO_imm_mandr_r("insertps", 102, src, dst);
1086      DO_imm_mandr_r("insertps", 103, src, dst);
1087      DO_imm_mandr_r("insertps", 104, src, dst);
1088      DO_imm_mandr_r("insertps", 105, src, dst);
1089      DO_imm_mandr_r("insertps", 106, src, dst);
1090      DO_imm_mandr_r("insertps", 107, src, dst);
1091      DO_imm_mandr_r("insertps", 108, src, dst);
1092      DO_imm_mandr_r("insertps", 109, src, dst);
1093      DO_imm_mandr_r("insertps", 110, src, dst);
1094      DO_imm_mandr_r("insertps", 111, src, dst);
1095      DO_imm_mandr_r("insertps", 112, src, dst);
1096      DO_imm_mandr_r("insertps", 113, src, dst);
1097      DO_imm_mandr_r("insertps", 114, src, dst);
1098      DO_imm_mandr_r("insertps", 115, src, dst);
1099      DO_imm_mandr_r("insertps", 116, src, dst);
1100      DO_imm_mandr_r("insertps", 117, src, dst);
1101      DO_imm_mandr_r("insertps", 118, src, dst);
1102      DO_imm_mandr_r("insertps", 119, src, dst);
1103      DO_imm_mandr_r("insertps", 120, src, dst);
1104      DO_imm_mandr_r("insertps", 121, src, dst);
1105      DO_imm_mandr_r("insertps", 122, src, dst);
1106      DO_imm_mandr_r("insertps", 123, src, dst);
1107      DO_imm_mandr_r("insertps", 124, src, dst);
1108      DO_imm_mandr_r("insertps", 125, src, dst);
1109      DO_imm_mandr_r("insertps", 126, src, dst);
1110      DO_imm_mandr_r("insertps", 127, src, dst);
1111      DO_imm_mandr_r("insertps", 128, src, dst);
1112      DO_imm_mandr_r("insertps", 129, src, dst);
1113      DO_imm_mandr_r("insertps", 130, src, dst);
1114      DO_imm_mandr_r("insertps", 131, src, dst);
1115      DO_imm_mandr_r("insertps", 132, src, dst);
1116      DO_imm_mandr_r("insertps", 133, src, dst);
1117      DO_imm_mandr_r("insertps", 134, src, dst);
1118      DO_imm_mandr_r("insertps", 135, src, dst);
1119      DO_imm_mandr_r("insertps", 136, src, dst);
1120      DO_imm_mandr_r("insertps", 137, src, dst);
1121      DO_imm_mandr_r("insertps", 138, src, dst);
1122      DO_imm_mandr_r("insertps", 139, src, dst);
1123      DO_imm_mandr_r("insertps", 140, src, dst);
1124      DO_imm_mandr_r("insertps", 141, src, dst);
1125      DO_imm_mandr_r("insertps", 142, src, dst);
1126      DO_imm_mandr_r("insertps", 143, src, dst);
1127      DO_imm_mandr_r("insertps", 144, src, dst);
1128      DO_imm_mandr_r("insertps", 145, src, dst);
1129      DO_imm_mandr_r("insertps", 146, src, dst);
1130      DO_imm_mandr_r("insertps", 147, src, dst);
1131      DO_imm_mandr_r("insertps", 148, src, dst);
1132      DO_imm_mandr_r("insertps", 149, src, dst);
1133      DO_imm_mandr_r("insertps", 150, src, dst);
1134      DO_imm_mandr_r("insertps", 151, src, dst);
1135      DO_imm_mandr_r("insertps", 152, src, dst);
1136      DO_imm_mandr_r("insertps", 153, src, dst);
1137      DO_imm_mandr_r("insertps", 154, src, dst);
1138      DO_imm_mandr_r("insertps", 155, src, dst);
1139      DO_imm_mandr_r("insertps", 156, src, dst);
1140      DO_imm_mandr_r("insertps", 157, src, dst);
1141      DO_imm_mandr_r("insertps", 158, src, dst);
1142      DO_imm_mandr_r("insertps", 159, src, dst);
1143      DO_imm_mandr_r("insertps", 160, src, dst);
1144      DO_imm_mandr_r("insertps", 161, src, dst);
1145      DO_imm_mandr_r("insertps", 162, src, dst);
1146      DO_imm_mandr_r("insertps", 163, src, dst);
1147      DO_imm_mandr_r("insertps", 164, src, dst);
1148      DO_imm_mandr_r("insertps", 165, src, dst);
1149      DO_imm_mandr_r("insertps", 166, src, dst);
1150      DO_imm_mandr_r("insertps", 167, src, dst);
1151      DO_imm_mandr_r("insertps", 168, src, dst);
1152      DO_imm_mandr_r("insertps", 169, src, dst);
1153      DO_imm_mandr_r("insertps", 170, src, dst);
1154      DO_imm_mandr_r("insertps", 171, src, dst);
1155      DO_imm_mandr_r("insertps", 172, src, dst);
1156      DO_imm_mandr_r("insertps", 173, src, dst);
1157      DO_imm_mandr_r("insertps", 174, src, dst);
1158      DO_imm_mandr_r("insertps", 175, src, dst);
1159      DO_imm_mandr_r("insertps", 176, src, dst);
1160      DO_imm_mandr_r("insertps", 177, src, dst);
1161      DO_imm_mandr_r("insertps", 178, src, dst);
1162      DO_imm_mandr_r("insertps", 179, src, dst);
1163      DO_imm_mandr_r("insertps", 180, src, dst);
1164      DO_imm_mandr_r("insertps", 181, src, dst);
1165      DO_imm_mandr_r("insertps", 182, src, dst);
1166      DO_imm_mandr_r("insertps", 183, src, dst);
1167      DO_imm_mandr_r("insertps", 184, src, dst);
1168      DO_imm_mandr_r("insertps", 185, src, dst);
1169      DO_imm_mandr_r("insertps", 186, src, dst);
1170      DO_imm_mandr_r("insertps", 187, src, dst);
1171      DO_imm_mandr_r("insertps", 188, src, dst);
1172      DO_imm_mandr_r("insertps", 189, src, dst);
1173      DO_imm_mandr_r("insertps", 190, src, dst);
1174      DO_imm_mandr_r("insertps", 191, src, dst);
1175      DO_imm_mandr_r("insertps", 192, src, dst);
1176      DO_imm_mandr_r("insertps", 193, src, dst);
1177      DO_imm_mandr_r("insertps", 194, src, dst);
1178      DO_imm_mandr_r("insertps", 195, src, dst);
1179      DO_imm_mandr_r("insertps", 196, src, dst);
1180      DO_imm_mandr_r("insertps", 197, src, dst);
1181      DO_imm_mandr_r("insertps", 198, src, dst);
1182      DO_imm_mandr_r("insertps", 199, src, dst);
1183      DO_imm_mandr_r("insertps", 200, src, dst);
1184      DO_imm_mandr_r("insertps", 201, src, dst);
1185      DO_imm_mandr_r("insertps", 202, src, dst);
1186      DO_imm_mandr_r("insertps", 203, src, dst);
1187      DO_imm_mandr_r("insertps", 204, src, dst);
1188      DO_imm_mandr_r("insertps", 205, src, dst);
1189      DO_imm_mandr_r("insertps", 206, src, dst);
1190      DO_imm_mandr_r("insertps", 207, src, dst);
1191      DO_imm_mandr_r("insertps", 208, src, dst);
1192      DO_imm_mandr_r("insertps", 209, src, dst);
1193      DO_imm_mandr_r("insertps", 210, src, dst);
1194      DO_imm_mandr_r("insertps", 211, src, dst);
1195      DO_imm_mandr_r("insertps", 212, src, dst);
1196      DO_imm_mandr_r("insertps", 213, src, dst);
1197      DO_imm_mandr_r("insertps", 214, src, dst);
1198      DO_imm_mandr_r("insertps", 215, src, dst);
1199      DO_imm_mandr_r("insertps", 216, src, dst);
1200      DO_imm_mandr_r("insertps", 217, src, dst);
1201      DO_imm_mandr_r("insertps", 218, src, dst);
1202      DO_imm_mandr_r("insertps", 219, src, dst);
1203      DO_imm_mandr_r("insertps", 220, src, dst);
1204      DO_imm_mandr_r("insertps", 221, src, dst);
1205      DO_imm_mandr_r("insertps", 222, src, dst);
1206      DO_imm_mandr_r("insertps", 223, src, dst);
1207      DO_imm_mandr_r("insertps", 224, src, dst);
1208      DO_imm_mandr_r("insertps", 225, src, dst);
1209      DO_imm_mandr_r("insertps", 226, src, dst);
1210      DO_imm_mandr_r("insertps", 227, src, dst);
1211      DO_imm_mandr_r("insertps", 228, src, dst);
1212      DO_imm_mandr_r("insertps", 229, src, dst);
1213      DO_imm_mandr_r("insertps", 230, src, dst);
1214      DO_imm_mandr_r("insertps", 231, src, dst);
1215      DO_imm_mandr_r("insertps", 232, src, dst);
1216      DO_imm_mandr_r("insertps", 233, src, dst);
1217      DO_imm_mandr_r("insertps", 234, src, dst);
1218      DO_imm_mandr_r("insertps", 235, src, dst);
1219      DO_imm_mandr_r("insertps", 236, src, dst);
1220      DO_imm_mandr_r("insertps", 237, src, dst);
1221      DO_imm_mandr_r("insertps", 238, src, dst);
1222      DO_imm_mandr_r("insertps", 239, src, dst);
1223      DO_imm_mandr_r("insertps", 240, src, dst);
1224      DO_imm_mandr_r("insertps", 241, src, dst);
1225      DO_imm_mandr_r("insertps", 242, src, dst);
1226      DO_imm_mandr_r("insertps", 243, src, dst);
1227      DO_imm_mandr_r("insertps", 244, src, dst);
1228      DO_imm_mandr_r("insertps", 245, src, dst);
1229      DO_imm_mandr_r("insertps", 246, src, dst);
1230      DO_imm_mandr_r("insertps", 247, src, dst);
1231      DO_imm_mandr_r("insertps", 248, src, dst);
1232      DO_imm_mandr_r("insertps", 249, src, dst);
1233      DO_imm_mandr_r("insertps", 250, src, dst);
1234      DO_imm_mandr_r("insertps", 251, src, dst);
1235      DO_imm_mandr_r("insertps", 252, src, dst);
1236      DO_imm_mandr_r("insertps", 253, src, dst);
1237      DO_imm_mandr_r("insertps", 254, src, dst);
1238      DO_imm_mandr_r("insertps", 255, src, dst);
1239   }
1240}
1241
1242void test_MPSADBW ( void )
1243{
1244   V128 src, dst;
1245   Int i;
1246   for (i = 0; i < 50; i++) {
1247      randV128(&src);
1248      randV128(&dst);
1249      DO_imm_mandr_r("mpsadbw", 0, src, dst);
1250      DO_imm_mandr_r("mpsadbw", 1, src, dst);
1251      DO_imm_mandr_r("mpsadbw", 2, src, dst);
1252      DO_imm_mandr_r("mpsadbw", 3, src, dst);
1253      DO_imm_mandr_r("mpsadbw", 4, src, dst);
1254      DO_imm_mandr_r("mpsadbw", 5, src, dst);
1255      DO_imm_mandr_r("mpsadbw", 6, src, dst);
1256      DO_imm_mandr_r("mpsadbw", 7, src, dst);
1257   }
1258}
1259
1260void test_PACKUSDW ( void )
1261{
1262   V128 src, dst;
1263   Int i;
1264   for (i = 0; i < 10; i++) {
1265      if (i < 9) {
1266         randV128(&src);
1267         randV128(&dst);
1268      } else {
1269         memset(&src, 0, sizeof(src));
1270         memset(&dst, 0, sizeof(src));
1271         src[0] = 0x11; src[1] = 0x22;
1272         src[4] = 0x33; src[5] = 0x44;
1273         src[8] = 0x55; src[9] = 0x66;
1274         src[12] = 0x77; src[13] = 0x88;
1275         dst[0] = 0xaa; dst[1] = 0xbb;
1276         dst[4] = 0xcc; dst[5] = 0xdd;
1277         dst[8] = 0xee; dst[9] = 0xff;
1278         dst[12] = 0xa1; dst[13] = 0xb2;
1279      }
1280      DO_mandr_r("packusdw", src, dst);
1281   }
1282}
1283
1284void test_PBLENDW ( void )
1285{
1286   V128 src, dst;
1287   randV128(&src);
1288   randV128(&dst);
1289   {
1290      DO_imm_mandr_r("pblendw", 0, src, dst);
1291      DO_imm_mandr_r("pblendw", 1, src, dst);
1292      DO_imm_mandr_r("pblendw", 2, src, dst);
1293      DO_imm_mandr_r("pblendw", 3, src, dst);
1294      DO_imm_mandr_r("pblendw", 4, src, dst);
1295      DO_imm_mandr_r("pblendw", 5, src, dst);
1296      DO_imm_mandr_r("pblendw", 6, src, dst);
1297      DO_imm_mandr_r("pblendw", 7, src, dst);
1298      DO_imm_mandr_r("pblendw", 8, src, dst);
1299      DO_imm_mandr_r("pblendw", 9, src, dst);
1300      DO_imm_mandr_r("pblendw", 10, src, dst);
1301      DO_imm_mandr_r("pblendw", 11, src, dst);
1302      DO_imm_mandr_r("pblendw", 12, src, dst);
1303      DO_imm_mandr_r("pblendw", 13, src, dst);
1304      DO_imm_mandr_r("pblendw", 14, src, dst);
1305      DO_imm_mandr_r("pblendw", 15, src, dst);
1306      DO_imm_mandr_r("pblendw", 16, src, dst);
1307      DO_imm_mandr_r("pblendw", 17, src, dst);
1308      DO_imm_mandr_r("pblendw", 18, src, dst);
1309      DO_imm_mandr_r("pblendw", 19, src, dst);
1310      DO_imm_mandr_r("pblendw", 20, src, dst);
1311      DO_imm_mandr_r("pblendw", 21, src, dst);
1312      DO_imm_mandr_r("pblendw", 22, src, dst);
1313      DO_imm_mandr_r("pblendw", 23, src, dst);
1314      DO_imm_mandr_r("pblendw", 24, src, dst);
1315      DO_imm_mandr_r("pblendw", 25, src, dst);
1316      DO_imm_mandr_r("pblendw", 26, src, dst);
1317      DO_imm_mandr_r("pblendw", 27, src, dst);
1318      DO_imm_mandr_r("pblendw", 28, src, dst);
1319      DO_imm_mandr_r("pblendw", 29, src, dst);
1320      DO_imm_mandr_r("pblendw", 30, src, dst);
1321      DO_imm_mandr_r("pblendw", 31, src, dst);
1322      DO_imm_mandr_r("pblendw", 32, src, dst);
1323      DO_imm_mandr_r("pblendw", 33, src, dst);
1324      DO_imm_mandr_r("pblendw", 34, src, dst);
1325      DO_imm_mandr_r("pblendw", 35, src, dst);
1326      DO_imm_mandr_r("pblendw", 36, src, dst);
1327      DO_imm_mandr_r("pblendw", 37, src, dst);
1328      DO_imm_mandr_r("pblendw", 38, src, dst);
1329      DO_imm_mandr_r("pblendw", 39, src, dst);
1330      DO_imm_mandr_r("pblendw", 40, src, dst);
1331      DO_imm_mandr_r("pblendw", 41, src, dst);
1332      DO_imm_mandr_r("pblendw", 42, src, dst);
1333      DO_imm_mandr_r("pblendw", 43, src, dst);
1334      DO_imm_mandr_r("pblendw", 44, src, dst);
1335      DO_imm_mandr_r("pblendw", 45, src, dst);
1336      DO_imm_mandr_r("pblendw", 46, src, dst);
1337      DO_imm_mandr_r("pblendw", 47, src, dst);
1338      DO_imm_mandr_r("pblendw", 48, src, dst);
1339      DO_imm_mandr_r("pblendw", 49, src, dst);
1340      DO_imm_mandr_r("pblendw", 50, src, dst);
1341      DO_imm_mandr_r("pblendw", 51, src, dst);
1342      DO_imm_mandr_r("pblendw", 52, src, dst);
1343      DO_imm_mandr_r("pblendw", 53, src, dst);
1344      DO_imm_mandr_r("pblendw", 54, src, dst);
1345      DO_imm_mandr_r("pblendw", 55, src, dst);
1346      DO_imm_mandr_r("pblendw", 56, src, dst);
1347      DO_imm_mandr_r("pblendw", 57, src, dst);
1348      DO_imm_mandr_r("pblendw", 58, src, dst);
1349      DO_imm_mandr_r("pblendw", 59, src, dst);
1350      DO_imm_mandr_r("pblendw", 60, src, dst);
1351      DO_imm_mandr_r("pblendw", 61, src, dst);
1352      DO_imm_mandr_r("pblendw", 62, src, dst);
1353      DO_imm_mandr_r("pblendw", 63, src, dst);
1354      DO_imm_mandr_r("pblendw", 64, src, dst);
1355      DO_imm_mandr_r("pblendw", 65, src, dst);
1356      DO_imm_mandr_r("pblendw", 66, src, dst);
1357      DO_imm_mandr_r("pblendw", 67, src, dst);
1358      DO_imm_mandr_r("pblendw", 68, src, dst);
1359      DO_imm_mandr_r("pblendw", 69, src, dst);
1360      DO_imm_mandr_r("pblendw", 70, src, dst);
1361      DO_imm_mandr_r("pblendw", 71, src, dst);
1362      DO_imm_mandr_r("pblendw", 72, src, dst);
1363      DO_imm_mandr_r("pblendw", 73, src, dst);
1364      DO_imm_mandr_r("pblendw", 74, src, dst);
1365      DO_imm_mandr_r("pblendw", 75, src, dst);
1366      DO_imm_mandr_r("pblendw", 76, src, dst);
1367      DO_imm_mandr_r("pblendw", 77, src, dst);
1368      DO_imm_mandr_r("pblendw", 78, src, dst);
1369      DO_imm_mandr_r("pblendw", 79, src, dst);
1370      DO_imm_mandr_r("pblendw", 80, src, dst);
1371      DO_imm_mandr_r("pblendw", 81, src, dst);
1372      DO_imm_mandr_r("pblendw", 82, src, dst);
1373      DO_imm_mandr_r("pblendw", 83, src, dst);
1374      DO_imm_mandr_r("pblendw", 84, src, dst);
1375      DO_imm_mandr_r("pblendw", 85, src, dst);
1376      DO_imm_mandr_r("pblendw", 86, src, dst);
1377      DO_imm_mandr_r("pblendw", 87, src, dst);
1378      DO_imm_mandr_r("pblendw", 88, src, dst);
1379      DO_imm_mandr_r("pblendw", 89, src, dst);
1380      DO_imm_mandr_r("pblendw", 90, src, dst);
1381      DO_imm_mandr_r("pblendw", 91, src, dst);
1382      DO_imm_mandr_r("pblendw", 92, src, dst);
1383      DO_imm_mandr_r("pblendw", 93, src, dst);
1384      DO_imm_mandr_r("pblendw", 94, src, dst);
1385      DO_imm_mandr_r("pblendw", 95, src, dst);
1386      DO_imm_mandr_r("pblendw", 96, src, dst);
1387      DO_imm_mandr_r("pblendw", 97, src, dst);
1388      DO_imm_mandr_r("pblendw", 98, src, dst);
1389      DO_imm_mandr_r("pblendw", 99, src, dst);
1390      DO_imm_mandr_r("pblendw", 100, src, dst);
1391      DO_imm_mandr_r("pblendw", 101, src, dst);
1392      DO_imm_mandr_r("pblendw", 102, src, dst);
1393      DO_imm_mandr_r("pblendw", 103, src, dst);
1394      DO_imm_mandr_r("pblendw", 104, src, dst);
1395      DO_imm_mandr_r("pblendw", 105, src, dst);
1396      DO_imm_mandr_r("pblendw", 106, src, dst);
1397      DO_imm_mandr_r("pblendw", 107, src, dst);
1398      DO_imm_mandr_r("pblendw", 108, src, dst);
1399      DO_imm_mandr_r("pblendw", 109, src, dst);
1400      DO_imm_mandr_r("pblendw", 110, src, dst);
1401      DO_imm_mandr_r("pblendw", 111, src, dst);
1402      DO_imm_mandr_r("pblendw", 112, src, dst);
1403      DO_imm_mandr_r("pblendw", 113, src, dst);
1404      DO_imm_mandr_r("pblendw", 114, src, dst);
1405      DO_imm_mandr_r("pblendw", 115, src, dst);
1406      DO_imm_mandr_r("pblendw", 116, src, dst);
1407      DO_imm_mandr_r("pblendw", 117, src, dst);
1408      DO_imm_mandr_r("pblendw", 118, src, dst);
1409      DO_imm_mandr_r("pblendw", 119, src, dst);
1410      DO_imm_mandr_r("pblendw", 120, src, dst);
1411      DO_imm_mandr_r("pblendw", 121, src, dst);
1412      DO_imm_mandr_r("pblendw", 122, src, dst);
1413      DO_imm_mandr_r("pblendw", 123, src, dst);
1414      DO_imm_mandr_r("pblendw", 124, src, dst);
1415      DO_imm_mandr_r("pblendw", 125, src, dst);
1416      DO_imm_mandr_r("pblendw", 126, src, dst);
1417      DO_imm_mandr_r("pblendw", 127, src, dst);
1418      DO_imm_mandr_r("pblendw", 128, src, dst);
1419      DO_imm_mandr_r("pblendw", 129, src, dst);
1420      DO_imm_mandr_r("pblendw", 130, src, dst);
1421      DO_imm_mandr_r("pblendw", 131, src, dst);
1422      DO_imm_mandr_r("pblendw", 132, src, dst);
1423      DO_imm_mandr_r("pblendw", 133, src, dst);
1424      DO_imm_mandr_r("pblendw", 134, src, dst);
1425      DO_imm_mandr_r("pblendw", 135, src, dst);
1426      DO_imm_mandr_r("pblendw", 136, src, dst);
1427      DO_imm_mandr_r("pblendw", 137, src, dst);
1428      DO_imm_mandr_r("pblendw", 138, src, dst);
1429      DO_imm_mandr_r("pblendw", 139, src, dst);
1430      DO_imm_mandr_r("pblendw", 140, src, dst);
1431      DO_imm_mandr_r("pblendw", 141, src, dst);
1432      DO_imm_mandr_r("pblendw", 142, src, dst);
1433      DO_imm_mandr_r("pblendw", 143, src, dst);
1434      DO_imm_mandr_r("pblendw", 144, src, dst);
1435      DO_imm_mandr_r("pblendw", 145, src, dst);
1436      DO_imm_mandr_r("pblendw", 146, src, dst);
1437      DO_imm_mandr_r("pblendw", 147, src, dst);
1438      DO_imm_mandr_r("pblendw", 148, src, dst);
1439      DO_imm_mandr_r("pblendw", 149, src, dst);
1440      DO_imm_mandr_r("pblendw", 150, src, dst);
1441      DO_imm_mandr_r("pblendw", 151, src, dst);
1442      DO_imm_mandr_r("pblendw", 152, src, dst);
1443      DO_imm_mandr_r("pblendw", 153, src, dst);
1444      DO_imm_mandr_r("pblendw", 154, src, dst);
1445      DO_imm_mandr_r("pblendw", 155, src, dst);
1446      DO_imm_mandr_r("pblendw", 156, src, dst);
1447      DO_imm_mandr_r("pblendw", 157, src, dst);
1448      DO_imm_mandr_r("pblendw", 158, src, dst);
1449      DO_imm_mandr_r("pblendw", 159, src, dst);
1450      DO_imm_mandr_r("pblendw", 160, src, dst);
1451      DO_imm_mandr_r("pblendw", 161, src, dst);
1452      DO_imm_mandr_r("pblendw", 162, src, dst);
1453      DO_imm_mandr_r("pblendw", 163, src, dst);
1454      DO_imm_mandr_r("pblendw", 164, src, dst);
1455      DO_imm_mandr_r("pblendw", 165, src, dst);
1456      DO_imm_mandr_r("pblendw", 166, src, dst);
1457      DO_imm_mandr_r("pblendw", 167, src, dst);
1458      DO_imm_mandr_r("pblendw", 168, src, dst);
1459      DO_imm_mandr_r("pblendw", 169, src, dst);
1460      DO_imm_mandr_r("pblendw", 170, src, dst);
1461      DO_imm_mandr_r("pblendw", 171, src, dst);
1462      DO_imm_mandr_r("pblendw", 172, src, dst);
1463      DO_imm_mandr_r("pblendw", 173, src, dst);
1464      DO_imm_mandr_r("pblendw", 174, src, dst);
1465      DO_imm_mandr_r("pblendw", 175, src, dst);
1466      DO_imm_mandr_r("pblendw", 176, src, dst);
1467      DO_imm_mandr_r("pblendw", 177, src, dst);
1468      DO_imm_mandr_r("pblendw", 178, src, dst);
1469      DO_imm_mandr_r("pblendw", 179, src, dst);
1470      DO_imm_mandr_r("pblendw", 180, src, dst);
1471      DO_imm_mandr_r("pblendw", 181, src, dst);
1472      DO_imm_mandr_r("pblendw", 182, src, dst);
1473      DO_imm_mandr_r("pblendw", 183, src, dst);
1474      DO_imm_mandr_r("pblendw", 184, src, dst);
1475      DO_imm_mandr_r("pblendw", 185, src, dst);
1476      DO_imm_mandr_r("pblendw", 186, src, dst);
1477      DO_imm_mandr_r("pblendw", 187, src, dst);
1478      DO_imm_mandr_r("pblendw", 188, src, dst);
1479      DO_imm_mandr_r("pblendw", 189, src, dst);
1480      DO_imm_mandr_r("pblendw", 190, src, dst);
1481      DO_imm_mandr_r("pblendw", 191, src, dst);
1482      DO_imm_mandr_r("pblendw", 192, src, dst);
1483      DO_imm_mandr_r("pblendw", 193, src, dst);
1484      DO_imm_mandr_r("pblendw", 194, src, dst);
1485      DO_imm_mandr_r("pblendw", 195, src, dst);
1486      DO_imm_mandr_r("pblendw", 196, src, dst);
1487      DO_imm_mandr_r("pblendw", 197, src, dst);
1488      DO_imm_mandr_r("pblendw", 198, src, dst);
1489      DO_imm_mandr_r("pblendw", 199, src, dst);
1490      DO_imm_mandr_r("pblendw", 200, src, dst);
1491      DO_imm_mandr_r("pblendw", 201, src, dst);
1492      DO_imm_mandr_r("pblendw", 202, src, dst);
1493      DO_imm_mandr_r("pblendw", 203, src, dst);
1494      DO_imm_mandr_r("pblendw", 204, src, dst);
1495      DO_imm_mandr_r("pblendw", 205, src, dst);
1496      DO_imm_mandr_r("pblendw", 206, src, dst);
1497      DO_imm_mandr_r("pblendw", 207, src, dst);
1498      DO_imm_mandr_r("pblendw", 208, src, dst);
1499      DO_imm_mandr_r("pblendw", 209, src, dst);
1500      DO_imm_mandr_r("pblendw", 210, src, dst);
1501      DO_imm_mandr_r("pblendw", 211, src, dst);
1502      DO_imm_mandr_r("pblendw", 212, src, dst);
1503      DO_imm_mandr_r("pblendw", 213, src, dst);
1504      DO_imm_mandr_r("pblendw", 214, src, dst);
1505      DO_imm_mandr_r("pblendw", 215, src, dst);
1506      DO_imm_mandr_r("pblendw", 216, src, dst);
1507      DO_imm_mandr_r("pblendw", 217, src, dst);
1508      DO_imm_mandr_r("pblendw", 218, src, dst);
1509      DO_imm_mandr_r("pblendw", 219, src, dst);
1510      DO_imm_mandr_r("pblendw", 220, src, dst);
1511      DO_imm_mandr_r("pblendw", 221, src, dst);
1512      DO_imm_mandr_r("pblendw", 222, src, dst);
1513      DO_imm_mandr_r("pblendw", 223, src, dst);
1514      DO_imm_mandr_r("pblendw", 224, src, dst);
1515      DO_imm_mandr_r("pblendw", 225, src, dst);
1516      DO_imm_mandr_r("pblendw", 226, src, dst);
1517      DO_imm_mandr_r("pblendw", 227, src, dst);
1518      DO_imm_mandr_r("pblendw", 228, src, dst);
1519      DO_imm_mandr_r("pblendw", 229, src, dst);
1520      DO_imm_mandr_r("pblendw", 230, src, dst);
1521      DO_imm_mandr_r("pblendw", 231, src, dst);
1522      DO_imm_mandr_r("pblendw", 232, src, dst);
1523      DO_imm_mandr_r("pblendw", 233, src, dst);
1524      DO_imm_mandr_r("pblendw", 234, src, dst);
1525      DO_imm_mandr_r("pblendw", 235, src, dst);
1526      DO_imm_mandr_r("pblendw", 236, src, dst);
1527      DO_imm_mandr_r("pblendw", 237, src, dst);
1528      DO_imm_mandr_r("pblendw", 238, src, dst);
1529      DO_imm_mandr_r("pblendw", 239, src, dst);
1530      DO_imm_mandr_r("pblendw", 240, src, dst);
1531      DO_imm_mandr_r("pblendw", 241, src, dst);
1532      DO_imm_mandr_r("pblendw", 242, src, dst);
1533      DO_imm_mandr_r("pblendw", 243, src, dst);
1534      DO_imm_mandr_r("pblendw", 244, src, dst);
1535      DO_imm_mandr_r("pblendw", 245, src, dst);
1536      DO_imm_mandr_r("pblendw", 246, src, dst);
1537      DO_imm_mandr_r("pblendw", 247, src, dst);
1538      DO_imm_mandr_r("pblendw", 248, src, dst);
1539      DO_imm_mandr_r("pblendw", 249, src, dst);
1540      DO_imm_mandr_r("pblendw", 250, src, dst);
1541      DO_imm_mandr_r("pblendw", 251, src, dst);
1542      DO_imm_mandr_r("pblendw", 252, src, dst);
1543      DO_imm_mandr_r("pblendw", 253, src, dst);
1544      DO_imm_mandr_r("pblendw", 254, src, dst);
1545      DO_imm_mandr_r("pblendw", 255, src, dst);
1546   }
1547}
1548
1549
1550void test_PCMPEQQ ( void )
1551{
1552   V128 src, dst;
1553   Int i;
1554   for (i = 0; i < 10; i++) {
1555      randV128(&src);
1556      randV128(&dst);
1557      switch (i - 6) {
1558         case 0: memset(&src[0], 0x55, 8);
1559                 memset(&dst[0], 0x55, 8); break;
1560         case 1: memset(&src[8], 0x55, 8);
1561                 memset(&dst[8], 0x55, 8); break;
1562         default:
1563            break;
1564      }
1565      DO_mandr_r("pcmpeqq", src, dst);
1566   }
1567}
1568
1569
1570void test_PEXTRB ( void )
1571{
1572   V128 src;
1573   randV128(&src);
1574   DO_imm_r_to_mandrscalar("pextrb", 0, src, "d");
1575   DO_imm_r_to_mandrscalar("pextrb", 1, src, "d");
1576   DO_imm_r_to_mandrscalar("pextrb", 2, src, "d");
1577   DO_imm_r_to_mandrscalar("pextrb", 3, src, "d");
1578   DO_imm_r_to_mandrscalar("pextrb", 4, src, "d");
1579   DO_imm_r_to_mandrscalar("pextrb", 5, src, "d");
1580   DO_imm_r_to_mandrscalar("pextrb", 6, src, "d");
1581   DO_imm_r_to_mandrscalar("pextrb", 7, src, "d");
1582   DO_imm_r_to_mandrscalar("pextrb", 8, src, "d");
1583   DO_imm_r_to_mandrscalar("pextrb", 9, src, "d");
1584   DO_imm_r_to_mandrscalar("pextrb", 10, src, "d");
1585   DO_imm_r_to_mandrscalar("pextrb", 11, src, "d");
1586   DO_imm_r_to_mandrscalar("pextrb", 12, src, "d");
1587   DO_imm_r_to_mandrscalar("pextrb", 13, src, "d");
1588   DO_imm_r_to_mandrscalar("pextrb", 14, src, "d");
1589   DO_imm_r_to_mandrscalar("pextrb", 15, src, "d");
1590}
1591
1592void test_PINSRB ( void )
1593{
1594   ULong src;
1595   src = randULong();
1596   DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d");
1597   src = randULong();
1598   DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d");
1599   src = randULong();
1600   DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d");
1601   src = randULong();
1602   DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d");
1603   src = randULong();
1604   DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d");
1605   src = randULong();
1606   DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d");
1607   src = randULong();
1608   DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d");
1609   src = randULong();
1610   DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d");
1611   src = randULong();
1612   DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d");
1613   src = randULong();
1614   DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d");
1615   src = randULong();
1616   DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d");
1617   src = randULong();
1618   DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d");
1619   src = randULong();
1620   DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d");
1621   src = randULong();
1622   DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d");
1623   src = randULong();
1624   DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d");
1625   src = randULong();
1626   DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d");
1627}
1628
1629
1630void test_PEXTRW ( void )
1631{
1632   V128 src;
1633   randV128(&src);
1634   DO_imm_r_to_mandrscalar("pextrw", 0, src, "d");
1635   DO_imm_r_to_mandrscalar("pextrw", 1, src, "d");
1636   DO_imm_r_to_mandrscalar("pextrw", 2, src, "d");
1637   DO_imm_r_to_mandrscalar("pextrw", 3, src, "d");
1638   DO_imm_r_to_mandrscalar("pextrw", 4, src, "d");
1639   DO_imm_r_to_mandrscalar("pextrw", 5, src, "d");
1640   DO_imm_r_to_mandrscalar("pextrw", 6, src, "d");
1641   DO_imm_r_to_mandrscalar("pextrw", 7, src, "d");
1642}
1643
1644void test_PINSRW ( void )
1645{
1646   ULong src;
1647   src = randULong();
1648   DO_imm_mandrscalar_to_r("pinsrw", 0, src, "d");
1649   src = randULong();
1650   DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d");
1651   src = randULong();
1652   DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d");
1653   src = randULong();
1654   DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d");
1655   src = randULong();
1656   DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d");
1657   src = randULong();
1658   DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d");
1659   src = randULong();
1660   DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d");
1661   src = randULong();
1662   DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d");
1663}
1664
1665
1666void test_PEXTRD ( void )
1667{
1668   V128 src;
1669   randV128(&src);
1670   DO_imm_r_to_mandrscalar("pextrd", 0, src, "d");
1671   DO_imm_r_to_mandrscalar("pextrd", 1, src, "d");
1672   DO_imm_r_to_mandrscalar("pextrd", 2, src, "d");
1673   DO_imm_r_to_mandrscalar("pextrd", 3, src, "d");
1674}
1675
1676void test_PINSRD ( void )
1677{
1678   ULong src;
1679   src = randULong();
1680   DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d");
1681   src = randULong();
1682   DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d");
1683   src = randULong();
1684   DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d");
1685   src = randULong();
1686   DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d");
1687}
1688
1689
1690void test_PEXTRQ ( void )
1691{
1692   V128 src;
1693   randV128(&src);
1694   DO_imm_r_to_mandrscalar("pextrq", 0, src, "");
1695   DO_imm_r_to_mandrscalar("pextrq", 1, src, "");
1696}
1697
1698void test_PINSRQ ( void )
1699{
1700   ULong src;
1701   src = randULong();
1702   DO_imm_mandrscalar_to_r("pinsrq", 0, src, "");
1703   src = randULong();
1704   DO_imm_mandrscalar_to_r("pinsrq", 1, src, "");
1705}
1706
1707
1708void test_EXTRACTPS ( void )
1709{
1710   V128 src;
1711   randV128(&src);
1712   DO_imm_r_to_mandrscalar("extractps", 0, src, "d");
1713   DO_imm_r_to_mandrscalar("extractps", 1, src, "d");
1714   DO_imm_r_to_mandrscalar("extractps", 2, src, "d");
1715   DO_imm_r_to_mandrscalar("extractps", 3, src, "d");
1716}
1717
1718
1719void test_PHMINPOSUW ( void )
1720{
1721   V128 src, dst;
1722   Int i;
1723   for (i = 0; i < 20; i++) {
1724      randV128(&src);
1725      randV128(&dst);
1726      DO_mandr_r("phminposuw", src, dst);
1727   }
1728   memset(src, 0x55, sizeof(src));
1729   memset(dst, 0xAA, sizeof(dst));
1730   DO_mandr_r("phminposuw", src, dst);
1731}
1732
1733void test_PMAXSB ( void )
1734{
1735   V128 src, dst;
1736   Int i;
1737   for (i = 0; i < 10; i++) {
1738      randV128(&src);
1739      randV128(&dst);
1740      DO_mandr_r("pmaxsb", src, dst);
1741   }
1742}
1743
1744void test_PMAXSD ( void )
1745{
1746   V128 src, dst;
1747   Int i;
1748   for (i = 0; i < 10; i++) {
1749      randV128(&src);
1750      randV128(&dst);
1751      DO_mandr_r("pmaxsd", src, dst);
1752   }
1753}
1754
1755void test_PMAXUD ( void )
1756{
1757   V128 src, dst;
1758   Int i;
1759   for (i = 0; i < 10; i++) {
1760      randV128(&src);
1761      randV128(&dst);
1762      DO_mandr_r("pmaxud", src, dst);
1763   }
1764}
1765
1766void test_PMAXUW ( void )
1767{
1768   V128 src, dst;
1769   Int i;
1770   for (i = 0; i < 10; i++) {
1771      randV128(&src);
1772      randV128(&dst);
1773      DO_mandr_r("pmaxuw", src, dst);
1774   }
1775}
1776
1777void test_PMINSB ( void )
1778{
1779   V128 src, dst;
1780   Int i;
1781   for (i = 0; i < 10; i++) {
1782      randV128(&src);
1783      randV128(&dst);
1784      DO_mandr_r("pminsb", src, dst);
1785   }
1786}
1787
1788void test_PMINSD ( void )
1789{
1790   V128 src, dst;
1791   Int i;
1792   for (i = 0; i < 10; i++) {
1793      randV128(&src);
1794      randV128(&dst);
1795      DO_mandr_r("pminsd", src, dst);
1796   }
1797}
1798
1799void test_PMINUD ( void )
1800{
1801   V128 src, dst;
1802   Int i;
1803   for (i = 0; i < 10; i++) {
1804      randV128(&src);
1805      randV128(&dst);
1806      DO_mandr_r("pminud", src, dst);
1807   }
1808}
1809
1810void test_PMINUW ( void )
1811{
1812   V128 src, dst;
1813   Int i;
1814   for (i = 0; i < 10; i++) {
1815      randV128(&src);
1816      randV128(&dst);
1817      DO_mandr_r("pminuw", src, dst);
1818   }
1819}
1820
1821void test_PMOVSXBW ( void )
1822{
1823   V128 src, dst;
1824   Int i;
1825   for (i = 0; i < 10; i++) {
1826      randV128(&src);
1827      randV128(&dst);
1828      DO_mandr_r("pmovsxbw", src, dst);
1829   }
1830}
1831
1832void test_PMOVSXBD ( void )
1833{
1834   V128 src, dst;
1835   Int i;
1836   for (i = 0; i < 10; i++) {
1837      randV128(&src);
1838      randV128(&dst);
1839      DO_mandr_r("pmovsxbd", src, dst);
1840   }
1841}
1842
1843void test_PMOVSXBQ ( void )
1844{
1845   V128 src, dst;
1846   Int i;
1847   for (i = 0; i < 10; i++) {
1848      randV128(&src);
1849      randV128(&dst);
1850      DO_mandr_r("pmovsxbq", src, dst);
1851   }
1852}
1853
1854void test_PMOVSXWD ( void )
1855{
1856   V128 src, dst;
1857   Int i;
1858   for (i = 0; i < 10; i++) {
1859      randV128(&src);
1860      randV128(&dst);
1861      DO_mandr_r("pmovsxwd", src, dst);
1862   }
1863}
1864
1865void test_PMOVSXWQ ( void )
1866{
1867   V128 src, dst;
1868   Int i;
1869   for (i = 0; i < 10; i++) {
1870      randV128(&src);
1871      randV128(&dst);
1872      DO_mandr_r("pmovsxwq", src, dst);
1873   }
1874}
1875
1876void test_PMOVSXDQ ( void )
1877{
1878   V128 src, dst;
1879   Int i;
1880   for (i = 0; i < 10; i++) {
1881      randV128(&src);
1882      randV128(&dst);
1883      DO_mandr_r("pmovsxdq", src, dst);
1884   }
1885}
1886
1887void test_PMOVZXBW ( void )
1888{
1889   V128 src, dst;
1890   Int i;
1891   for (i = 0; i < 10; i++) {
1892      randV128(&src);
1893      randV128(&dst);
1894      DO_mandr_r("pmovzxbw", src, dst);
1895   }
1896}
1897
1898void test_PMOVZXBD ( void )
1899{
1900   V128 src, dst;
1901   Int i;
1902   for (i = 0; i < 10; i++) {
1903      randV128(&src);
1904      randV128(&dst);
1905      DO_mandr_r("pmovzxbd", src, dst);
1906   }
1907}
1908
1909void test_PMOVZXBQ ( void )
1910{
1911   V128 src, dst;
1912   Int i;
1913   for (i = 0; i < 10; i++) {
1914      randV128(&src);
1915      randV128(&dst);
1916      DO_mandr_r("pmovzxbq", src, dst);
1917   }
1918}
1919
1920void test_PMOVZXWD ( void )
1921{
1922   V128 src, dst;
1923   Int i;
1924   for (i = 0; i < 10; i++) {
1925      randV128(&src);
1926      randV128(&dst);
1927      DO_mandr_r("pmovzxwd", src, dst);
1928   }
1929}
1930
1931void test_PMOVZXWQ ( void )
1932{
1933   V128 src, dst;
1934   Int i;
1935   for (i = 0; i < 10; i++) {
1936      randV128(&src);
1937      randV128(&dst);
1938      DO_mandr_r("pmovzxwq", src, dst);
1939   }
1940}
1941
1942void test_PMOVZXDQ ( void )
1943{
1944   V128 src, dst;
1945   Int i;
1946   for (i = 0; i < 10; i++) {
1947      randV128(&src);
1948      randV128(&dst);
1949      DO_mandr_r("pmovzxdq", src, dst);
1950   }
1951}
1952
1953void test_PMULDQ ( void )
1954{
1955   V128 src, dst;
1956   Int i;
1957   for (i = 0; i < 10; i++) {
1958      randV128(&src);
1959      randV128(&dst);
1960      DO_mandr_r("pmuldq", src, dst);
1961   }
1962}
1963
1964
1965void test_PMULLD ( void )
1966{
1967   V128 src, dst;
1968   Int i;
1969   for (i = 0; i < 10; i++) {
1970      randV128(&src);
1971      randV128(&dst);
1972      DO_mandr_r("pmulld", src, dst);
1973   }
1974}
1975
1976
1977void test_POPCNTQ ( void )
1978{
1979   ULong block[4];
1980   Int i;
1981   ULong oszacp_mask = 0x8D5;
1982   for (i = 0; i < 10; i++) {
1983      block[0] = i == 0 ? 0 : randULong();
1984      block[1] = randULong();
1985      block[2] = randULong();
1986      block[3] = randULong();
1987      __asm__ __volatile__(
1988         "movq %0,       %%rax"  "\n\t"
1989         "movq 0(%%rax), %%rdi"  "\n\t"
1990         "movq 8(%%rax), %%r11"  "\n\t"
1991#ifndef VGP_amd64_darwin
1992         "popcntq %%rdi, %%r11"  "\n\t"
1993#else
1994         "popcnt  %%rdi, %%r11"  "\n\t"
1995#endif
1996         "movq %%r11, 16(%%rax)"  "\n\t"
1997         "pushfq"                 "\n\t"
1998         "popq %%r12"             "\n\t"
1999         "movq %%r12, 24(%%rax)"  "\n"
2000         : /*out*/
2001         : /*in*/"r"(&block[0])
2002         : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2003      );
2004      printf("r popcntq  %016llx %016llx  %016llx %016llx\n",
2005             block[0], block[1], block[2], block[3] & oszacp_mask);
2006
2007      block[0] = i == 0 ? 0 : randULong();
2008      block[1] = randULong();
2009      block[2] = randULong();
2010      block[3] = randULong();
2011      __asm__ __volatile__(
2012         "movq %0,       %%rax"  "\n\t"
2013         "movq 8(%%rax), %%r11"  "\n\t"
2014#ifndef VGP_amd64_darwin
2015         "popcntq 0(%%rax), %%r11"  "\n\t"
2016#else
2017         "popcnt  0(%%rax), %%r11"  "\n\t"
2018#endif
2019         "movq %%r11, 16(%%rax)"  "\n\t"
2020         "pushfq"                 "\n\t"
2021         "popq %%r12"             "\n\t"
2022         "movq %%r12, 24(%%rax)"  "\n"
2023         : /*out*/
2024         : /*in*/"r"(&block[0])
2025         : /*trash*/ "cc", "memory", "r11", "r12"
2026      );
2027      printf("m popcntq  %016llx %016llx  %016llx %016llx\n",
2028             block[0], block[1], block[2], block[3] & oszacp_mask);
2029   }
2030}
2031
2032
2033void test_POPCNTL ( void )
2034{
2035   ULong block[4];
2036   Int i;
2037   ULong oszacp_mask = 0x8D5;
2038   for (i = 0; i < 10; i++) {
2039      block[0] = i == 0 ? 0 : randULong();
2040      block[1] = randULong();
2041      block[2] = randULong();
2042      block[3] = randULong();
2043      __asm__ __volatile__(
2044         "movq %0,       %%rax"  "\n\t"
2045         "movq 0(%%rax), %%rdi"  "\n\t"
2046         "movq 8(%%rax), %%r11"  "\n\t"
2047#ifndef VGP_amd64_darwin
2048         "popcntl %%edi, %%r11d"  "\n\t"
2049#else
2050         "popcnt  %%edi, %%r11d"  "\n\t"
2051#endif
2052         "movq %%r11, 16(%%rax)"  "\n\t"
2053         "pushfq"                 "\n\t"
2054         "popq %%r12"             "\n\t"
2055         "movq %%r12, 24(%%rax)"  "\n"
2056         : /*out*/
2057         : /*in*/"r"(&block[0])
2058         : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2059      );
2060      printf("r popcntl  %016llx %016llx  %016llx %016llx\n",
2061             block[0], block[1], block[2], block[3] & oszacp_mask);
2062
2063      block[0] = i == 0 ? 0 : randULong();
2064      block[1] = randULong();
2065      block[2] = randULong();
2066      block[3] = randULong();
2067      __asm__ __volatile__(
2068         "movq %0,       %%rax"  "\n\t"
2069         "movq 8(%%rax), %%r11"  "\n\t"
2070#ifndef VGP_amd64_darwin
2071         "popcntl 0(%%rax), %%r11d"  "\n\t"
2072#else
2073         "popcnt  0(%%rax), %%r11d"  "\n\t"
2074#endif
2075         "movq %%r11, 16(%%rax)"  "\n\t"
2076         "pushfq"                 "\n\t"
2077         "popq %%r12"             "\n\t"
2078         "movq %%r12, 24(%%rax)"  "\n"
2079         : /*out*/
2080         : /*in*/"r"(&block[0])
2081         : /*trash*/ "cc", "memory", "r11", "r12"
2082      );
2083      printf("m popcntl  %016llx %016llx  %016llx %016llx\n",
2084             block[0], block[1], block[2], block[3] & oszacp_mask);
2085   }
2086}
2087
2088
2089void test_POPCNTW ( void )
2090{
2091   ULong block[4];
2092   Int i;
2093   ULong oszacp_mask = 0x8D5;
2094   for (i = 0; i < 10; i++) {
2095      block[0] = i == 0 ? 0 : randULong();
2096      block[1] = randULong();
2097      block[2] = randULong();
2098      block[3] = randULong();
2099      __asm__ __volatile__(
2100         "movq %0,       %%rax"  "\n\t"
2101         "movq 0(%%rax), %%rdi"  "\n\t"
2102         "movq 8(%%rax), %%r11"  "\n\t"
2103#ifndef VGP_amd64_darwin
2104         "popcntw %%di,  %%r11w"  "\n\t"
2105#else
2106         "popcnt  %%di,  %%r11w"  "\n\t"
2107#endif
2108         "movq %%r11, 16(%%rax)"  "\n\t"
2109         "pushfq"                 "\n\t"
2110         "popq %%r12"             "\n\t"
2111         "movq %%r12, 24(%%rax)"  "\n"
2112         : /*out*/
2113         : /*in*/"r"(&block[0])
2114         : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2115      );
2116      printf("r popcntw  %016llx %016llx  %016llx %016llx\n",
2117             block[0], block[1], block[2], block[3] & oszacp_mask);
2118
2119      block[0] = i == 0 ? 0 : randULong();
2120      block[1] = randULong();
2121      block[2] = randULong();
2122      block[3] = randULong();
2123      __asm__ __volatile__(
2124         "movq %0,       %%rax"  "\n\t"
2125         "movq 8(%%rax), %%r11"  "\n\t"
2126#ifndef VGP_amd64_darwin
2127         "popcntw 0(%%rax), %%r11w"  "\n\t"
2128#else
2129         "popcnt  0(%%rax), %%r11w"  "\n\t"
2130#endif
2131         "movq %%r11, 16(%%rax)"  "\n\t"
2132         "pushfq"                 "\n\t"
2133         "popq %%r12"             "\n\t"
2134         "movq %%r12, 24(%%rax)"  "\n"
2135         : /*out*/
2136         : /*in*/"r"(&block[0])
2137         : /*trash*/ "cc", "memory", "r11", "r12"
2138      );
2139      printf("m popcntw  %016llx %016llx  %016llx %016llx\n",
2140             block[0], block[1], block[2], block[3] & oszacp_mask);
2141   }
2142}
2143
2144
2145void test_PCMPGTQ ( void )
2146{
2147   V128 spec[7];
2148   do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL );
2149   do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL );
2150   do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL );
2151   do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL );
2152   do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL );
2153   do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL );
2154   do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL );
2155
2156   V128 src, dst;
2157   Int i, j;
2158   for (i = 0; i < 10; i++) {
2159      randV128(&src);
2160      randV128(&dst);
2161      DO_mandr_r("pcmpgtq", src, dst);
2162   }
2163   for (i = 0; i < 7; i++) {
2164      for (j = 0; j < 7; j++) {
2165         memcpy(&src, &spec[i], 16);
2166         memcpy(&dst, &spec[j], 16);
2167         DO_mandr_r("pcmpgtq", src, dst);
2168      }
2169   }
2170}
2171
2172/* ------------ ROUNDSD ------------ */
2173
2174void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
2175{
2176   if (mem) {
2177      __asm__ __volatile__(
2178         "movupd  (%1), %%xmm11"       "\n\t"
2179         "roundsd $0, (%0), %%xmm11"   "\n\t"
2180         "movupd  %%xmm11, (%1)"       "\n"
2181         : /*OUT*/
2182         : /*IN*/ "r"(src), "r"(dst)
2183         : /*TRASH*/ "xmm11"
2184      );
2185   } else {
2186      __asm__ __volatile__(
2187         "movupd  (%1), %%xmm11"         "\n\t"
2188         "movupd  (%0), %%xmm2"          "\n\t"
2189         "roundsd $0, %%xmm2, %%xmm11"   "\n\t"
2190         "movupd  %%xmm11, (%1)"         "\n"
2191         : /*OUT*/
2192         : /*IN*/ "r"(src), "r"(dst)
2193         : /*TRASH*/ "xmm11","xmm2"
2194      );
2195   }
2196}
2197
2198void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
2199{
2200   if (mem) {
2201      __asm__ __volatile__(
2202         "movupd  (%1), %%xmm11"       "\n\t"
2203         "roundsd $1, (%0), %%xmm11"   "\n\t"
2204         "movupd  %%xmm11, (%1)"       "\n"
2205         : /*OUT*/
2206         : /*IN*/ "r"(src), "r"(dst)
2207         : /*TRASH*/ "xmm11"
2208      );
2209   } else {
2210      __asm__ __volatile__(
2211         "movupd  (%1), %%xmm11"         "\n\t"
2212         "movupd  (%0), %%xmm2"          "\n\t"
2213         "roundsd $1, %%xmm2, %%xmm11"   "\n\t"
2214         "movupd  %%xmm11, (%1)"         "\n"
2215         : /*OUT*/
2216         : /*IN*/ "r"(src), "r"(dst)
2217         : /*TRASH*/ "xmm11","xmm2"
2218      );
2219   }
2220}
2221
2222void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
2223{
2224   if (mem) {
2225      __asm__ __volatile__(
2226         "movupd  (%1), %%xmm11"       "\n\t"
2227         "roundsd $2, (%0), %%xmm11"   "\n\t"
2228         "movupd  %%xmm11, (%1)"       "\n"
2229         : /*OUT*/
2230         : /*IN*/ "r"(src), "r"(dst)
2231         : /*TRASH*/ "xmm11"
2232      );
2233   } else {
2234      __asm__ __volatile__(
2235         "movupd  (%1), %%xmm11"         "\n\t"
2236         "movupd  (%0), %%xmm2"          "\n\t"
2237         "roundsd $2, %%xmm2, %%xmm11"   "\n\t"
2238         "movupd  %%xmm11, (%1)"         "\n"
2239         : /*OUT*/
2240         : /*IN*/ "r"(src), "r"(dst)
2241         : /*TRASH*/ "xmm11","xmm2"
2242      );
2243   }
2244}
2245
2246void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
2247{
2248   if (mem) {
2249      __asm__ __volatile__(
2250         "movupd  (%1), %%xmm11"       "\n\t"
2251         "roundsd $3, (%0), %%xmm11"   "\n\t"
2252         "movupd  %%xmm11, (%1)"       "\n"
2253         : /*OUT*/
2254         : /*IN*/ "r"(src), "r"(dst)
2255         : /*TRASH*/ "xmm11"
2256      );
2257   } else {
2258      __asm__ __volatile__(
2259         "movupd  (%1), %%xmm11"         "\n\t"
2260         "movupd  (%0), %%xmm2"          "\n\t"
2261         "roundsd $3, %%xmm2, %%xmm11"   "\n\t"
2262         "movupd  %%xmm11, (%1)"         "\n"
2263         : /*OUT*/
2264         : /*IN*/ "r"(src), "r"(dst)
2265         : /*TRASH*/ "xmm11","xmm2"
2266      );
2267   }
2268}
2269
2270void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
2271{
2272   if (mem) {
2273      __asm__ __volatile__(
2274         "movupd  (%1), %%xmm11"       "\n\t"
2275         "roundsd $4, (%0), %%xmm11"   "\n\t"
2276         "movupd  %%xmm11, (%1)"       "\n"
2277         : /*OUT*/
2278         : /*IN*/ "r"(src), "r"(dst)
2279         : /*TRASH*/ "xmm11"
2280      );
2281   } else {
2282      __asm__ __volatile__(
2283         "movupd  (%1), %%xmm11"         "\n\t"
2284         "movupd  (%0), %%xmm2"          "\n\t"
2285         "roundsd $4, %%xmm2, %%xmm11"   "\n\t"
2286         "movupd  %%xmm11, (%1)"         "\n"
2287         : /*OUT*/
2288         : /*IN*/ "r"(src), "r"(dst)
2289         : /*TRASH*/ "xmm11","xmm2"
2290      );
2291   }
2292}
2293
2294void test_ROUNDSD_w_immediate_rounding ( void )
2295{
2296   double vals[22];
2297   Int i = 0;
2298   vals[i++] = 0.0;
2299   vals[i++] = -0.0;
2300   vals[i++] = mkPosInf();
2301   vals[i++] = mkNegInf();
2302   vals[i++] = mkPosNan();
2303   vals[i++] = mkNegNan();
2304   vals[i++] = -1.3;
2305   vals[i++] = -1.1;
2306   vals[i++] = -0.9;
2307   vals[i++] = -0.7;
2308   vals[i++] = -0.50001;
2309   vals[i++] = -0.49999;
2310   vals[i++] = -0.3;
2311   vals[i++] = -0.1;
2312   vals[i++] = 0.1;
2313   vals[i++] = 0.3;
2314   vals[i++] = 0.49999;
2315   vals[i++] = 0.50001;
2316   vals[i++] = 0.7;
2317   vals[i++] = 0.9;
2318   vals[i++] = 1.1;
2319   vals[i++] = 1.3;
2320   assert(i == 22);
2321
2322   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2323      V128 src, dst;
2324
2325      randV128(&src);
2326      randV128(&dst);
2327      memcpy(&src[0], &vals[i], 8);
2328      do_ROUNDSD_000(False/*reg*/, &src, &dst);
2329      printf("r roundsd_000  ");
2330      showV128(&src);
2331      printf(" ");
2332      showV128(&dst);
2333      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2334      printf("\n");
2335
2336      randV128(&src);
2337      randV128(&dst);
2338      memcpy(&src[0], &vals[i], 8);
2339      do_ROUNDSD_000(True/*mem*/, &src, &dst);
2340      printf("m roundsd_000  ");
2341      showV128(&src);
2342      printf(" ");
2343      showV128(&dst);
2344      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2345      printf("\n");
2346
2347
2348      randV128(&src);
2349      randV128(&dst);
2350      memcpy(&src[0], &vals[i], 8);
2351      do_ROUNDSD_001(False/*reg*/, &src, &dst);
2352      printf("r roundsd_001  ");
2353      showV128(&src);
2354      printf(" ");
2355      showV128(&dst);
2356      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2357      printf("\n");
2358
2359      randV128(&src);
2360      randV128(&dst);
2361      memcpy(&src[0], &vals[i], 8);
2362      do_ROUNDSD_001(True/*mem*/, &src, &dst);
2363      printf("m roundsd_001  ");
2364      showV128(&src);
2365      printf(" ");
2366      showV128(&dst);
2367      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2368      printf("\n");
2369
2370
2371      randV128(&src);
2372      randV128(&dst);
2373      memcpy(&src[0], &vals[i], 8);
2374      do_ROUNDSD_010(False/*reg*/, &src, &dst);
2375      printf("r roundsd_010  ");
2376      showV128(&src);
2377      printf(" ");
2378      showV128(&dst);
2379      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2380      printf("\n");
2381
2382      randV128(&src);
2383      randV128(&dst);
2384      memcpy(&src[0], &vals[i], 8);
2385      do_ROUNDSD_010(True/*mem*/, &src, &dst);
2386      printf("m roundsd_010  ");
2387      showV128(&src);
2388      printf(" ");
2389      showV128(&dst);
2390      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2391      printf("\n");
2392
2393
2394      randV128(&src);
2395      randV128(&dst);
2396      memcpy(&src[0], &vals[i], 8);
2397      do_ROUNDSD_011(False/*reg*/, &src, &dst);
2398      printf("r roundsd_011  ");
2399      showV128(&src);
2400      printf(" ");
2401      showV128(&dst);
2402      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2403      printf("\n");
2404
2405      randV128(&src);
2406      randV128(&dst);
2407      memcpy(&src[0], &vals[i], 8);
2408      do_ROUNDSD_011(True/*mem*/, &src, &dst);
2409      printf("m roundsd_011  ");
2410      showV128(&src);
2411      printf(" ");
2412      showV128(&dst);
2413      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2414      printf("\n");
2415   }
2416}
2417
2418void test_ROUNDSD_w_mxcsr_rounding ( void )
2419{
2420   UInt rm;
2421   double vals[22];
2422   Int i = 0;
2423   vals[i++] = 0.0;
2424   vals[i++] = -0.0;
2425   vals[i++] = mkPosInf();
2426   vals[i++] = mkNegInf();
2427   vals[i++] = mkPosNan();
2428   vals[i++] = mkNegNan();
2429   vals[i++] = -1.3;
2430   vals[i++] = -1.1;
2431   vals[i++] = -0.9;
2432   vals[i++] = -0.7;
2433   vals[i++] = -0.50001;
2434   vals[i++] = -0.49999;
2435   vals[i++] = -0.3;
2436   vals[i++] = -0.1;
2437   vals[i++] = 0.1;
2438   vals[i++] = 0.3;
2439   vals[i++] = 0.49999;
2440   vals[i++] = 0.50001;
2441   vals[i++] = 0.7;
2442   vals[i++] = 0.9;
2443   vals[i++] = 1.1;
2444   vals[i++] = 1.3;
2445   assert(i == 22);
2446
2447   rm = get_sse_roundingmode();
2448   assert(rm == 0); // 0 == RN == default
2449
2450   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2451      V128 src, dst;
2452
2453      for (rm = 0; rm <= 3; rm++) {
2454         set_sse_roundingmode(rm);
2455
2456         randV128(&src);
2457         randV128(&dst);
2458         memcpy(&src[0], &vals[i], 8);
2459         do_ROUNDSD_1XX(False/*reg*/, &src, &dst);
2460         printf("r (rm=%u) roundsd_1XX  ", rm);
2461         showV128(&src);
2462         printf(" ");
2463         showV128(&dst);
2464         printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2465         printf("\n");
2466
2467         randV128(&src);
2468         randV128(&dst);
2469         memcpy(&src[0], &vals[i], 8);
2470         do_ROUNDSD_1XX(True/*mem*/, &src, &dst);
2471         printf("m (rm=%u) roundsd_1XX  ", rm);
2472         showV128(&src);
2473         printf(" ");
2474         showV128(&dst);
2475         printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2476         printf("\n");
2477      }
2478   }
2479
2480   rm = get_sse_roundingmode();
2481   assert(rm == 3);
2482   set_sse_roundingmode(0);
2483   rm = get_sse_roundingmode();
2484   assert(rm == 0); // 0 == RN == default
2485}
2486
2487
2488/* ------------ ROUNDSS ------------ */
2489
2490void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
2491{
2492   if (mem) {
2493      __asm__ __volatile__(
2494         "movupd  (%1), %%xmm11"       "\n\t"
2495         "roundss $0, (%0), %%xmm11"   "\n\t"
2496         "movupd  %%xmm11, (%1)"       "\n"
2497         : /*OUT*/
2498         : /*IN*/ "r"(src), "r"(dst)
2499         : /*TRASH*/ "xmm11"
2500      );
2501   } else {
2502      __asm__ __volatile__(
2503         "movupd  (%1), %%xmm11"         "\n\t"
2504         "movupd  (%0), %%xmm2"          "\n\t"
2505         "roundss $0, %%xmm2, %%xmm11"   "\n\t"
2506         "movupd  %%xmm11, (%1)"         "\n"
2507         : /*OUT*/
2508         : /*IN*/ "r"(src), "r"(dst)
2509         : /*TRASH*/ "xmm11","xmm2"
2510      );
2511   }
2512}
2513
2514void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
2515{
2516   if (mem) {
2517      __asm__ __volatile__(
2518         "movupd  (%1), %%xmm11"       "\n\t"
2519         "roundss $1, (%0), %%xmm11"   "\n\t"
2520         "movupd  %%xmm11, (%1)"       "\n"
2521         : /*OUT*/
2522         : /*IN*/ "r"(src), "r"(dst)
2523         : /*TRASH*/ "xmm11"
2524      );
2525   } else {
2526      __asm__ __volatile__(
2527         "movupd  (%1), %%xmm11"         "\n\t"
2528         "movupd  (%0), %%xmm2"          "\n\t"
2529         "roundss $1, %%xmm2, %%xmm11"   "\n\t"
2530         "movupd  %%xmm11, (%1)"         "\n"
2531         : /*OUT*/
2532         : /*IN*/ "r"(src), "r"(dst)
2533         : /*TRASH*/ "xmm11","xmm2"
2534      );
2535   }
2536}
2537
2538void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
2539{
2540   if (mem) {
2541      __asm__ __volatile__(
2542         "movupd  (%1), %%xmm11"       "\n\t"
2543         "roundss $2, (%0), %%xmm11"   "\n\t"
2544         "movupd  %%xmm11, (%1)"       "\n"
2545         : /*OUT*/
2546         : /*IN*/ "r"(src), "r"(dst)
2547         : /*TRASH*/ "xmm11"
2548      );
2549   } else {
2550      __asm__ __volatile__(
2551         "movupd  (%1), %%xmm11"         "\n\t"
2552         "movupd  (%0), %%xmm2"          "\n\t"
2553         "roundss $2, %%xmm2, %%xmm11"   "\n\t"
2554         "movupd  %%xmm11, (%1)"         "\n"
2555         : /*OUT*/
2556         : /*IN*/ "r"(src), "r"(dst)
2557         : /*TRASH*/ "xmm11","xmm2"
2558      );
2559   }
2560}
2561
2562void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
2563{
2564   if (mem) {
2565      __asm__ __volatile__(
2566         "movupd  (%1), %%xmm11"       "\n\t"
2567         "roundss $3, (%0), %%xmm11"   "\n\t"
2568         "movupd  %%xmm11, (%1)"       "\n"
2569         : /*OUT*/
2570         : /*IN*/ "r"(src), "r"(dst)
2571         : /*TRASH*/ "xmm11"
2572      );
2573   } else {
2574      __asm__ __volatile__(
2575         "movupd  (%1), %%xmm11"         "\n\t"
2576         "movupd  (%0), %%xmm2"          "\n\t"
2577         "roundss $3, %%xmm2, %%xmm11"   "\n\t"
2578         "movupd  %%xmm11, (%1)"         "\n"
2579         : /*OUT*/
2580         : /*IN*/ "r"(src), "r"(dst)
2581         : /*TRASH*/ "xmm11","xmm2"
2582      );
2583   }
2584}
2585
2586void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
2587{
2588   if (mem) {
2589      __asm__ __volatile__(
2590         "movupd  (%1), %%xmm11"       "\n\t"
2591         "roundss $4, (%0), %%xmm11"   "\n\t"
2592         "movupd  %%xmm11, (%1)"       "\n"
2593         : /*OUT*/
2594         : /*IN*/ "r"(src), "r"(dst)
2595         : /*TRASH*/ "xmm11"
2596      );
2597   } else {
2598      __asm__ __volatile__(
2599         "movupd  (%1), %%xmm11"         "\n\t"
2600         "movupd  (%0), %%xmm2"          "\n\t"
2601         "roundss $4, %%xmm2, %%xmm11"   "\n\t"
2602         "movupd  %%xmm11, (%1)"         "\n"
2603         : /*OUT*/
2604         : /*IN*/ "r"(src), "r"(dst)
2605         : /*TRASH*/ "xmm11","xmm2"
2606      );
2607   }
2608}
2609
2610void test_ROUNDSS_w_immediate_rounding ( void )
2611{
2612   float vals[22];
2613   Int i = 0;
2614   vals[i++] = 0.0;
2615   vals[i++] = -0.0;
2616   vals[i++] = mkPosInf();
2617   vals[i++] = mkNegInf();
2618   vals[i++] = mkPosNan();
2619   vals[i++] = mkNegNan();
2620   vals[i++] = -1.3;
2621   vals[i++] = -1.1;
2622   vals[i++] = -0.9;
2623   vals[i++] = -0.7;
2624   vals[i++] = -0.50001;
2625   vals[i++] = -0.49999;
2626   vals[i++] = -0.3;
2627   vals[i++] = -0.1;
2628   vals[i++] = 0.1;
2629   vals[i++] = 0.3;
2630   vals[i++] = 0.49999;
2631   vals[i++] = 0.50001;
2632   vals[i++] = 0.7;
2633   vals[i++] = 0.9;
2634   vals[i++] = 1.1;
2635   vals[i++] = 1.3;
2636   assert(i == 22);
2637
2638   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2639      V128 src, dst;
2640
2641      randV128(&src);
2642      randV128(&dst);
2643      memcpy(&src[0], &vals[i], 4);
2644      do_ROUNDSS_000(False/*reg*/, &src, &dst);
2645      printf("r roundss_000  ");
2646      showV128(&src);
2647      printf(" ");
2648      showV128(&dst);
2649      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2650      printf("\n");
2651
2652      randV128(&src);
2653      randV128(&dst);
2654      memcpy(&src[0], &vals[i], 4);
2655      do_ROUNDSS_000(True/*mem*/, &src, &dst);
2656      printf("m roundss_000  ");
2657      showV128(&src);
2658      printf(" ");
2659      showV128(&dst);
2660      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2661      printf("\n");
2662
2663
2664      randV128(&src);
2665      randV128(&dst);
2666      memcpy(&src[0], &vals[i], 4);
2667      do_ROUNDSS_001(False/*reg*/, &src, &dst);
2668      printf("r roundss_001  ");
2669      showV128(&src);
2670      printf(" ");
2671      showV128(&dst);
2672      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2673      printf("\n");
2674
2675      randV128(&src);
2676      randV128(&dst);
2677      memcpy(&src[0], &vals[i], 4);
2678      do_ROUNDSS_001(True/*mem*/, &src, &dst);
2679      printf("m roundss_001  ");
2680      showV128(&src);
2681      printf(" ");
2682      showV128(&dst);
2683      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2684      printf("\n");
2685
2686
2687      randV128(&src);
2688      randV128(&dst);
2689      memcpy(&src[0], &vals[i], 4);
2690      do_ROUNDSS_010(False/*reg*/, &src, &dst);
2691      printf("r roundss_010  ");
2692      showV128(&src);
2693      printf(" ");
2694      showV128(&dst);
2695      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2696      printf("\n");
2697
2698      randV128(&src);
2699      randV128(&dst);
2700      memcpy(&src[0], &vals[i], 4);
2701      do_ROUNDSS_010(True/*mem*/, &src, &dst);
2702      printf("m roundss_010  ");
2703      showV128(&src);
2704      printf(" ");
2705      showV128(&dst);
2706      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2707      printf("\n");
2708
2709
2710      randV128(&src);
2711      randV128(&dst);
2712      memcpy(&src[0], &vals[i], 4);
2713      do_ROUNDSS_011(False/*reg*/, &src, &dst);
2714      printf("r roundss_011  ");
2715      showV128(&src);
2716      printf(" ");
2717      showV128(&dst);
2718      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2719      printf("\n");
2720
2721      randV128(&src);
2722      randV128(&dst);
2723      memcpy(&src[0], &vals[i], 4);
2724      do_ROUNDSS_011(True/*mem*/, &src, &dst);
2725      printf("m roundss_011  ");
2726      showV128(&src);
2727      printf(" ");
2728      showV128(&dst);
2729      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2730      printf("\n");
2731   }
2732}
2733
2734void test_ROUNDSS_w_mxcsr_rounding ( void )
2735{
2736   UInt rm;
2737   float vals[22];
2738   Int i = 0;
2739   vals[i++] = 0.0;
2740   vals[i++] = -0.0;
2741   vals[i++] = mkPosInf();
2742   vals[i++] = mkNegInf();
2743   vals[i++] = mkPosNan();
2744   vals[i++] = mkNegNan();
2745   vals[i++] = -1.3;
2746   vals[i++] = -1.1;
2747   vals[i++] = -0.9;
2748   vals[i++] = -0.7;
2749   vals[i++] = -0.50001;
2750   vals[i++] = -0.49999;
2751   vals[i++] = -0.3;
2752   vals[i++] = -0.1;
2753   vals[i++] = 0.1;
2754   vals[i++] = 0.3;
2755   vals[i++] = 0.49999;
2756   vals[i++] = 0.50001;
2757   vals[i++] = 0.7;
2758   vals[i++] = 0.9;
2759   vals[i++] = 1.1;
2760   vals[i++] = 1.3;
2761   assert(i == 22);
2762
2763   rm = get_sse_roundingmode();
2764   assert(rm == 0); // 0 == RN == default
2765
2766   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2767      V128 src, dst;
2768
2769      for (rm = 0; rm <= 3; rm++) {
2770         set_sse_roundingmode(rm);
2771
2772         randV128(&src);
2773         randV128(&dst);
2774         memcpy(&src[0], &vals[i], 4);
2775         do_ROUNDSS_1XX(False/*reg*/, &src, &dst);
2776         printf("r (rm=%u) roundss_1XX  ", rm);
2777         showV128(&src);
2778         printf(" ");
2779         showV128(&dst);
2780         printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2781         printf("\n");
2782
2783         randV128(&src);
2784         randV128(&dst);
2785         memcpy(&src[0], &vals[i], 4);
2786         do_ROUNDSS_1XX(True/*mem*/, &src, &dst);
2787         printf("m (rm=%u) roundss_1XX  ", rm);
2788         showV128(&src);
2789         printf(" ");
2790         showV128(&dst);
2791         printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2792         printf("\n");
2793      }
2794   }
2795
2796   rm = get_sse_roundingmode();
2797   assert(rm == 3);
2798   set_sse_roundingmode(0);
2799   rm = get_sse_roundingmode();
2800   assert(rm == 0); // 0 == RN == default
2801}
2802
2803/* ------------ ROUNDPD ------------ */
2804
2805void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
2806{
2807   if (mem) {
2808      __asm__ __volatile__(
2809         "movupd  (%1), %%xmm11"       "\n\t"
2810         "roundpd $0, (%0), %%xmm11"   "\n\t"
2811         "movupd  %%xmm11, (%1)"       "\n"
2812         : /*OUT*/
2813         : /*IN*/ "r"(src), "r"(dst)
2814         : /*TRASH*/ "xmm11"
2815      );
2816   } else {
2817      __asm__ __volatile__(
2818         "movupd  (%1), %%xmm11"         "\n\t"
2819         "movupd  (%0), %%xmm2"          "\n\t"
2820         "roundpd $0, %%xmm2, %%xmm11"   "\n\t"
2821         "movupd  %%xmm11, (%1)"         "\n"
2822         : /*OUT*/
2823         : /*IN*/ "r"(src), "r"(dst)
2824         : /*TRASH*/ "xmm11","xmm2"
2825      );
2826   }
2827}
2828
2829void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
2830{
2831   if (mem) {
2832      __asm__ __volatile__(
2833         "movupd  (%1), %%xmm11"       "\n\t"
2834         "roundpd $1, (%0), %%xmm11"   "\n\t"
2835         "movupd  %%xmm11, (%1)"       "\n"
2836         : /*OUT*/
2837         : /*IN*/ "r"(src), "r"(dst)
2838         : /*TRASH*/ "xmm11"
2839      );
2840   } else {
2841      __asm__ __volatile__(
2842         "movupd  (%1), %%xmm11"         "\n\t"
2843         "movupd  (%0), %%xmm2"          "\n\t"
2844         "roundpd $1, %%xmm2, %%xmm11"   "\n\t"
2845         "movupd  %%xmm11, (%1)"         "\n"
2846         : /*OUT*/
2847         : /*IN*/ "r"(src), "r"(dst)
2848         : /*TRASH*/ "xmm11","xmm2"
2849      );
2850   }
2851}
2852
2853void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
2854{
2855   if (mem) {
2856      __asm__ __volatile__(
2857         "movupd  (%1), %%xmm11"       "\n\t"
2858         "roundpd $2, (%0), %%xmm11"   "\n\t"
2859         "movupd  %%xmm11, (%1)"       "\n"
2860         : /*OUT*/
2861         : /*IN*/ "r"(src), "r"(dst)
2862         : /*TRASH*/ "xmm11"
2863      );
2864   } else {
2865      __asm__ __volatile__(
2866         "movupd  (%1), %%xmm11"         "\n\t"
2867         "movupd  (%0), %%xmm2"          "\n\t"
2868         "roundpd $2, %%xmm2, %%xmm11"   "\n\t"
2869         "movupd  %%xmm11, (%1)"         "\n"
2870         : /*OUT*/
2871         : /*IN*/ "r"(src), "r"(dst)
2872         : /*TRASH*/ "xmm11","xmm2"
2873      );
2874   }
2875}
2876
2877void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
2878{
2879   if (mem) {
2880      __asm__ __volatile__(
2881         "movupd  (%1), %%xmm11"       "\n\t"
2882         "roundpd $3, (%0), %%xmm11"   "\n\t"
2883         "movupd  %%xmm11, (%1)"       "\n"
2884         : /*OUT*/
2885         : /*IN*/ "r"(src), "r"(dst)
2886         : /*TRASH*/ "xmm11"
2887      );
2888   } else {
2889      __asm__ __volatile__(
2890         "movupd  (%1), %%xmm11"         "\n\t"
2891         "movupd  (%0), %%xmm2"          "\n\t"
2892         "roundpd $3, %%xmm2, %%xmm11"   "\n\t"
2893         "movupd  %%xmm11, (%1)"         "\n"
2894         : /*OUT*/
2895         : /*IN*/ "r"(src), "r"(dst)
2896         : /*TRASH*/ "xmm11","xmm2"
2897      );
2898   }
2899}
2900
2901void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
2902{
2903   if (mem) {
2904      __asm__ __volatile__(
2905         "movupd  (%1), %%xmm11"       "\n\t"
2906         "roundpd $4, (%0), %%xmm11"   "\n\t"
2907         "movupd  %%xmm11, (%1)"       "\n"
2908         : /*OUT*/
2909         : /*IN*/ "r"(src), "r"(dst)
2910         : /*TRASH*/ "xmm11"
2911      );
2912   } else {
2913      __asm__ __volatile__(
2914         "movupd  (%1), %%xmm11"         "\n\t"
2915         "movupd  (%0), %%xmm2"          "\n\t"
2916         "roundpd $4, %%xmm2, %%xmm11"   "\n\t"
2917         "movupd  %%xmm11, (%1)"         "\n"
2918         : /*OUT*/
2919         : /*IN*/ "r"(src), "r"(dst)
2920         : /*TRASH*/ "xmm11","xmm2"
2921      );
2922   }
2923}
2924
2925void test_ROUNDPD_w_immediate_rounding ( void )
2926{
2927   double vals[22];
2928   Int i = 0;
2929   vals[i++] = 0.0;
2930   vals[i++] = -0.0;
2931   vals[i++] = mkPosInf();
2932   vals[i++] = mkNegInf();
2933   vals[i++] = mkPosNan();
2934   vals[i++] = mkNegNan();
2935   vals[i++] = -1.3;
2936   vals[i++] = -1.1;
2937   vals[i++] = -0.9;
2938   vals[i++] = -0.7;
2939   vals[i++] = -0.50001;
2940   vals[i++] = -0.49999;
2941   vals[i++] = -0.3;
2942   vals[i++] = -0.1;
2943   vals[i++] = 0.1;
2944   vals[i++] = 0.3;
2945   vals[i++] = 0.49999;
2946   vals[i++] = 0.50001;
2947   vals[i++] = 0.7;
2948   vals[i++] = 0.9;
2949   vals[i++] = 1.1;
2950   vals[i++] = 1.3;
2951   assert(i == 22);
2952
2953   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2954      V128 src, dst;
2955
2956      randV128(&src);
2957      randV128(&dst);
2958      memcpy(&src[0], &vals[i], 8);
2959      memcpy(&src[8], &vals[(i+11)%22], 8);
2960      do_ROUNDPD_000(False/*reg*/, &src, &dst);
2961      printf("r roundpd_000  ");
2962      showV128(&src);
2963      printf(" ");
2964      showV128(&dst);
2965      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
2966      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
2967      printf("\n");
2968
2969      randV128(&src);
2970      randV128(&dst);
2971      memcpy(&src[0], &vals[i], 8);
2972      memcpy(&src[8], &vals[(i+11)%22], 8);
2973      do_ROUNDPD_000(True/*mem*/, &src, &dst);
2974      printf("m roundpd_000  ");
2975      showV128(&src);
2976      printf(" ");
2977      showV128(&dst);
2978      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
2979      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
2980      printf("\n");
2981
2982
2983      randV128(&src);
2984      randV128(&dst);
2985      memcpy(&src[0], &vals[i], 8);
2986      memcpy(&src[8], &vals[(i+11)%22], 8);
2987      do_ROUNDPD_001(False/*reg*/, &src, &dst);
2988      printf("r roundpd_001  ");
2989      showV128(&src);
2990      printf(" ");
2991      showV128(&dst);
2992      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
2993      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
2994      printf("\n");
2995
2996      randV128(&src);
2997      randV128(&dst);
2998      memcpy(&src[0], &vals[i], 8);
2999      memcpy(&src[8], &vals[(i+11)%22], 8);
3000      do_ROUNDPD_001(True/*mem*/, &src, &dst);
3001      printf("m roundpd_001  ");
3002      showV128(&src);
3003      printf(" ");
3004      showV128(&dst);
3005      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3006      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3007      printf("\n");
3008
3009
3010      randV128(&src);
3011      randV128(&dst);
3012      memcpy(&src[0], &vals[i], 8);
3013      memcpy(&src[8], &vals[(i+11)%22], 8);
3014      do_ROUNDPD_010(False/*reg*/, &src, &dst);
3015      printf("r roundpd_010  ");
3016      showV128(&src);
3017      printf(" ");
3018      showV128(&dst);
3019      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3020      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3021      printf("\n");
3022
3023      randV128(&src);
3024      randV128(&dst);
3025      memcpy(&src[0], &vals[i], 8);
3026      memcpy(&src[8], &vals[(i+11)%22], 8);
3027      do_ROUNDPD_010(True/*mem*/, &src, &dst);
3028      printf("m roundpd_010  ");
3029      showV128(&src);
3030      printf(" ");
3031      showV128(&dst);
3032      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3033      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3034      printf("\n");
3035
3036
3037      randV128(&src);
3038      randV128(&dst);
3039      memcpy(&src[0], &vals[i], 8);
3040      memcpy(&src[8], &vals[(i+11)%22], 8);
3041      do_ROUNDPD_011(False/*reg*/, &src, &dst);
3042      printf("r roundpd_011  ");
3043      showV128(&src);
3044      printf(" ");
3045      showV128(&dst);
3046      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3047      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3048      printf("\n");
3049
3050      randV128(&src);
3051      randV128(&dst);
3052      memcpy(&src[0], &vals[i], 8);
3053      memcpy(&src[8], &vals[(i+11)%22], 8);
3054      do_ROUNDPD_011(True/*mem*/, &src, &dst);
3055      printf("m roundpd_011  ");
3056      showV128(&src);
3057      printf(" ");
3058      showV128(&dst);
3059      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3060      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3061      printf("\n");
3062   }
3063}
3064
3065void test_ROUNDPD_w_mxcsr_rounding ( void )
3066{
3067   UInt rm;
3068   double vals[22];
3069   Int i = 0;
3070   vals[i++] = 0.0;
3071   vals[i++] = -0.0;
3072   vals[i++] = mkPosInf();
3073   vals[i++] = mkNegInf();
3074   vals[i++] = mkPosNan();
3075   vals[i++] = mkNegNan();
3076   vals[i++] = -1.3;
3077   vals[i++] = -1.1;
3078   vals[i++] = -0.9;
3079   vals[i++] = -0.7;
3080   vals[i++] = -0.50001;
3081   vals[i++] = -0.49999;
3082   vals[i++] = -0.3;
3083   vals[i++] = -0.1;
3084   vals[i++] = 0.1;
3085   vals[i++] = 0.3;
3086   vals[i++] = 0.49999;
3087   vals[i++] = 0.50001;
3088   vals[i++] = 0.7;
3089   vals[i++] = 0.9;
3090   vals[i++] = 1.1;
3091   vals[i++] = 1.3;
3092   assert(i == 22);
3093
3094   rm = get_sse_roundingmode();
3095   assert(rm == 0); // 0 == RN == default
3096
3097   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3098      V128 src, dst;
3099
3100      for (rm = 0; rm <= 3; rm++) {
3101         set_sse_roundingmode(rm);
3102
3103         randV128(&src);
3104         randV128(&dst);
3105         memcpy(&src[0], &vals[i], 8);
3106         memcpy(&src[8], &vals[(i+11)%22], 8);
3107         do_ROUNDPD_1XX(False/*reg*/, &src, &dst);
3108         printf("r (rm=%u) roundpd_1XX  ", rm);
3109         showV128(&src);
3110         printf(" ");
3111         showV128(&dst);
3112         printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3113         printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3114         printf("\n");
3115
3116         randV128(&src);
3117         randV128(&dst);
3118         memcpy(&src[0], &vals[i], 8);
3119         memcpy(&src[8], &vals[(i+11)%22], 8);
3120         do_ROUNDPD_1XX(True/*mem*/, &src, &dst);
3121         printf("m (rm=%u) roundpd_1XX  ", rm);
3122         showV128(&src);
3123         printf(" ");
3124         showV128(&dst);
3125         printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3126         printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3127         printf("\n");
3128      }
3129   }
3130
3131   rm = get_sse_roundingmode();
3132   assert(rm == 3);
3133   set_sse_roundingmode(0);
3134   rm = get_sse_roundingmode();
3135   assert(rm == 0); // 0 == RN == default
3136}
3137
3138/* ------------ ROUNDPS ------------ */
3139
3140void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
3141{
3142   if (mem) {
3143      __asm__ __volatile__(
3144         "movupd  (%1), %%xmm11"       "\n\t"
3145         "roundps $0, (%0), %%xmm11"   "\n\t"
3146         "movupd  %%xmm11, (%1)"       "\n"
3147         : /*OUT*/
3148         : /*IN*/ "r"(src), "r"(dst)
3149         : /*TRASH*/ "xmm11"
3150      );
3151   } else {
3152      __asm__ __volatile__(
3153         "movupd  (%1), %%xmm11"         "\n\t"
3154         "movupd  (%0), %%xmm2"          "\n\t"
3155         "roundps $0, %%xmm2, %%xmm11"   "\n\t"
3156         "movupd  %%xmm11, (%1)"         "\n"
3157         : /*OUT*/
3158         : /*IN*/ "r"(src), "r"(dst)
3159         : /*TRASH*/ "xmm11","xmm2"
3160      );
3161   }
3162}
3163
3164void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
3165{
3166   if (mem) {
3167      __asm__ __volatile__(
3168         "movupd  (%1), %%xmm11"       "\n\t"
3169         "roundps $1, (%0), %%xmm11"   "\n\t"
3170         "movupd  %%xmm11, (%1)"       "\n"
3171         : /*OUT*/
3172         : /*IN*/ "r"(src), "r"(dst)
3173         : /*TRASH*/ "xmm11"
3174      );
3175   } else {
3176      __asm__ __volatile__(
3177         "movupd  (%1), %%xmm11"         "\n\t"
3178         "movupd  (%0), %%xmm2"          "\n\t"
3179         "roundps $1, %%xmm2, %%xmm11"   "\n\t"
3180         "movupd  %%xmm11, (%1)"         "\n"
3181         : /*OUT*/
3182         : /*IN*/ "r"(src), "r"(dst)
3183         : /*TRASH*/ "xmm11","xmm2"
3184      );
3185   }
3186}
3187
3188void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
3189{
3190   if (mem) {
3191      __asm__ __volatile__(
3192         "movupd  (%1), %%xmm11"       "\n\t"
3193         "roundps $2, (%0), %%xmm11"   "\n\t"
3194         "movupd  %%xmm11, (%1)"       "\n"
3195         : /*OUT*/
3196         : /*IN*/ "r"(src), "r"(dst)
3197         : /*TRASH*/ "xmm11"
3198      );
3199   } else {
3200      __asm__ __volatile__(
3201         "movupd  (%1), %%xmm11"         "\n\t"
3202         "movupd  (%0), %%xmm2"          "\n\t"
3203         "roundps $2, %%xmm2, %%xmm11"   "\n\t"
3204         "movupd  %%xmm11, (%1)"         "\n"
3205         : /*OUT*/
3206         : /*IN*/ "r"(src), "r"(dst)
3207         : /*TRASH*/ "xmm11","xmm2"
3208      );
3209   }
3210}
3211
3212void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
3213{
3214   if (mem) {
3215      __asm__ __volatile__(
3216         "movupd  (%1), %%xmm11"       "\n\t"
3217         "roundps $3, (%0), %%xmm11"   "\n\t"
3218         "movupd  %%xmm11, (%1)"       "\n"
3219         : /*OUT*/
3220         : /*IN*/ "r"(src), "r"(dst)
3221         : /*TRASH*/ "xmm11"
3222      );
3223   } else {
3224      __asm__ __volatile__(
3225         "movupd  (%1), %%xmm11"         "\n\t"
3226         "movupd  (%0), %%xmm2"          "\n\t"
3227         "roundps $3, %%xmm2, %%xmm11"   "\n\t"
3228         "movupd  %%xmm11, (%1)"         "\n"
3229         : /*OUT*/
3230         : /*IN*/ "r"(src), "r"(dst)
3231         : /*TRASH*/ "xmm11","xmm2"
3232      );
3233   }
3234}
3235
3236void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
3237{
3238   if (mem) {
3239      __asm__ __volatile__(
3240         "movupd  (%1), %%xmm11"       "\n\t"
3241         "roundps $4, (%0), %%xmm11"   "\n\t"
3242         "movupd  %%xmm11, (%1)"       "\n"
3243         : /*OUT*/
3244         : /*IN*/ "r"(src), "r"(dst)
3245         : /*TRASH*/ "xmm11"
3246      );
3247   } else {
3248      __asm__ __volatile__(
3249         "movupd  (%1), %%xmm11"         "\n\t"
3250         "movupd  (%0), %%xmm2"          "\n\t"
3251         "roundps $4, %%xmm2, %%xmm11"   "\n\t"
3252         "movupd  %%xmm11, (%1)"         "\n"
3253         : /*OUT*/
3254         : /*IN*/ "r"(src), "r"(dst)
3255         : /*TRASH*/ "xmm11","xmm2"
3256      );
3257   }
3258}
3259
3260void test_ROUNDPS_w_immediate_rounding ( void )
3261{
3262   float vals[22];
3263   Int i = 0;
3264   vals[i++] = 0.0;
3265   vals[i++] = -0.0;
3266   vals[i++] = mkPosInf();
3267   vals[i++] = mkNegInf();
3268   vals[i++] = mkPosNan();
3269   vals[i++] = mkNegNan();
3270   vals[i++] = -1.3;
3271   vals[i++] = -1.1;
3272   vals[i++] = -0.9;
3273   vals[i++] = -0.7;
3274   vals[i++] = -0.50001;
3275   vals[i++] = -0.49999;
3276   vals[i++] = -0.3;
3277   vals[i++] = -0.1;
3278   vals[i++] = 0.1;
3279   vals[i++] = 0.3;
3280   vals[i++] = 0.49999;
3281   vals[i++] = 0.50001;
3282   vals[i++] = 0.7;
3283   vals[i++] = 0.9;
3284   vals[i++] = 1.1;
3285   vals[i++] = 1.3;
3286   assert(i == 22);
3287
3288   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3289      V128 src, dst;
3290
3291      randV128(&src);
3292      randV128(&dst);
3293      memcpy(&src[0], &vals[i], 4);
3294      memcpy(&src[4], &vals[(i+5)%22], 4);
3295      memcpy(&src[8], &vals[(i+11)%22], 4);
3296      memcpy(&src[12], &vals[(i+17)%22], 4);
3297      do_ROUNDPS_000(False/*reg*/, &src, &dst);
3298      printf("r roundps_000  ");
3299      showV128(&src);
3300      printf(" ");
3301      showV128(&dst);
3302      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3303      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3304      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3305      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3306      printf("\n");
3307
3308      randV128(&src);
3309      randV128(&dst);
3310      memcpy(&src[0], &vals[i], 4);
3311      memcpy(&src[4], &vals[(i+5)%22], 4);
3312      memcpy(&src[8], &vals[(i+11)%22], 4);
3313      memcpy(&src[12], &vals[(i+17)%22], 4);
3314      do_ROUNDPS_000(True/*mem*/, &src, &dst);
3315      printf("m roundps_000  ");
3316      showV128(&src);
3317      printf(" ");
3318      showV128(&dst);
3319      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3320      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3321      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3322      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3323      printf("\n");
3324
3325
3326      randV128(&src);
3327      randV128(&dst);
3328      memcpy(&src[0], &vals[i], 4);
3329      memcpy(&src[4], &vals[(i+5)%22], 4);
3330      memcpy(&src[8], &vals[(i+11)%22], 4);
3331      memcpy(&src[12], &vals[(i+17)%22], 4);
3332      do_ROUNDPS_001(False/*reg*/, &src, &dst);
3333      printf("r roundps_001  ");
3334      showV128(&src);
3335      printf(" ");
3336      showV128(&dst);
3337      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3338      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3339      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3340      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3341      printf("\n");
3342
3343      randV128(&src);
3344      randV128(&dst);
3345      memcpy(&src[0], &vals[i], 4);
3346      memcpy(&src[4], &vals[(i+5)%22], 4);
3347      memcpy(&src[8], &vals[(i+11)%22], 4);
3348      memcpy(&src[12], &vals[(i+17)%22], 4);
3349      do_ROUNDPS_001(True/*mem*/, &src, &dst);
3350      printf("m roundps_001  ");
3351      showV128(&src);
3352      printf(" ");
3353      showV128(&dst);
3354      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3355      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3356      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3357      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3358      printf("\n");
3359
3360
3361      randV128(&src);
3362      randV128(&dst);
3363      memcpy(&src[0], &vals[i], 4);
3364      memcpy(&src[4], &vals[(i+5)%22], 4);
3365      memcpy(&src[8], &vals[(i+11)%22], 4);
3366      memcpy(&src[12], &vals[(i+17)%22], 4);
3367      do_ROUNDPS_010(False/*reg*/, &src, &dst);
3368      printf("r roundps_010  ");
3369      showV128(&src);
3370      printf(" ");
3371      showV128(&dst);
3372      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3373      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3374      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3375      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3376      printf("\n");
3377
3378      randV128(&src);
3379      randV128(&dst);
3380      memcpy(&src[0], &vals[i], 4);
3381      memcpy(&src[4], &vals[(i+5)%22], 4);
3382      memcpy(&src[8], &vals[(i+11)%22], 4);
3383      memcpy(&src[12], &vals[(i+17)%22], 4);
3384      do_ROUNDPS_010(True/*mem*/, &src, &dst);
3385      printf("m roundps_010  ");
3386      showV128(&src);
3387      printf(" ");
3388      showV128(&dst);
3389      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3390      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3391      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3392      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3393      printf("\n");
3394
3395
3396      randV128(&src);
3397      randV128(&dst);
3398      memcpy(&src[0], &vals[i], 4);
3399      memcpy(&src[4], &vals[(i+5)%22], 4);
3400      memcpy(&src[8], &vals[(i+11)%22], 4);
3401      memcpy(&src[12], &vals[(i+17)%22], 4);
3402      do_ROUNDPS_011(False/*reg*/, &src, &dst);
3403      printf("r roundps_011  ");
3404      showV128(&src);
3405      printf(" ");
3406      showV128(&dst);
3407      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3408      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3409      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3410      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3411      printf("\n");
3412
3413      randV128(&src);
3414      randV128(&dst);
3415      memcpy(&src[0], &vals[i], 4);
3416      memcpy(&src[4], &vals[(i+5)%22], 4);
3417      memcpy(&src[8], &vals[(i+11)%22], 4);
3418      memcpy(&src[12], &vals[(i+17)%22], 4);
3419      do_ROUNDPS_011(True/*mem*/, &src, &dst);
3420      printf("m roundps_011  ");
3421      showV128(&src);
3422      printf(" ");
3423      showV128(&dst);
3424      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3425      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3426      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3427      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3428      printf("\n");
3429   }
3430}
3431
3432void test_ROUNDPS_w_mxcsr_rounding ( void )
3433{
3434   UInt rm;
3435   float vals[22];
3436   Int i = 0;
3437   vals[i++] = 0.0;
3438   vals[i++] = -0.0;
3439   vals[i++] = mkPosInf();
3440   vals[i++] = mkNegInf();
3441   vals[i++] = mkPosNan();
3442   vals[i++] = mkNegNan();
3443   vals[i++] = -1.3;
3444   vals[i++] = -1.1;
3445   vals[i++] = -0.9;
3446   vals[i++] = -0.7;
3447   vals[i++] = -0.50001;
3448   vals[i++] = -0.49999;
3449   vals[i++] = -0.3;
3450   vals[i++] = -0.1;
3451   vals[i++] = 0.1;
3452   vals[i++] = 0.3;
3453   vals[i++] = 0.49999;
3454   vals[i++] = 0.50001;
3455   vals[i++] = 0.7;
3456   vals[i++] = 0.9;
3457   vals[i++] = 1.1;
3458   vals[i++] = 1.3;
3459   assert(i == 22);
3460
3461   rm = get_sse_roundingmode();
3462   assert(rm == 0); // 0 == RN == default
3463
3464   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3465      V128 src, dst;
3466
3467      for (rm = 0; rm <= 3; rm++) {
3468         set_sse_roundingmode(rm);
3469
3470         randV128(&src);
3471         randV128(&dst);
3472         memcpy(&src[0], &vals[i], 4);
3473         memcpy(&src[4], &vals[(i+5)%22], 4);
3474         memcpy(&src[8], &vals[(i+11)%22], 4);
3475         memcpy(&src[12], &vals[(i+17)%22], 4);
3476         do_ROUNDPS_1XX(False/*reg*/, &src, &dst);
3477         printf("r (rm=%u) roundps_1XX  ", rm);
3478         showV128(&src);
3479         printf(" ");
3480         showV128(&dst);
3481         printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3482         printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3483         printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3484         printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3485         printf("\n");
3486
3487         randV128(&src);
3488         randV128(&dst);
3489         memcpy(&src[0], &vals[i], 4);
3490         memcpy(&src[4], &vals[(i+5)%22], 4);
3491         memcpy(&src[8], &vals[(i+11)%22], 4);
3492         memcpy(&src[12], &vals[(i+17)%22], 4);
3493         do_ROUNDPS_1XX(True/*mem*/, &src, &dst);
3494         printf("m (rm=%u) roundps_1XX  ", rm);
3495         showV128(&src);
3496         printf(" ");
3497         showV128(&dst);
3498         printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3499         printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3500         printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3501         printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3502         printf("\n");
3503      }
3504   }
3505
3506   rm = get_sse_roundingmode();
3507   assert(rm == 3);
3508   set_sse_roundingmode(0);
3509   rm = get_sse_roundingmode();
3510   assert(rm == 0); // 0 == RN == default
3511}
3512
3513/* ------------ PTEST ------------ */
3514
3515void test_PTEST ( void )
3516{
3517   const Int ntests = 8;
3518   V128 spec[ntests];
3519   do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL );
3520   do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL );
3521   do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL );
3522   do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL );
3523   do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL );
3524   do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL );
3525   do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL );
3526   do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL );
3527   V128 block[2];
3528   Int i, j;
3529   ULong flags;
3530   for (i = 0; i < ntests; i++) {
3531      for (j = 0; j < ntests; j++) {
3532         memcpy(&block[0], &spec[i], 16);
3533         memcpy(&block[1], &spec[j], 16);
3534         __asm__ __volatile__(
3535            "subq $256, %%rsp"        "\n\t"
3536            "movupd 0(%1), %%xmm2"    "\n\t"
3537            "ptest 16(%1), %%xmm2"    "\n\t"
3538            "pushfq"                  "\n\t"
3539            "popq %0"                 "\n\t"
3540            "addq $256, %%rsp"        "\n\t"
3541            : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) :
3542            "xmm2", "memory", "cc"
3543         );
3544         printf("r   ptest ");
3545         showV128(&block[0]);
3546         printf(" ");
3547         showV128(&block[1]);
3548         printf(" -> eflags %04x\n", (UInt)flags & 0x8D5);
3549      }
3550   }
3551}
3552
3553/* ------------ PBLENDVB ------------ */
3554
3555void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
3556{
3557   if (mem) {
3558      __asm__ __volatile__(
3559         "movupd   (%2), %%xmm0"         "\n\t"
3560         "movupd   (%1), %%xmm11"        "\n\t"
3561         "pblendvb (%0), %%xmm11"        "\n\t"
3562         "movupd   %%xmm11, (%1)"        "\n"
3563         : /*OUT*/
3564         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3565         : /*TRASH*/ "xmm11","xmm0"
3566      );
3567   } else {
3568      __asm__ __volatile__(
3569         "movupd   (%2), %%xmm0"         "\n\t"
3570         "movupd   (%1), %%xmm11"        "\n\t"
3571         "movupd   (%0), %%xmm2"         "\n\t"
3572         "pblendvb %%xmm2, %%xmm11"      "\n\t"
3573         "movupd   %%xmm11, (%1)"        "\n"
3574         : /*OUT*/
3575         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3576         : /*TRASH*/ "xmm11","xmm2","xmm0"
3577      );
3578   }
3579}
3580
3581void test_PBLENDVB ( void )
3582{
3583   V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3584   Int i;
3585   for (i = 0; i < 10; i++) {
3586      randV128(&t_xmm0);
3587      randV128(&t_src);
3588      randV128(&t_dst);
3589
3590      memcpy(&xmm0, &t_xmm0, 16);
3591      memcpy(&src, &t_src, 16);
3592      memcpy(&dst, &t_dst, 16);
3593      do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst);
3594      printf("r pblendvb  ");
3595      showV128(&t_xmm0);
3596      printf(" ");
3597      showV128(&t_src);
3598      printf(" ");
3599      showV128(&t_dst);
3600      printf(" -> ");
3601      showV128(&dst);
3602      printf("\n");
3603
3604      memcpy(&xmm0, &t_xmm0, 16);
3605      memcpy(&src, &t_src, 16);
3606      memcpy(&dst, &t_dst, 16);
3607      do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst);
3608      printf("m pblendvb  ");
3609      showV128(&t_xmm0);
3610      printf(" ");
3611      showV128(&t_src);
3612      printf(" ");
3613      showV128(&t_dst);
3614      printf(" -> ");
3615      showV128(&dst);
3616      printf("\n");
3617   }
3618}
3619
3620/* ------------ BLENDVPD ------------ */
3621
3622void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
3623{
3624   if (mem) {
3625      __asm__ __volatile__(
3626         "movupd   (%2), %%xmm0"         "\n\t"
3627         "movupd   (%1), %%xmm11"        "\n\t"
3628         "blendvpd (%0), %%xmm11"        "\n\t"
3629         "movupd   %%xmm11, (%1)"        "\n"
3630         : /*OUT*/
3631         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3632         : /*TRASH*/ "xmm11","xmm0"
3633      );
3634   } else {
3635      __asm__ __volatile__(
3636         "movupd   (%2), %%xmm0"         "\n\t"
3637         "movupd   (%1), %%xmm11"        "\n\t"
3638         "movupd   (%0), %%xmm2"         "\n\t"
3639         "blendvpd %%xmm2, %%xmm11"      "\n\t"
3640         "movupd   %%xmm11, (%1)"        "\n"
3641         : /*OUT*/
3642         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3643         : /*TRASH*/ "xmm11","xmm2","xmm0"
3644      );
3645   }
3646}
3647
3648void test_BLENDVPD ( void )
3649{
3650   V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3651   Int i;
3652   for (i = 0; i < 10; i++) {
3653      randV128(&t_xmm0);
3654      randV128(&t_src);
3655      randV128(&t_dst);
3656
3657      memcpy(&xmm0, &t_xmm0, 16);
3658      memcpy(&src, &t_src, 16);
3659      memcpy(&dst, &t_dst, 16);
3660      do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst);
3661      printf("r blendvpd  ");
3662      showV128(&t_xmm0);
3663      printf(" ");
3664      showV128(&t_src);
3665      printf(" ");
3666      showV128(&t_dst);
3667      printf(" -> ");
3668      showV128(&dst);
3669      printf("\n");
3670
3671      memcpy(&xmm0, &t_xmm0, 16);
3672      memcpy(&src, &t_src, 16);
3673      memcpy(&dst, &t_dst, 16);
3674      do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst);
3675      printf("m blendvpd  ");
3676      showV128(&t_xmm0);
3677      printf(" ");
3678      showV128(&t_src);
3679      printf(" ");
3680      showV128(&t_dst);
3681      printf(" -> ");
3682      showV128(&dst);
3683      printf("\n");
3684   }
3685}
3686
3687/* ------------ BLENDVPS ------------ */
3688
3689void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
3690{
3691   if (mem) {
3692      __asm__ __volatile__(
3693         "movupd   (%2), %%xmm0"         "\n\t"
3694         "movupd   (%1), %%xmm11"        "\n\t"
3695         "blendvps (%0), %%xmm11"        "\n\t"
3696         "movupd   %%xmm11, (%1)"        "\n"
3697         : /*OUT*/
3698         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3699         : /*TRASH*/ "xmm11","xmm0"
3700      );
3701   } else {
3702      __asm__ __volatile__(
3703         "movupd   (%2), %%xmm0"         "\n\t"
3704         "movupd   (%1), %%xmm11"        "\n\t"
3705         "movupd   (%0), %%xmm2"         "\n\t"
3706         "blendvps %%xmm2, %%xmm11"      "\n\t"
3707         "movupd   %%xmm11, (%1)"        "\n"
3708         : /*OUT*/
3709         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3710         : /*TRASH*/ "xmm11","xmm2","xmm0"
3711      );
3712   }
3713}
3714
3715void test_BLENDVPS ( void )
3716{
3717   V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3718   Int i;
3719   for (i = 0; i < 10; i++) {
3720      randV128(&t_xmm0);
3721      randV128(&t_src);
3722      randV128(&t_dst);
3723
3724      memcpy(&xmm0, &t_xmm0, 16);
3725      memcpy(&src, &t_src, 16);
3726      memcpy(&dst, &t_dst, 16);
3727      do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst);
3728      printf("r blendvps  ");
3729      showV128(&t_xmm0);
3730      printf(" ");
3731      showV128(&t_src);
3732      printf(" ");
3733      showV128(&t_dst);
3734      printf(" -> ");
3735      showV128(&dst);
3736      printf("\n");
3737
3738      memcpy(&xmm0, &t_xmm0, 16);
3739      memcpy(&src, &t_src, 16);
3740      memcpy(&dst, &t_dst, 16);
3741      do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst);
3742      printf("m blendvps  ");
3743      showV128(&t_xmm0);
3744      printf(" ");
3745      showV128(&t_src);
3746      printf(" ");
3747      showV128(&t_dst);
3748      printf(" -> ");
3749      showV128(&dst);
3750      printf("\n");
3751   }
3752}
3753
3754void test_MOVNTDQA ( void )
3755{
3756   V128 src, dst;
3757   Int i;
3758   for (i = 0; i < 10; i++) {
3759      randV128(&src);
3760      /* make sure the load actually happens */
3761      randV128(&dst);
3762      DO_m_r("movntdqa", src, dst);
3763   }
3764}
3765
3766/* ------------ main ------------ */
3767
/* Set to 0 to run only the MPSADBW test (handy when debugging it). */
#define SSE4_RUN_FULL_SUITE 1

#if SSE4_RUN_FULL_SUITE

/* First batch of tests, run in this fixed order (headed "SSE 4.1"). */
static void run_sse41_tests ( void )
{
   /* blends */
   test_BLENDPD();        // done Apr.01.2010
   test_BLENDPS();        // done Apr.02.2010
   test_PBLENDW();
   test_PBLENDVB();
   test_BLENDVPD();
   test_BLENDVPS();

   /* dot products */
   test_DPPD();           // done Apr.08.2010
   test_DPPS();           // done Apr.09.2010

   /* extract / insert / compare */
   test_EXTRACTPS();
   test_INSERTPS();       // done Apr.01.2010
   test_PCMPEQQ();
   test_PEXTRB();         // done Apr.15.2010
   test_PEXTRD();         // done Apr.14.2010
   test_PEXTRQ();         // done Apr.14.2010
   test_PEXTRW();         // done Apr.14.2010
   test_PINSRQ();         // done Apr.16.2010
   test_PINSRD();         // todo
   test_PINSRW();         /* NB: PINSRW is SSE2, not SSE4 */
   test_PINSRB();         // todo

   /* packed min / max */
   test_PMAXSB();
   test_PMAXSD();         // done Apr.09.2010
   test_PMAXUD();         // done Apr.16.2010
   test_PMAXUW();
   test_PMINSB();
   test_PMINSD();         // done Apr.09.2010
   test_PMINUD();
   test_PMINUW();

   /* sign- and zero-extending moves */
   test_PMOVSXBW();       // done Apr.02.2010
   test_PMOVSXBD();       // done Mar.30.2010
   test_PMOVSXBQ();       // done Mar.30.2010
   test_PMOVSXWD();       // done Mar.31.2010
   test_PMOVSXWQ();       // done Mar.31.2010
   test_PMOVSXDQ();       // done Mar.31.2010
   test_PMOVZXBW();       // done Mar.28.2010
   test_PMOVZXBD();       // done Mar.29.2010
   test_PMOVZXBQ();       // done Mar.29.2010
   test_PMOVZXWD();       // done Mar.28.2010
   test_PMOVZXWQ();       // done Mar.29.2010
   test_PMOVZXDQ();       // done Mar.29.2010

   /* population count, multiplies, ptest */
   test_POPCNTW();
   test_POPCNTL();
   test_POPCNTQ();
   test_PMULDQ();
   test_PMULLD();
   test_PTEST();

   /* rounding: immediate-selected mode first, then MXCSR-selected */
   test_ROUNDSD_w_immediate_rounding();
   test_ROUNDSS_w_immediate_rounding();
   test_ROUNDPD_w_immediate_rounding();
   test_ROUNDPS_w_immediate_rounding();
   test_ROUNDSD_w_mxcsr_rounding();
   test_ROUNDSS_w_mxcsr_rounding();
   test_ROUNDPD_w_mxcsr_rounding();
   test_ROUNDPS_w_mxcsr_rounding();
}

/* Second batch, run after the first (headed "SSE 4.2").  Of these only
   PCMPGTQ is an SSE4.2 instruction; PACKUSDW, PHMINPOSUW, MPSADBW and
   MOVNTDQA all belong to SSE4.1.  CRC32B,Q are not tested here. */
static void run_sse42_tests ( void )
{
   test_PCMPGTQ();
   test_PACKUSDW();
   test_PHMINPOSUW();
   test_MPSADBW();
   test_MOVNTDQA();
}

#endif /* SSE4_RUN_FULL_SUITE */

/* Program entry point: runs the selected tests and exits with status 0.
   The command-line arguments are not used. */
int main ( int argc, char** argv )
{
#if SSE4_RUN_FULL_SUITE
   run_sse41_tests();
   run_sse42_tests();
#else
   test_MPSADBW();
#endif

   return 0;
}
3838
3839