1
2/* A program to test SSE4.1/SSE4.2 instructions.
3   Revisions:  Nov.208     - wrote this file
4               Apr.10.2010 - added PEXTR* tests
5               Apr.16.2010 - added PINS*  tests
6*/
7
8/* HOW TO COMPILE:
9   gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c
10*/
11
12#include <stdio.h>
13#include <stdlib.h>
14#include <assert.h>
15//#include "tests/malloc.h" // reenable when reintegrated
16#include <string.h>
17
18
19
20// rmme when reintegrated
21// Allocates a 16-aligned block.  Asserts if the allocation fails.
22#ifdef VGO_darwin
23#include <stdlib.h>
24#else
25#include <malloc.h>
26#endif
/* Returns a newly allocated block of szB bytes whose address is a
   multiple of 16.  Aborts through assert() if the allocation fails or
   if the address turns out not to be 16-aligned.  Users in this file
   hand the block back with free(). */
__attribute__((unused))
static void* memalign16(size_t szB)
{
#if defined(VGO_darwin)
   /* No memalign on Darwin; its malloc is relied upon to return
      16-aligned storage anyway. */
   void* block = malloc(szB);
#else
   void* block = memalign(16, szB);
#endif
   const unsigned long addr = (unsigned long)block;
   assert(block);
   assert((addr & (16-1)) == 0);
   return block;
}
41
42
43
/* Basic types used throughout this test. */
typedef unsigned char      V128[16];  /* one 128-bit vector, as 16 bytes */
typedef unsigned int       UInt;
typedef int                Int;
typedef unsigned char      UChar;
typedef unsigned long long ULong;

/* Byte-sized boolean and its two values. */
typedef unsigned char      Bool;
#define False ((Bool)0)
#define True  ((Bool)1)
53
54
55typedef
56   struct {
57      V128 arg1;
58      V128 arg2;
59      V128 res;
60   }
61   RRArgs;
62
63typedef
64   struct {
65      V128 arg1;
66      V128 res;
67   }
68   RMArgs;
69
70static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo )
71{
72   // try to sidestep strict-aliasing snafus by memcpying explicitly
73   UChar* p = (UChar*)res;
74   memcpy(&p[8], (UChar*)&wHi, 8);
75   memcpy(&p[0], (UChar*)&wLo, 8);
76}
77
78static UChar randUChar ( void )
79{
80   static UInt seed = 80021;
81   seed = 1103515245 * seed + 12345;
82   return (seed >> 17) & 0xFF;
83}
84
85static ULong randULong ( void )
86{
87   Int i;
88   ULong r = 0;
89   for (i = 0; i < 8; i++) {
90      r = (r << 8) | (ULong)(0xFF & randUChar());
91   }
92   return r;
93}
94
95static void randV128 ( V128* v )
96{
97   Int i;
98   for (i = 0; i < 16; i++)
99      (*v)[i] = randUChar();
100}
101
102static void showV128 ( V128* v )
103{
104   Int i;
105   for (i = 15; i >= 0; i--)
106      printf("%02x", (Int)(*v)[i]);
107}
108
109static void showMaskedV128 ( V128* v, V128* mask )
110{
111   Int i;
112   for (i = 15; i >= 0; i--)
113      printf("%02x", (Int)( ((*v)[i]) & ((*mask)[i]) ));
114}
115
116static void showIGVV( char* rOrM, char* op, Int imm,
117                      ULong src64, V128* dst, V128* res )
118{
119   printf("%s %10s $%d ", rOrM, op, imm);
120   printf("%016llx", src64);
121   printf(" ");
122   showV128(dst);
123   printf(" ");
124   showV128(res);
125   printf("\n");
126}
127
128static void showIAG ( char* rOrM, char* op, Int imm,
129                      V128* argL, ULong argR, ULong res )
130{
131   printf("%s %10s $%d ", rOrM, op, imm);
132   showV128(argL);
133   printf(" ");
134   printf("%016llx", argR);
135   printf(" ");
136   printf("%016llx", res);
137   printf("\n");
138}
139
140static void showIAA ( char* rOrM, char* op, Int imm, RRArgs* rra, V128* rmask )
141{
142   printf("%s %10s $%d ", rOrM, op, imm);
143   showV128(&rra->arg1);
144   printf(" ");
145   showV128(&rra->arg2);
146   printf(" ");
147   showMaskedV128(&rra->res, rmask);
148   printf("\n");
149}
150
151static void showAA ( char* rOrM, char* op, RRArgs* rra, V128* rmask )
152{
153   printf("%s %10s ", rOrM, op);
154   showV128(&rra->arg1);
155   printf(" ");
156   showV128(&rra->arg2);
157   printf(" ");
158   showMaskedV128(&rra->res, rmask);
159   printf("\n");
160}
161
162/* Note: these are little endian.  Hence first byte is the least
163   significant byte of lane zero. */
164
165/* Mask for insns where all result bits are non-approximated. */
166static V128 AllMask  = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
167                         0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
168
169/* Mark for insns which produce approximated vector short results. */
170__attribute__((unused))
171static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF,
172                         0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF };
173
174/* Mark for insns which produce approximated scalar short results. */
175__attribute__((unused))
176static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF,
177                         0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
178
179static V128 fives    = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55,
180                         0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 };
181
182static V128 zeroes   = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
183                         0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
184
/* Returns the double that results from evaluating 1.0 / 0.0. */
double mkPosInf ( void )
{
   const double quotient = 1.0 / 0.0;
   return quotient;
}
/* Returns the negation of mkPosInf(). */
double mkNegInf ( void )
{
   const double positive = mkPosInf();
   return -positive;
}
/* Returns the double that results from evaluating 0.0 / 0.0 (a NaN). */
double mkPosNan ( void )
{
   const double quotient = 0.0 / 0.0;
   return quotient;
}
/* Returns the negation of mkPosNan(). */
double mkNegNan ( void )
{
   const double nan = mkPosNan();
   return -nan;
}
189
190__attribute__((noinline))
191UInt get_mxcsr ( void )
192{
193   ULong w64;
194   __asm__ __volatile__(
195      "subq    $8, %%rsp"    "\n\t"
196      "stmxcsr (%%rsp)"      "\n\t"
197      "movq    (%%rsp), %0"  "\n"
198      "addq    $8, %%rsp"
199      : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc"
200   );
201   if (0) printf("get %08x\n", (UInt)w64);
202   return (UInt)w64;
203}
204
205__attribute__((noinline))
206void set_mxcsr ( UInt w32 )
207{
208   if (0) printf("set %08x\n", w32);
209   ULong w64 = (ULong)w32;
210   __asm__ __volatile__(
211      "subq    $8, %%rsp"    "\n\t"
212      "movq    %0, (%%rsp)"  "\n\t"
213      "ldmxcsr (%%rsp)"      "\n\t"
214      "addq    $8, %%rsp"
215      : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc"
216   );
217}
218
219UInt get_sse_roundingmode ( void )
220{
221   UInt w = get_mxcsr();
222   return (w >> 13) & 3;
223}
224
225void set_sse_roundingmode ( UInt m )
226{
227   UInt w;
228   assert(0 == (m & ~3));
229   w = get_mxcsr();
230   w &= ~(3 << 13);
231   w |= (m << 13);
232   set_mxcsr(w);
233}
234
235
/* Register-source form of "<_opname> $<_imm>, xmm, xmm".
   Loads _src into %xmm2 and _dst into %xmm11, executes the
   instruction with %xmm2 as source and %xmm11 as destination, stores
   %xmm11 as the result and prints an "r" line via showIAA using
   AllMask.  _opname must be a string literal and _imm a literal
   constant, since both are pasted into the asm text. */
#define DO_imm_r_r(_opname, _imm, _src, _dst)  \
   {  \
      RRArgs _rr;  \
      memcpy(&_rr.arg1, &(_src), sizeof(V128));  \
      memcpy(&_rr.arg2, &(_dst), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%[s]), %%xmm2"   "\n\t"  \
         "movupd (%[d]), %%xmm11"  "\n\t"  \
         _opname " $" #_imm ", %%xmm2, %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%[r])"  "\n"  \
         : /*out*/  \
         : /*in*/ [s] "r"(&(_src)), [d] "r"(&(_dst)), [r] "r"(&_rr.res)  \
         : "cc", "memory", "xmm2", "xmm11"  \
      );  \
      showIAA("r", (_opname), (_imm), &_rr, &AllMask);  \
   }
253
/* Memory-source form of "<_opname> $<_imm>, mem, xmm".
   Copies _src into a freshly allocated 16-aligned block, loads _dst
   into %xmm11, executes the instruction with the block as source and
   %xmm11 as destination, stores %xmm11 as the result, prints an "m"
   line via showIAA using AllMask and frees the block.  _opname must
   be a string literal and _imm a literal constant, since both are
   pasted into the asm text. */
#define DO_imm_m_r(_opname, _imm, _src, _dst)  \
   {  \
      RRArgs _rm;  \
      V128*  _mem = memalign16(sizeof(V128));  \
      memcpy(_mem, &(_src), sizeof(V128));  \
      memcpy(&_rm.arg1, &(_src), sizeof(V128));  \
      memcpy(&_rm.arg2, &(_dst), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%[d]), %%xmm11"  "\n\t"  \
         _opname " $" #_imm ", (%[m]), %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%[r])"  "\n"  \
         : /*out*/  \
         : /*in*/ [m] "r"(_mem), [d] "r"(&(_dst)), [r] "r"(&_rm.res)  \
         : "cc", "memory", "xmm11"  \
      );  \
      showIAA("m", (_opname), (_imm), &_rm, &AllMask);  \
      free(_mem);  \
   }
273
/* Runs the register-source test (DO_imm_r_r) and then the
   memory-source test (DO_imm_m_r) for the same instruction,
   immediate and operands. */
#define DO_imm_mandr_r(_op, _immediate, _srcv, _dstv)  \
   DO_imm_r_r( _op, _immediate, _srcv, _dstv )  \
   DO_imm_m_r( _op, _immediate, _srcv, _dstv )
277
278
279
280
281
/* Register-source form of "<_opname> xmm, xmm" (no immediate).
   Loads _src into %xmm2 and _dst into %xmm11, executes the
   instruction with %xmm2 as source and %xmm11 as destination, stores
   %xmm11 as the result and prints an "r" line via showAA using
   AllMask.  _opname must be a string literal, since it is pasted
   into the asm text. */
#define DO_r_r(_opname, _src, _dst)  \
   {  \
      RRArgs _rr;  \
      memcpy(&_rr.arg1, &(_src), sizeof(V128));  \
      memcpy(&_rr.arg2, &(_dst), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%[s]), %%xmm2"   "\n\t"  \
         "movupd (%[d]), %%xmm11"  "\n\t"  \
         _opname " %%xmm2, %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%[r])"  "\n"  \
         : /*out*/  \
         : /*in*/ [s] "r"(&(_src)), [d] "r"(&(_dst)), [r] "r"(&_rr.res)  \
         : "cc", "memory", "xmm2", "xmm11"  \
      );  \
      showAA("r", (_opname), &_rr, &AllMask);  \
   }
299
/* Memory-source form of "<_opname> mem, xmm" (no immediate).
   Copies _src into a freshly allocated 16-aligned block, loads _dst
   into %xmm11, executes the instruction with the block as source and
   %xmm11 as destination, stores %xmm11 as the result, prints an "m"
   line via showAA using AllMask and frees the block.  _opname must
   be a string literal, since it is pasted into the asm text. */
#define DO_m_r(_opname, _src, _dst)  \
   {  \
      RRArgs _rm;  \
      V128*  _mem = memalign16(sizeof(V128));  \
      memcpy(_mem, &(_src), sizeof(V128));  \
      memcpy(&_rm.arg1, &(_src), sizeof(V128));  \
      memcpy(&_rm.arg2, &(_dst), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%[d]), %%xmm11"  "\n\t"  \
         _opname " (%[m]), %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%[r])"  "\n"  \
         : /*out*/  \
         : /*in*/ [m] "r"(_mem), [d] "r"(&(_dst)), [r] "r"(&_rm.res)  \
         : "cc", "memory", "xmm11"  \
      );  \
      showAA("m", (_opname), &_rm, &AllMask);  \
      free(_mem);  \
   }
319
/* Runs the register-source test (DO_r_r) and then the memory-source
   test (DO_m_r) for the same instruction and operands. */
#define DO_mandr_r(_op, _srcv, _dstv)  \
   DO_r_r(_op, _srcv, _dstv)  \
   DO_m_r(_op, _srcv, _dstv)
323
324
325
326
/* "<_opname> $<_imm>, xmm, general register" test.
   Loads _src into %xmm2, preloads %r11 with 0x5555555555555555,
   executes the instruction with %xmm2 as source and
   %r11<_dstsuffix> as destination, then stores the whole of %r11
   and prints an "r" line via showIAG showing the vector, the
   preloaded value and the final value.  _dstsuffix is a string
   literal appended to the register name "%r11". */
#define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix)       \
   {  \
      ULong _before = 0x5555555555555555ULL;  \
      ULong _after  = 0xAAAAAAAAAAAAAAAAULL;  \
      /* r11 is used explicitly below, so gcc must not pick it */ \
      /* for any operand.  Listing r11 among the clobbers is */ \
      /* what is relied on to ensure that. */ \
      __asm__ __volatile__(  \
         "movupd (%[v]), %%xmm2"  "\n\t"  \
         "movq   (%[b]), %%r11"   "\n\t"  \
         _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix  "\n\t"  \
         "movq   %%r11, (%[a])"   "\n"  \
         : /*out*/ \
         : /*in*/ [v] "r"(&(_src)), [b] "r"(&_before), [a] "r"(&_after)  \
         : "cc", "memory", "xmm2", "r11"  \
      );  \
      showIAG("r", (_opname), (_imm), &(_src), _before, _after);  \
   }
345
/* "<_opname> $<_imm>, xmm, memory" test.
   Loads _src into %xmm2 and executes the instruction with %xmm2 as
   source and a 64-bit stack slot, prefilled with
   0x5555555555555555, as destination.  Prints an "m" line via
   showIAG showing the vector, the prefill value and the slot's final
   contents. */
#define DO_imm_r_to_mscalar(_opname, _imm, _src)   \
   {  \
      ULong _before = 0x5555555555555555ULL;  \
      ULong _slot   = _before;  \
      __asm__ __volatile__(  \
         "movupd (%[v]), %%xmm2"  "\n\t"  \
         _opname " $" #_imm ", %%xmm2, (%[m])"  "\n\t"  \
         : /*out*/ \
         : /*in*/ [v] "r"(&(_src)), [m] "r"(&_slot)  \
         : "cc", "memory", "xmm2"  \
      );  \
      showIAG("m", (_opname), (_imm), &(_src), _before, _slot);  \
   }
359
/* Runs the register-destination test (DO_imm_r_to_rscalar) and then
   the memory-destination test (DO_imm_r_to_mscalar) for the same
   instruction, immediate and source vector.  The register-name
   suffix is only used by the register-destination variant. */
#define DO_imm_r_to_mandrscalar(_op, _immediate, _srcv, _suffix)   \
   DO_imm_r_to_rscalar( _op, _immediate, _srcv, _suffix )          \
   DO_imm_r_to_mscalar( _op, _immediate, _srcv )
363
364
365
366
367
368
369
370
/* "<_opname> $<_imm>, general register, xmm" test.
   Starts from a destination vector of all-0x55 bytes in %xmm2 and
   the 64-bit value of _src in %r11, executes the instruction with
   %r11<_srcsuffix> as source and %xmm2 as destination, then stores
   %xmm2 and prints an "r" line via showIGVV showing the scalar, the
   initial destination and the result.  _srcsuffix is a string
   literal appended to the register name "%r11". */
#define DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix)       \
   {  \
      V128  _vbefore;  \
      V128  _vafter;   \
      ULong _scalar = (ULong)(_src);  \
      memcpy(&_vbefore, &fives,  sizeof(V128));  \
      memcpy(&_vafter,  &zeroes, sizeof(V128));  \
      /* r11 is used explicitly below, so gcc must not pick it */ \
      /* for any operand.  Listing r11 among the clobbers is */ \
      /* what is relied on to ensure that. */ \
      __asm__ __volatile__(  \
         "movupd (%[d]), %%xmm2"  "\n\t"  \
         "movq   (%[g]), %%r11"   "\n\t"  \
         _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2"   "\n\t"  \
         "movupd  %%xmm2, (%[r])" "\n"  \
         : /*out*/ \
         : /*in*/ [d] "r"(&_vbefore), [g] "r"(&_scalar), [r] "r"(&_vafter)  \
         : "cc", "memory", "xmm2", "r11"  \
      );  \
      showIGVV("r", (_opname), (_imm), _scalar, &_vbefore, &_vafter); \
   }
/* "<_opname> $<_imm>, memory, xmm" test.
   Starts from a destination vector of all-0x55 bytes in %xmm2,
   executes the instruction with a 64-bit stack slot holding the
   value of _src as source and %xmm2 as destination, then stores
   %xmm2 and prints an "m" line via showIGVV showing the scalar, the
   initial destination and the result. */
#define DO_imm_mscalar_to_r(_opname, _imm, _src)       \
   {  \
      V128  _vbefore;  \
      V128  _vafter;   \
      ULong _scalar = (ULong)(_src);  \
      memcpy(&_vbefore, &fives,  sizeof(V128));  \
      memcpy(&_vafter,  &zeroes, sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%[d]), %%xmm2"  "\n\t"  \
         _opname " $" #_imm ", (%[g]), %%xmm2"   "\n\t"  \
         "movupd  %%xmm2, (%[r])" "\n"  \
         : /*out*/ \
         : /*in*/ [d] "r"(&_vbefore), [g] "r"(&_scalar), [r] "r"(&_vafter)  \
         : "cc", "memory", "xmm2"  \
      );  \
      showIGVV("m", (_opname), (_imm), _scalar, &_vbefore, &_vafter); \
   }
409
/* Runs the register-source test (DO_imm_rscalar_to_r) and then the
   memory-source test (DO_imm_mscalar_to_r) for the same instruction,
   immediate and scalar.  The register-name suffix is only used by
   the register-source variant. */
#define DO_imm_mandrscalar_to_r(_op, _immediate, _scalar, _srcsuffix)   \
   DO_imm_rscalar_to_r( _op, _immediate, _scalar, _srcsuffix )          \
   DO_imm_mscalar_to_r( _op, _immediate, _scalar )
413
414
415
416
417
418void test_BLENDPD ( void )
419{
420   V128 src, dst;
421   Int i;
422   for (i = 0; i < 10; i++) {
423      randV128(&src);
424      randV128(&dst);
425      DO_imm_mandr_r("blendpd", 0, src, dst);
426      DO_imm_mandr_r("blendpd", 1, src, dst);
427      DO_imm_mandr_r("blendpd", 2, src, dst);
428      DO_imm_mandr_r("blendpd", 3, src, dst);
429   }
430}
431
432void test_BLENDPS ( void )
433{
434   V128 src, dst;
435   Int i;
436   for (i = 0; i < 10; i++) {
437      randV128(&src);
438      randV128(&dst);
439      DO_imm_mandr_r("blendps", 0, src, dst);
440      DO_imm_mandr_r("blendps", 1, src, dst);
441      DO_imm_mandr_r("blendps", 2, src, dst);
442      DO_imm_mandr_r("blendps", 3, src, dst);
443      DO_imm_mandr_r("blendps", 4, src, dst);
444      DO_imm_mandr_r("blendps", 5, src, dst);
445      DO_imm_mandr_r("blendps", 6, src, dst);
446      DO_imm_mandr_r("blendps", 7, src, dst);
447      DO_imm_mandr_r("blendps", 8, src, dst);
448      DO_imm_mandr_r("blendps", 9, src, dst);
449      DO_imm_mandr_r("blendps", 10, src, dst);
450      DO_imm_mandr_r("blendps", 11, src, dst);
451      DO_imm_mandr_r("blendps", 12, src, dst);
452      DO_imm_mandr_r("blendps", 13, src, dst);
453      DO_imm_mandr_r("blendps", 14, src, dst);
454      DO_imm_mandr_r("blendps", 15, src, dst);
455   }
456}
457
458void test_DPPD ( void )
459{
460   V128 src, dst;
461   {
462      *(double*)(&src[0]) =  1.2345;
463      *(double*)(&src[8]) = -6.78910;
464      *(double*)(&dst[0]) = -11.121314;
465      *(double*)(&dst[8]) =  15.161718;
466      DO_imm_mandr_r("dppd", 0, src, dst);
467      DO_imm_mandr_r("dppd", 1, src, dst);
468      DO_imm_mandr_r("dppd", 2, src, dst);
469      DO_imm_mandr_r("dppd", 3, src, dst);
470      DO_imm_mandr_r("dppd", 4, src, dst);
471      DO_imm_mandr_r("dppd", 5, src, dst);
472      DO_imm_mandr_r("dppd", 6, src, dst);
473      DO_imm_mandr_r("dppd", 7, src, dst);
474      DO_imm_mandr_r("dppd", 8, src, dst);
475      DO_imm_mandr_r("dppd", 9, src, dst);
476      DO_imm_mandr_r("dppd", 10, src, dst);
477      DO_imm_mandr_r("dppd", 11, src, dst);
478      DO_imm_mandr_r("dppd", 12, src, dst);
479      DO_imm_mandr_r("dppd", 13, src, dst);
480      DO_imm_mandr_r("dppd", 14, src, dst);
481      DO_imm_mandr_r("dppd", 15, src, dst);
482      DO_imm_mandr_r("dppd", 16, src, dst);
483      DO_imm_mandr_r("dppd", 17, src, dst);
484      DO_imm_mandr_r("dppd", 18, src, dst);
485      DO_imm_mandr_r("dppd", 19, src, dst);
486      DO_imm_mandr_r("dppd", 20, src, dst);
487      DO_imm_mandr_r("dppd", 21, src, dst);
488      DO_imm_mandr_r("dppd", 22, src, dst);
489      DO_imm_mandr_r("dppd", 23, src, dst);
490      DO_imm_mandr_r("dppd", 24, src, dst);
491      DO_imm_mandr_r("dppd", 25, src, dst);
492      DO_imm_mandr_r("dppd", 26, src, dst);
493      DO_imm_mandr_r("dppd", 27, src, dst);
494      DO_imm_mandr_r("dppd", 28, src, dst);
495      DO_imm_mandr_r("dppd", 29, src, dst);
496      DO_imm_mandr_r("dppd", 30, src, dst);
497      DO_imm_mandr_r("dppd", 31, src, dst);
498      DO_imm_mandr_r("dppd", 32, src, dst);
499      DO_imm_mandr_r("dppd", 33, src, dst);
500      DO_imm_mandr_r("dppd", 34, src, dst);
501      DO_imm_mandr_r("dppd", 35, src, dst);
502      DO_imm_mandr_r("dppd", 36, src, dst);
503      DO_imm_mandr_r("dppd", 37, src, dst);
504      DO_imm_mandr_r("dppd", 38, src, dst);
505      DO_imm_mandr_r("dppd", 39, src, dst);
506      DO_imm_mandr_r("dppd", 40, src, dst);
507      DO_imm_mandr_r("dppd", 41, src, dst);
508      DO_imm_mandr_r("dppd", 42, src, dst);
509      DO_imm_mandr_r("dppd", 43, src, dst);
510      DO_imm_mandr_r("dppd", 44, src, dst);
511      DO_imm_mandr_r("dppd", 45, src, dst);
512      DO_imm_mandr_r("dppd", 46, src, dst);
513      DO_imm_mandr_r("dppd", 47, src, dst);
514      DO_imm_mandr_r("dppd", 48, src, dst);
515      DO_imm_mandr_r("dppd", 49, src, dst);
516      DO_imm_mandr_r("dppd", 50, src, dst);
517      DO_imm_mandr_r("dppd", 51, src, dst);
518      DO_imm_mandr_r("dppd", 52, src, dst);
519      DO_imm_mandr_r("dppd", 53, src, dst);
520      DO_imm_mandr_r("dppd", 54, src, dst);
521      DO_imm_mandr_r("dppd", 55, src, dst);
522      DO_imm_mandr_r("dppd", 56, src, dst);
523      DO_imm_mandr_r("dppd", 57, src, dst);
524      DO_imm_mandr_r("dppd", 58, src, dst);
525      DO_imm_mandr_r("dppd", 59, src, dst);
526      DO_imm_mandr_r("dppd", 60, src, dst);
527      DO_imm_mandr_r("dppd", 61, src, dst);
528      DO_imm_mandr_r("dppd", 62, src, dst);
529      DO_imm_mandr_r("dppd", 63, src, dst);
530      DO_imm_mandr_r("dppd", 64, src, dst);
531      DO_imm_mandr_r("dppd", 65, src, dst);
532      DO_imm_mandr_r("dppd", 66, src, dst);
533      DO_imm_mandr_r("dppd", 67, src, dst);
534      DO_imm_mandr_r("dppd", 68, src, dst);
535      DO_imm_mandr_r("dppd", 69, src, dst);
536      DO_imm_mandr_r("dppd", 70, src, dst);
537      DO_imm_mandr_r("dppd", 71, src, dst);
538      DO_imm_mandr_r("dppd", 72, src, dst);
539      DO_imm_mandr_r("dppd", 73, src, dst);
540      DO_imm_mandr_r("dppd", 74, src, dst);
541      DO_imm_mandr_r("dppd", 75, src, dst);
542      DO_imm_mandr_r("dppd", 76, src, dst);
543      DO_imm_mandr_r("dppd", 77, src, dst);
544      DO_imm_mandr_r("dppd", 78, src, dst);
545      DO_imm_mandr_r("dppd", 79, src, dst);
546      DO_imm_mandr_r("dppd", 80, src, dst);
547      DO_imm_mandr_r("dppd", 81, src, dst);
548      DO_imm_mandr_r("dppd", 82, src, dst);
549      DO_imm_mandr_r("dppd", 83, src, dst);
550      DO_imm_mandr_r("dppd", 84, src, dst);
551      DO_imm_mandr_r("dppd", 85, src, dst);
552      DO_imm_mandr_r("dppd", 86, src, dst);
553      DO_imm_mandr_r("dppd", 87, src, dst);
554      DO_imm_mandr_r("dppd", 88, src, dst);
555      DO_imm_mandr_r("dppd", 89, src, dst);
556      DO_imm_mandr_r("dppd", 90, src, dst);
557      DO_imm_mandr_r("dppd", 91, src, dst);
558      DO_imm_mandr_r("dppd", 92, src, dst);
559      DO_imm_mandr_r("dppd", 93, src, dst);
560      DO_imm_mandr_r("dppd", 94, src, dst);
561      DO_imm_mandr_r("dppd", 95, src, dst);
562      DO_imm_mandr_r("dppd", 96, src, dst);
563      DO_imm_mandr_r("dppd", 97, src, dst);
564      DO_imm_mandr_r("dppd", 98, src, dst);
565      DO_imm_mandr_r("dppd", 99, src, dst);
566      DO_imm_mandr_r("dppd", 100, src, dst);
567      DO_imm_mandr_r("dppd", 101, src, dst);
568      DO_imm_mandr_r("dppd", 102, src, dst);
569      DO_imm_mandr_r("dppd", 103, src, dst);
570      DO_imm_mandr_r("dppd", 104, src, dst);
571      DO_imm_mandr_r("dppd", 105, src, dst);
572      DO_imm_mandr_r("dppd", 106, src, dst);
573      DO_imm_mandr_r("dppd", 107, src, dst);
574      DO_imm_mandr_r("dppd", 108, src, dst);
575      DO_imm_mandr_r("dppd", 109, src, dst);
576      DO_imm_mandr_r("dppd", 110, src, dst);
577      DO_imm_mandr_r("dppd", 111, src, dst);
578      DO_imm_mandr_r("dppd", 112, src, dst);
579      DO_imm_mandr_r("dppd", 113, src, dst);
580      DO_imm_mandr_r("dppd", 114, src, dst);
581      DO_imm_mandr_r("dppd", 115, src, dst);
582      DO_imm_mandr_r("dppd", 116, src, dst);
583      DO_imm_mandr_r("dppd", 117, src, dst);
584      DO_imm_mandr_r("dppd", 118, src, dst);
585      DO_imm_mandr_r("dppd", 119, src, dst);
586      DO_imm_mandr_r("dppd", 120, src, dst);
587      DO_imm_mandr_r("dppd", 121, src, dst);
588      DO_imm_mandr_r("dppd", 122, src, dst);
589      DO_imm_mandr_r("dppd", 123, src, dst);
590      DO_imm_mandr_r("dppd", 124, src, dst);
591      DO_imm_mandr_r("dppd", 125, src, dst);
592      DO_imm_mandr_r("dppd", 126, src, dst);
593      DO_imm_mandr_r("dppd", 127, src, dst);
594      DO_imm_mandr_r("dppd", 128, src, dst);
595      DO_imm_mandr_r("dppd", 129, src, dst);
596      DO_imm_mandr_r("dppd", 130, src, dst);
597      DO_imm_mandr_r("dppd", 131, src, dst);
598      DO_imm_mandr_r("dppd", 132, src, dst);
599      DO_imm_mandr_r("dppd", 133, src, dst);
600      DO_imm_mandr_r("dppd", 134, src, dst);
601      DO_imm_mandr_r("dppd", 135, src, dst);
602      DO_imm_mandr_r("dppd", 136, src, dst);
603      DO_imm_mandr_r("dppd", 137, src, dst);
604      DO_imm_mandr_r("dppd", 138, src, dst);
605      DO_imm_mandr_r("dppd", 139, src, dst);
606      DO_imm_mandr_r("dppd", 140, src, dst);
607      DO_imm_mandr_r("dppd", 141, src, dst);
608      DO_imm_mandr_r("dppd", 142, src, dst);
609      DO_imm_mandr_r("dppd", 143, src, dst);
610      DO_imm_mandr_r("dppd", 144, src, dst);
611      DO_imm_mandr_r("dppd", 145, src, dst);
612      DO_imm_mandr_r("dppd", 146, src, dst);
613      DO_imm_mandr_r("dppd", 147, src, dst);
614      DO_imm_mandr_r("dppd", 148, src, dst);
615      DO_imm_mandr_r("dppd", 149, src, dst);
616      DO_imm_mandr_r("dppd", 150, src, dst);
617      DO_imm_mandr_r("dppd", 151, src, dst);
618      DO_imm_mandr_r("dppd", 152, src, dst);
619      DO_imm_mandr_r("dppd", 153, src, dst);
620      DO_imm_mandr_r("dppd", 154, src, dst);
621      DO_imm_mandr_r("dppd", 155, src, dst);
622      DO_imm_mandr_r("dppd", 156, src, dst);
623      DO_imm_mandr_r("dppd", 157, src, dst);
624      DO_imm_mandr_r("dppd", 158, src, dst);
625      DO_imm_mandr_r("dppd", 159, src, dst);
626      DO_imm_mandr_r("dppd", 160, src, dst);
627      DO_imm_mandr_r("dppd", 161, src, dst);
628      DO_imm_mandr_r("dppd", 162, src, dst);
629      DO_imm_mandr_r("dppd", 163, src, dst);
630      DO_imm_mandr_r("dppd", 164, src, dst);
631      DO_imm_mandr_r("dppd", 165, src, dst);
632      DO_imm_mandr_r("dppd", 166, src, dst);
633      DO_imm_mandr_r("dppd", 167, src, dst);
634      DO_imm_mandr_r("dppd", 168, src, dst);
635      DO_imm_mandr_r("dppd", 169, src, dst);
636      DO_imm_mandr_r("dppd", 170, src, dst);
637      DO_imm_mandr_r("dppd", 171, src, dst);
638      DO_imm_mandr_r("dppd", 172, src, dst);
639      DO_imm_mandr_r("dppd", 173, src, dst);
640      DO_imm_mandr_r("dppd", 174, src, dst);
641      DO_imm_mandr_r("dppd", 175, src, dst);
642      DO_imm_mandr_r("dppd", 176, src, dst);
643      DO_imm_mandr_r("dppd", 177, src, dst);
644      DO_imm_mandr_r("dppd", 178, src, dst);
645      DO_imm_mandr_r("dppd", 179, src, dst);
646      DO_imm_mandr_r("dppd", 180, src, dst);
647      DO_imm_mandr_r("dppd", 181, src, dst);
648      DO_imm_mandr_r("dppd", 182, src, dst);
649      DO_imm_mandr_r("dppd", 183, src, dst);
650      DO_imm_mandr_r("dppd", 184, src, dst);
651      DO_imm_mandr_r("dppd", 185, src, dst);
652      DO_imm_mandr_r("dppd", 186, src, dst);
653      DO_imm_mandr_r("dppd", 187, src, dst);
654      DO_imm_mandr_r("dppd", 188, src, dst);
655      DO_imm_mandr_r("dppd", 189, src, dst);
656      DO_imm_mandr_r("dppd", 190, src, dst);
657      DO_imm_mandr_r("dppd", 191, src, dst);
658      DO_imm_mandr_r("dppd", 192, src, dst);
659      DO_imm_mandr_r("dppd", 193, src, dst);
660      DO_imm_mandr_r("dppd", 194, src, dst);
661      DO_imm_mandr_r("dppd", 195, src, dst);
662      DO_imm_mandr_r("dppd", 196, src, dst);
663      DO_imm_mandr_r("dppd", 197, src, dst);
664      DO_imm_mandr_r("dppd", 198, src, dst);
665      DO_imm_mandr_r("dppd", 199, src, dst);
666      DO_imm_mandr_r("dppd", 200, src, dst);
667      DO_imm_mandr_r("dppd", 201, src, dst);
668      DO_imm_mandr_r("dppd", 202, src, dst);
669      DO_imm_mandr_r("dppd", 203, src, dst);
670      DO_imm_mandr_r("dppd", 204, src, dst);
671      DO_imm_mandr_r("dppd", 205, src, dst);
672      DO_imm_mandr_r("dppd", 206, src, dst);
673      DO_imm_mandr_r("dppd", 207, src, dst);
674      DO_imm_mandr_r("dppd", 208, src, dst);
675      DO_imm_mandr_r("dppd", 209, src, dst);
676      DO_imm_mandr_r("dppd", 210, src, dst);
677      DO_imm_mandr_r("dppd", 211, src, dst);
678      DO_imm_mandr_r("dppd", 212, src, dst);
679      DO_imm_mandr_r("dppd", 213, src, dst);
680      DO_imm_mandr_r("dppd", 214, src, dst);
681      DO_imm_mandr_r("dppd", 215, src, dst);
682      DO_imm_mandr_r("dppd", 216, src, dst);
683      DO_imm_mandr_r("dppd", 217, src, dst);
684      DO_imm_mandr_r("dppd", 218, src, dst);
685      DO_imm_mandr_r("dppd", 219, src, dst);
686      DO_imm_mandr_r("dppd", 220, src, dst);
687      DO_imm_mandr_r("dppd", 221, src, dst);
688      DO_imm_mandr_r("dppd", 222, src, dst);
689      DO_imm_mandr_r("dppd", 223, src, dst);
690      DO_imm_mandr_r("dppd", 224, src, dst);
691      DO_imm_mandr_r("dppd", 225, src, dst);
692      DO_imm_mandr_r("dppd", 226, src, dst);
693      DO_imm_mandr_r("dppd", 227, src, dst);
694      DO_imm_mandr_r("dppd", 228, src, dst);
695      DO_imm_mandr_r("dppd", 229, src, dst);
696      DO_imm_mandr_r("dppd", 230, src, dst);
697      DO_imm_mandr_r("dppd", 231, src, dst);
698      DO_imm_mandr_r("dppd", 232, src, dst);
699      DO_imm_mandr_r("dppd", 233, src, dst);
700      DO_imm_mandr_r("dppd", 234, src, dst);
701      DO_imm_mandr_r("dppd", 235, src, dst);
702      DO_imm_mandr_r("dppd", 236, src, dst);
703      DO_imm_mandr_r("dppd", 237, src, dst);
704      DO_imm_mandr_r("dppd", 238, src, dst);
705      DO_imm_mandr_r("dppd", 239, src, dst);
706      DO_imm_mandr_r("dppd", 240, src, dst);
707      DO_imm_mandr_r("dppd", 241, src, dst);
708      DO_imm_mandr_r("dppd", 242, src, dst);
709      DO_imm_mandr_r("dppd", 243, src, dst);
710      DO_imm_mandr_r("dppd", 244, src, dst);
711      DO_imm_mandr_r("dppd", 245, src, dst);
712      DO_imm_mandr_r("dppd", 246, src, dst);
713      DO_imm_mandr_r("dppd", 247, src, dst);
714      DO_imm_mandr_r("dppd", 248, src, dst);
715      DO_imm_mandr_r("dppd", 249, src, dst);
716      DO_imm_mandr_r("dppd", 250, src, dst);
717      DO_imm_mandr_r("dppd", 251, src, dst);
718      DO_imm_mandr_r("dppd", 252, src, dst);
719      DO_imm_mandr_r("dppd", 253, src, dst);
720      DO_imm_mandr_r("dppd", 254, src, dst);
721      DO_imm_mandr_r("dppd", 255, src, dst);
722   }
723}
724
725void test_DPPS ( void )
726{
727   V128 src, dst;
728   {
729      *(float*)(&src[0])  =   1.2;
730      *(float*)(&src[4])  =  -3.4;
731      *(float*)(&src[8])  =  -6.7;
732      *(float*)(&src[12]) =   8.9;
733      *(float*)(&dst[0])  = -10.11;
734      *(float*)(&dst[4])  =  12.13;
735      *(float*)(&dst[8])  =  14.15;
736      *(float*)(&dst[12]) = -16.17;
737      DO_imm_mandr_r("dpps", 0, src, dst);
738      DO_imm_mandr_r("dpps", 1, src, dst);
739      DO_imm_mandr_r("dpps", 2, src, dst);
740      DO_imm_mandr_r("dpps", 3, src, dst);
741      DO_imm_mandr_r("dpps", 4, src, dst);
742      DO_imm_mandr_r("dpps", 5, src, dst);
743      DO_imm_mandr_r("dpps", 6, src, dst);
744      DO_imm_mandr_r("dpps", 7, src, dst);
745      DO_imm_mandr_r("dpps", 8, src, dst);
746      DO_imm_mandr_r("dpps", 9, src, dst);
747      DO_imm_mandr_r("dpps", 10, src, dst);
748      DO_imm_mandr_r("dpps", 11, src, dst);
749      DO_imm_mandr_r("dpps", 12, src, dst);
750      DO_imm_mandr_r("dpps", 13, src, dst);
751      DO_imm_mandr_r("dpps", 14, src, dst);
752      DO_imm_mandr_r("dpps", 15, src, dst);
753      DO_imm_mandr_r("dpps", 16, src, dst);
754      DO_imm_mandr_r("dpps", 17, src, dst);
755      DO_imm_mandr_r("dpps", 18, src, dst);
756      DO_imm_mandr_r("dpps", 19, src, dst);
757      DO_imm_mandr_r("dpps", 20, src, dst);
758      DO_imm_mandr_r("dpps", 21, src, dst);
759      DO_imm_mandr_r("dpps", 22, src, dst);
760      DO_imm_mandr_r("dpps", 23, src, dst);
761      DO_imm_mandr_r("dpps", 24, src, dst);
762      DO_imm_mandr_r("dpps", 25, src, dst);
763      DO_imm_mandr_r("dpps", 26, src, dst);
764      DO_imm_mandr_r("dpps", 27, src, dst);
765      DO_imm_mandr_r("dpps", 28, src, dst);
766      DO_imm_mandr_r("dpps", 29, src, dst);
767      DO_imm_mandr_r("dpps", 30, src, dst);
768      DO_imm_mandr_r("dpps", 31, src, dst);
769      DO_imm_mandr_r("dpps", 32, src, dst);
770      DO_imm_mandr_r("dpps", 33, src, dst);
771      DO_imm_mandr_r("dpps", 34, src, dst);
772      DO_imm_mandr_r("dpps", 35, src, dst);
773      DO_imm_mandr_r("dpps", 36, src, dst);
774      DO_imm_mandr_r("dpps", 37, src, dst);
775      DO_imm_mandr_r("dpps", 38, src, dst);
776      DO_imm_mandr_r("dpps", 39, src, dst);
777      DO_imm_mandr_r("dpps", 40, src, dst);
778      DO_imm_mandr_r("dpps", 41, src, dst);
779      DO_imm_mandr_r("dpps", 42, src, dst);
780      DO_imm_mandr_r("dpps", 43, src, dst);
781      DO_imm_mandr_r("dpps", 44, src, dst);
782      DO_imm_mandr_r("dpps", 45, src, dst);
783      DO_imm_mandr_r("dpps", 46, src, dst);
784      DO_imm_mandr_r("dpps", 47, src, dst);
785      DO_imm_mandr_r("dpps", 48, src, dst);
786      DO_imm_mandr_r("dpps", 49, src, dst);
787      DO_imm_mandr_r("dpps", 50, src, dst);
788      DO_imm_mandr_r("dpps", 51, src, dst);
789      DO_imm_mandr_r("dpps", 52, src, dst);
790      DO_imm_mandr_r("dpps", 53, src, dst);
791      DO_imm_mandr_r("dpps", 54, src, dst);
792      DO_imm_mandr_r("dpps", 55, src, dst);
793      DO_imm_mandr_r("dpps", 56, src, dst);
794      DO_imm_mandr_r("dpps", 57, src, dst);
795      DO_imm_mandr_r("dpps", 58, src, dst);
796      DO_imm_mandr_r("dpps", 59, src, dst);
797      DO_imm_mandr_r("dpps", 60, src, dst);
798      DO_imm_mandr_r("dpps", 61, src, dst);
799      DO_imm_mandr_r("dpps", 62, src, dst);
800      DO_imm_mandr_r("dpps", 63, src, dst);
801      DO_imm_mandr_r("dpps", 64, src, dst);
802      DO_imm_mandr_r("dpps", 65, src, dst);
803      DO_imm_mandr_r("dpps", 66, src, dst);
804      DO_imm_mandr_r("dpps", 67, src, dst);
805      DO_imm_mandr_r("dpps", 68, src, dst);
806      DO_imm_mandr_r("dpps", 69, src, dst);
807      DO_imm_mandr_r("dpps", 70, src, dst);
808      DO_imm_mandr_r("dpps", 71, src, dst);
809      DO_imm_mandr_r("dpps", 72, src, dst);
810      DO_imm_mandr_r("dpps", 73, src, dst);
811      DO_imm_mandr_r("dpps", 74, src, dst);
812      DO_imm_mandr_r("dpps", 75, src, dst);
813      DO_imm_mandr_r("dpps", 76, src, dst);
814      DO_imm_mandr_r("dpps", 77, src, dst);
815      DO_imm_mandr_r("dpps", 78, src, dst);
816      DO_imm_mandr_r("dpps", 79, src, dst);
817      DO_imm_mandr_r("dpps", 80, src, dst);
818      DO_imm_mandr_r("dpps", 81, src, dst);
819      DO_imm_mandr_r("dpps", 82, src, dst);
820      DO_imm_mandr_r("dpps", 83, src, dst);
821      DO_imm_mandr_r("dpps", 84, src, dst);
822      DO_imm_mandr_r("dpps", 85, src, dst);
823      DO_imm_mandr_r("dpps", 86, src, dst);
824      DO_imm_mandr_r("dpps", 87, src, dst);
825      DO_imm_mandr_r("dpps", 88, src, dst);
826      DO_imm_mandr_r("dpps", 89, src, dst);
827      DO_imm_mandr_r("dpps", 90, src, dst);
828      DO_imm_mandr_r("dpps", 91, src, dst);
829      DO_imm_mandr_r("dpps", 92, src, dst);
830      DO_imm_mandr_r("dpps", 93, src, dst);
831      DO_imm_mandr_r("dpps", 94, src, dst);
832      DO_imm_mandr_r("dpps", 95, src, dst);
833      DO_imm_mandr_r("dpps", 96, src, dst);
834      DO_imm_mandr_r("dpps", 97, src, dst);
835      DO_imm_mandr_r("dpps", 98, src, dst);
836      DO_imm_mandr_r("dpps", 99, src, dst);
837      DO_imm_mandr_r("dpps", 100, src, dst);
838      DO_imm_mandr_r("dpps", 101, src, dst);
839      DO_imm_mandr_r("dpps", 102, src, dst);
840      DO_imm_mandr_r("dpps", 103, src, dst);
841      DO_imm_mandr_r("dpps", 104, src, dst);
842      DO_imm_mandr_r("dpps", 105, src, dst);
843      DO_imm_mandr_r("dpps", 106, src, dst);
844      DO_imm_mandr_r("dpps", 107, src, dst);
845      DO_imm_mandr_r("dpps", 108, src, dst);
846      DO_imm_mandr_r("dpps", 109, src, dst);
847      DO_imm_mandr_r("dpps", 110, src, dst);
848      DO_imm_mandr_r("dpps", 111, src, dst);
849      DO_imm_mandr_r("dpps", 112, src, dst);
850      DO_imm_mandr_r("dpps", 113, src, dst);
851      DO_imm_mandr_r("dpps", 114, src, dst);
852      DO_imm_mandr_r("dpps", 115, src, dst);
853      DO_imm_mandr_r("dpps", 116, src, dst);
854      DO_imm_mandr_r("dpps", 117, src, dst);
855      DO_imm_mandr_r("dpps", 118, src, dst);
856      DO_imm_mandr_r("dpps", 119, src, dst);
857      DO_imm_mandr_r("dpps", 120, src, dst);
858      DO_imm_mandr_r("dpps", 121, src, dst);
859      DO_imm_mandr_r("dpps", 122, src, dst);
860      DO_imm_mandr_r("dpps", 123, src, dst);
861      DO_imm_mandr_r("dpps", 124, src, dst);
862      DO_imm_mandr_r("dpps", 125, src, dst);
863      DO_imm_mandr_r("dpps", 126, src, dst);
864      DO_imm_mandr_r("dpps", 127, src, dst);
865      DO_imm_mandr_r("dpps", 128, src, dst);
866      DO_imm_mandr_r("dpps", 129, src, dst);
867      DO_imm_mandr_r("dpps", 130, src, dst);
868      DO_imm_mandr_r("dpps", 131, src, dst);
869      DO_imm_mandr_r("dpps", 132, src, dst);
870      DO_imm_mandr_r("dpps", 133, src, dst);
871      DO_imm_mandr_r("dpps", 134, src, dst);
872      DO_imm_mandr_r("dpps", 135, src, dst);
873      DO_imm_mandr_r("dpps", 136, src, dst);
874      DO_imm_mandr_r("dpps", 137, src, dst);
875      DO_imm_mandr_r("dpps", 138, src, dst);
876      DO_imm_mandr_r("dpps", 139, src, dst);
877      DO_imm_mandr_r("dpps", 140, src, dst);
878      DO_imm_mandr_r("dpps", 141, src, dst);
879      DO_imm_mandr_r("dpps", 142, src, dst);
880      DO_imm_mandr_r("dpps", 143, src, dst);
881      DO_imm_mandr_r("dpps", 144, src, dst);
882      DO_imm_mandr_r("dpps", 145, src, dst);
883      DO_imm_mandr_r("dpps", 146, src, dst);
884      DO_imm_mandr_r("dpps", 147, src, dst);
885      DO_imm_mandr_r("dpps", 148, src, dst);
886      DO_imm_mandr_r("dpps", 149, src, dst);
887      DO_imm_mandr_r("dpps", 150, src, dst);
888      DO_imm_mandr_r("dpps", 151, src, dst);
889      DO_imm_mandr_r("dpps", 152, src, dst);
890      DO_imm_mandr_r("dpps", 153, src, dst);
891      DO_imm_mandr_r("dpps", 154, src, dst);
892      DO_imm_mandr_r("dpps", 155, src, dst);
893      DO_imm_mandr_r("dpps", 156, src, dst);
894      DO_imm_mandr_r("dpps", 157, src, dst);
895      DO_imm_mandr_r("dpps", 158, src, dst);
896      DO_imm_mandr_r("dpps", 159, src, dst);
897      DO_imm_mandr_r("dpps", 160, src, dst);
898      DO_imm_mandr_r("dpps", 161, src, dst);
899      DO_imm_mandr_r("dpps", 162, src, dst);
900      DO_imm_mandr_r("dpps", 163, src, dst);
901      DO_imm_mandr_r("dpps", 164, src, dst);
902      DO_imm_mandr_r("dpps", 165, src, dst);
903      DO_imm_mandr_r("dpps", 166, src, dst);
904      DO_imm_mandr_r("dpps", 167, src, dst);
905      DO_imm_mandr_r("dpps", 168, src, dst);
906      DO_imm_mandr_r("dpps", 169, src, dst);
907      DO_imm_mandr_r("dpps", 170, src, dst);
908      DO_imm_mandr_r("dpps", 171, src, dst);
909      DO_imm_mandr_r("dpps", 172, src, dst);
910      DO_imm_mandr_r("dpps", 173, src, dst);
911      DO_imm_mandr_r("dpps", 174, src, dst);
912      DO_imm_mandr_r("dpps", 175, src, dst);
913      DO_imm_mandr_r("dpps", 176, src, dst);
914      DO_imm_mandr_r("dpps", 177, src, dst);
915      DO_imm_mandr_r("dpps", 178, src, dst);
916      DO_imm_mandr_r("dpps", 179, src, dst);
917      DO_imm_mandr_r("dpps", 180, src, dst);
918      DO_imm_mandr_r("dpps", 181, src, dst);
919      DO_imm_mandr_r("dpps", 182, src, dst);
920      DO_imm_mandr_r("dpps", 183, src, dst);
921      DO_imm_mandr_r("dpps", 184, src, dst);
922      DO_imm_mandr_r("dpps", 185, src, dst);
923      DO_imm_mandr_r("dpps", 186, src, dst);
924      DO_imm_mandr_r("dpps", 187, src, dst);
925      DO_imm_mandr_r("dpps", 188, src, dst);
926      DO_imm_mandr_r("dpps", 189, src, dst);
927      DO_imm_mandr_r("dpps", 190, src, dst);
928      DO_imm_mandr_r("dpps", 191, src, dst);
929      DO_imm_mandr_r("dpps", 192, src, dst);
930      DO_imm_mandr_r("dpps", 193, src, dst);
931      DO_imm_mandr_r("dpps", 194, src, dst);
932      DO_imm_mandr_r("dpps", 195, src, dst);
933      DO_imm_mandr_r("dpps", 196, src, dst);
934      DO_imm_mandr_r("dpps", 197, src, dst);
935      DO_imm_mandr_r("dpps", 198, src, dst);
936      DO_imm_mandr_r("dpps", 199, src, dst);
937      DO_imm_mandr_r("dpps", 200, src, dst);
938      DO_imm_mandr_r("dpps", 201, src, dst);
939      DO_imm_mandr_r("dpps", 202, src, dst);
940      DO_imm_mandr_r("dpps", 203, src, dst);
941      DO_imm_mandr_r("dpps", 204, src, dst);
942      DO_imm_mandr_r("dpps", 205, src, dst);
943      DO_imm_mandr_r("dpps", 206, src, dst);
944      DO_imm_mandr_r("dpps", 207, src, dst);
945      DO_imm_mandr_r("dpps", 208, src, dst);
946      DO_imm_mandr_r("dpps", 209, src, dst);
947      DO_imm_mandr_r("dpps", 210, src, dst);
948      DO_imm_mandr_r("dpps", 211, src, dst);
949      DO_imm_mandr_r("dpps", 212, src, dst);
950      DO_imm_mandr_r("dpps", 213, src, dst);
951      DO_imm_mandr_r("dpps", 214, src, dst);
952      DO_imm_mandr_r("dpps", 215, src, dst);
953      DO_imm_mandr_r("dpps", 216, src, dst);
954      DO_imm_mandr_r("dpps", 217, src, dst);
955      DO_imm_mandr_r("dpps", 218, src, dst);
956      DO_imm_mandr_r("dpps", 219, src, dst);
957      DO_imm_mandr_r("dpps", 220, src, dst);
958      DO_imm_mandr_r("dpps", 221, src, dst);
959      DO_imm_mandr_r("dpps", 222, src, dst);
960      DO_imm_mandr_r("dpps", 223, src, dst);
961      DO_imm_mandr_r("dpps", 224, src, dst);
962      DO_imm_mandr_r("dpps", 225, src, dst);
963      DO_imm_mandr_r("dpps", 226, src, dst);
964      DO_imm_mandr_r("dpps", 227, src, dst);
965      DO_imm_mandr_r("dpps", 228, src, dst);
966      DO_imm_mandr_r("dpps", 229, src, dst);
967      DO_imm_mandr_r("dpps", 230, src, dst);
968      DO_imm_mandr_r("dpps", 231, src, dst);
969      DO_imm_mandr_r("dpps", 232, src, dst);
970      DO_imm_mandr_r("dpps", 233, src, dst);
971      DO_imm_mandr_r("dpps", 234, src, dst);
972      DO_imm_mandr_r("dpps", 235, src, dst);
973      DO_imm_mandr_r("dpps", 236, src, dst);
974      DO_imm_mandr_r("dpps", 237, src, dst);
975      DO_imm_mandr_r("dpps", 238, src, dst);
976      DO_imm_mandr_r("dpps", 239, src, dst);
977      DO_imm_mandr_r("dpps", 240, src, dst);
978      DO_imm_mandr_r("dpps", 241, src, dst);
979      DO_imm_mandr_r("dpps", 242, src, dst);
980      DO_imm_mandr_r("dpps", 243, src, dst);
981      DO_imm_mandr_r("dpps", 244, src, dst);
982      DO_imm_mandr_r("dpps", 245, src, dst);
983      DO_imm_mandr_r("dpps", 246, src, dst);
984      DO_imm_mandr_r("dpps", 247, src, dst);
985      DO_imm_mandr_r("dpps", 248, src, dst);
986      DO_imm_mandr_r("dpps", 249, src, dst);
987      DO_imm_mandr_r("dpps", 250, src, dst);
988      DO_imm_mandr_r("dpps", 251, src, dst);
989      DO_imm_mandr_r("dpps", 252, src, dst);
990      DO_imm_mandr_r("dpps", 253, src, dst);
991      DO_imm_mandr_r("dpps", 254, src, dst);
992      DO_imm_mandr_r("dpps", 255, src, dst);
993   }
994}
995
996void test_INSERTPS ( void )
997{
998   V128 src, dst;
999   {
1000      *(float*)(&src[0])  =   1.2;
1001      *(float*)(&src[4])  =  -3.4;
1002      *(float*)(&src[8])  =  -6.7;
1003      *(float*)(&src[12]) =   8.9;
1004      *(float*)(&dst[0])  = -10.11;
1005      *(float*)(&dst[4])  =  12.13;
1006      *(float*)(&dst[8])  =  14.15;
1007      *(float*)(&dst[12]) = -16.17;
1008      DO_imm_mandr_r("insertps", 0, src, dst);
1009      DO_imm_mandr_r("insertps", 1, src, dst);
1010      DO_imm_mandr_r("insertps", 2, src, dst);
1011      DO_imm_mandr_r("insertps", 3, src, dst);
1012      DO_imm_mandr_r("insertps", 4, src, dst);
1013      DO_imm_mandr_r("insertps", 5, src, dst);
1014      DO_imm_mandr_r("insertps", 6, src, dst);
1015      DO_imm_mandr_r("insertps", 7, src, dst);
1016      DO_imm_mandr_r("insertps", 8, src, dst);
1017      DO_imm_mandr_r("insertps", 9, src, dst);
1018      DO_imm_mandr_r("insertps", 10, src, dst);
1019      DO_imm_mandr_r("insertps", 11, src, dst);
1020      DO_imm_mandr_r("insertps", 12, src, dst);
1021      DO_imm_mandr_r("insertps", 13, src, dst);
1022      DO_imm_mandr_r("insertps", 14, src, dst);
1023      DO_imm_mandr_r("insertps", 15, src, dst);
1024      DO_imm_mandr_r("insertps", 16, src, dst);
1025      DO_imm_mandr_r("insertps", 17, src, dst);
1026      DO_imm_mandr_r("insertps", 18, src, dst);
1027      DO_imm_mandr_r("insertps", 19, src, dst);
1028      DO_imm_mandr_r("insertps", 20, src, dst);
1029      DO_imm_mandr_r("insertps", 21, src, dst);
1030      DO_imm_mandr_r("insertps", 22, src, dst);
1031      DO_imm_mandr_r("insertps", 23, src, dst);
1032      DO_imm_mandr_r("insertps", 24, src, dst);
1033      DO_imm_mandr_r("insertps", 25, src, dst);
1034      DO_imm_mandr_r("insertps", 26, src, dst);
1035      DO_imm_mandr_r("insertps", 27, src, dst);
1036      DO_imm_mandr_r("insertps", 28, src, dst);
1037      DO_imm_mandr_r("insertps", 29, src, dst);
1038      DO_imm_mandr_r("insertps", 30, src, dst);
1039      DO_imm_mandr_r("insertps", 31, src, dst);
1040      DO_imm_mandr_r("insertps", 32, src, dst);
1041      DO_imm_mandr_r("insertps", 33, src, dst);
1042      DO_imm_mandr_r("insertps", 34, src, dst);
1043      DO_imm_mandr_r("insertps", 35, src, dst);
1044      DO_imm_mandr_r("insertps", 36, src, dst);
1045      DO_imm_mandr_r("insertps", 37, src, dst);
1046      DO_imm_mandr_r("insertps", 38, src, dst);
1047      DO_imm_mandr_r("insertps", 39, src, dst);
1048      DO_imm_mandr_r("insertps", 40, src, dst);
1049      DO_imm_mandr_r("insertps", 41, src, dst);
1050      DO_imm_mandr_r("insertps", 42, src, dst);
1051      DO_imm_mandr_r("insertps", 43, src, dst);
1052      DO_imm_mandr_r("insertps", 44, src, dst);
1053      DO_imm_mandr_r("insertps", 45, src, dst);
1054      DO_imm_mandr_r("insertps", 46, src, dst);
1055      DO_imm_mandr_r("insertps", 47, src, dst);
1056      DO_imm_mandr_r("insertps", 48, src, dst);
1057      DO_imm_mandr_r("insertps", 49, src, dst);
1058      DO_imm_mandr_r("insertps", 50, src, dst);
1059      DO_imm_mandr_r("insertps", 51, src, dst);
1060      DO_imm_mandr_r("insertps", 52, src, dst);
1061      DO_imm_mandr_r("insertps", 53, src, dst);
1062      DO_imm_mandr_r("insertps", 54, src, dst);
1063      DO_imm_mandr_r("insertps", 55, src, dst);
1064      DO_imm_mandr_r("insertps", 56, src, dst);
1065      DO_imm_mandr_r("insertps", 57, src, dst);
1066      DO_imm_mandr_r("insertps", 58, src, dst);
1067      DO_imm_mandr_r("insertps", 59, src, dst);
1068      DO_imm_mandr_r("insertps", 60, src, dst);
1069      DO_imm_mandr_r("insertps", 61, src, dst);
1070      DO_imm_mandr_r("insertps", 62, src, dst);
1071      DO_imm_mandr_r("insertps", 63, src, dst);
1072      DO_imm_mandr_r("insertps", 64, src, dst);
1073      DO_imm_mandr_r("insertps", 65, src, dst);
1074      DO_imm_mandr_r("insertps", 66, src, dst);
1075      DO_imm_mandr_r("insertps", 67, src, dst);
1076      DO_imm_mandr_r("insertps", 68, src, dst);
1077      DO_imm_mandr_r("insertps", 69, src, dst);
1078      DO_imm_mandr_r("insertps", 70, src, dst);
1079      DO_imm_mandr_r("insertps", 71, src, dst);
1080      DO_imm_mandr_r("insertps", 72, src, dst);
1081      DO_imm_mandr_r("insertps", 73, src, dst);
1082      DO_imm_mandr_r("insertps", 74, src, dst);
1083      DO_imm_mandr_r("insertps", 75, src, dst);
1084      DO_imm_mandr_r("insertps", 76, src, dst);
1085      DO_imm_mandr_r("insertps", 77, src, dst);
1086      DO_imm_mandr_r("insertps", 78, src, dst);
1087      DO_imm_mandr_r("insertps", 79, src, dst);
1088      DO_imm_mandr_r("insertps", 80, src, dst);
1089      DO_imm_mandr_r("insertps", 81, src, dst);
1090      DO_imm_mandr_r("insertps", 82, src, dst);
1091      DO_imm_mandr_r("insertps", 83, src, dst);
1092      DO_imm_mandr_r("insertps", 84, src, dst);
1093      DO_imm_mandr_r("insertps", 85, src, dst);
1094      DO_imm_mandr_r("insertps", 86, src, dst);
1095      DO_imm_mandr_r("insertps", 87, src, dst);
1096      DO_imm_mandr_r("insertps", 88, src, dst);
1097      DO_imm_mandr_r("insertps", 89, src, dst);
1098      DO_imm_mandr_r("insertps", 90, src, dst);
1099      DO_imm_mandr_r("insertps", 91, src, dst);
1100      DO_imm_mandr_r("insertps", 92, src, dst);
1101      DO_imm_mandr_r("insertps", 93, src, dst);
1102      DO_imm_mandr_r("insertps", 94, src, dst);
1103      DO_imm_mandr_r("insertps", 95, src, dst);
1104      DO_imm_mandr_r("insertps", 96, src, dst);
1105      DO_imm_mandr_r("insertps", 97, src, dst);
1106      DO_imm_mandr_r("insertps", 98, src, dst);
1107      DO_imm_mandr_r("insertps", 99, src, dst);
1108      DO_imm_mandr_r("insertps", 100, src, dst);
1109      DO_imm_mandr_r("insertps", 101, src, dst);
1110      DO_imm_mandr_r("insertps", 102, src, dst);
1111      DO_imm_mandr_r("insertps", 103, src, dst);
1112      DO_imm_mandr_r("insertps", 104, src, dst);
1113      DO_imm_mandr_r("insertps", 105, src, dst);
1114      DO_imm_mandr_r("insertps", 106, src, dst);
1115      DO_imm_mandr_r("insertps", 107, src, dst);
1116      DO_imm_mandr_r("insertps", 108, src, dst);
1117      DO_imm_mandr_r("insertps", 109, src, dst);
1118      DO_imm_mandr_r("insertps", 110, src, dst);
1119      DO_imm_mandr_r("insertps", 111, src, dst);
1120      DO_imm_mandr_r("insertps", 112, src, dst);
1121      DO_imm_mandr_r("insertps", 113, src, dst);
1122      DO_imm_mandr_r("insertps", 114, src, dst);
1123      DO_imm_mandr_r("insertps", 115, src, dst);
1124      DO_imm_mandr_r("insertps", 116, src, dst);
1125      DO_imm_mandr_r("insertps", 117, src, dst);
1126      DO_imm_mandr_r("insertps", 118, src, dst);
1127      DO_imm_mandr_r("insertps", 119, src, dst);
1128      DO_imm_mandr_r("insertps", 120, src, dst);
1129      DO_imm_mandr_r("insertps", 121, src, dst);
1130      DO_imm_mandr_r("insertps", 122, src, dst);
1131      DO_imm_mandr_r("insertps", 123, src, dst);
1132      DO_imm_mandr_r("insertps", 124, src, dst);
1133      DO_imm_mandr_r("insertps", 125, src, dst);
1134      DO_imm_mandr_r("insertps", 126, src, dst);
1135      DO_imm_mandr_r("insertps", 127, src, dst);
1136      DO_imm_mandr_r("insertps", 128, src, dst);
1137      DO_imm_mandr_r("insertps", 129, src, dst);
1138      DO_imm_mandr_r("insertps", 130, src, dst);
1139      DO_imm_mandr_r("insertps", 131, src, dst);
1140      DO_imm_mandr_r("insertps", 132, src, dst);
1141      DO_imm_mandr_r("insertps", 133, src, dst);
1142      DO_imm_mandr_r("insertps", 134, src, dst);
1143      DO_imm_mandr_r("insertps", 135, src, dst);
1144      DO_imm_mandr_r("insertps", 136, src, dst);
1145      DO_imm_mandr_r("insertps", 137, src, dst);
1146      DO_imm_mandr_r("insertps", 138, src, dst);
1147      DO_imm_mandr_r("insertps", 139, src, dst);
1148      DO_imm_mandr_r("insertps", 140, src, dst);
1149      DO_imm_mandr_r("insertps", 141, src, dst);
1150      DO_imm_mandr_r("insertps", 142, src, dst);
1151      DO_imm_mandr_r("insertps", 143, src, dst);
1152      DO_imm_mandr_r("insertps", 144, src, dst);
1153      DO_imm_mandr_r("insertps", 145, src, dst);
1154      DO_imm_mandr_r("insertps", 146, src, dst);
1155      DO_imm_mandr_r("insertps", 147, src, dst);
1156      DO_imm_mandr_r("insertps", 148, src, dst);
1157      DO_imm_mandr_r("insertps", 149, src, dst);
1158      DO_imm_mandr_r("insertps", 150, src, dst);
1159      DO_imm_mandr_r("insertps", 151, src, dst);
1160      DO_imm_mandr_r("insertps", 152, src, dst);
1161      DO_imm_mandr_r("insertps", 153, src, dst);
1162      DO_imm_mandr_r("insertps", 154, src, dst);
1163      DO_imm_mandr_r("insertps", 155, src, dst);
1164      DO_imm_mandr_r("insertps", 156, src, dst);
1165      DO_imm_mandr_r("insertps", 157, src, dst);
1166      DO_imm_mandr_r("insertps", 158, src, dst);
1167      DO_imm_mandr_r("insertps", 159, src, dst);
1168      DO_imm_mandr_r("insertps", 160, src, dst);
1169      DO_imm_mandr_r("insertps", 161, src, dst);
1170      DO_imm_mandr_r("insertps", 162, src, dst);
1171      DO_imm_mandr_r("insertps", 163, src, dst);
1172      DO_imm_mandr_r("insertps", 164, src, dst);
1173      DO_imm_mandr_r("insertps", 165, src, dst);
1174      DO_imm_mandr_r("insertps", 166, src, dst);
1175      DO_imm_mandr_r("insertps", 167, src, dst);
1176      DO_imm_mandr_r("insertps", 168, src, dst);
1177      DO_imm_mandr_r("insertps", 169, src, dst);
1178      DO_imm_mandr_r("insertps", 170, src, dst);
1179      DO_imm_mandr_r("insertps", 171, src, dst);
1180      DO_imm_mandr_r("insertps", 172, src, dst);
1181      DO_imm_mandr_r("insertps", 173, src, dst);
1182      DO_imm_mandr_r("insertps", 174, src, dst);
1183      DO_imm_mandr_r("insertps", 175, src, dst);
1184      DO_imm_mandr_r("insertps", 176, src, dst);
1185      DO_imm_mandr_r("insertps", 177, src, dst);
1186      DO_imm_mandr_r("insertps", 178, src, dst);
1187      DO_imm_mandr_r("insertps", 179, src, dst);
1188      DO_imm_mandr_r("insertps", 180, src, dst);
1189      DO_imm_mandr_r("insertps", 181, src, dst);
1190      DO_imm_mandr_r("insertps", 182, src, dst);
1191      DO_imm_mandr_r("insertps", 183, src, dst);
1192      DO_imm_mandr_r("insertps", 184, src, dst);
1193      DO_imm_mandr_r("insertps", 185, src, dst);
1194      DO_imm_mandr_r("insertps", 186, src, dst);
1195      DO_imm_mandr_r("insertps", 187, src, dst);
1196      DO_imm_mandr_r("insertps", 188, src, dst);
1197      DO_imm_mandr_r("insertps", 189, src, dst);
1198      DO_imm_mandr_r("insertps", 190, src, dst);
1199      DO_imm_mandr_r("insertps", 191, src, dst);
1200      DO_imm_mandr_r("insertps", 192, src, dst);
1201      DO_imm_mandr_r("insertps", 193, src, dst);
1202      DO_imm_mandr_r("insertps", 194, src, dst);
1203      DO_imm_mandr_r("insertps", 195, src, dst);
1204      DO_imm_mandr_r("insertps", 196, src, dst);
1205      DO_imm_mandr_r("insertps", 197, src, dst);
1206      DO_imm_mandr_r("insertps", 198, src, dst);
1207      DO_imm_mandr_r("insertps", 199, src, dst);
1208      DO_imm_mandr_r("insertps", 200, src, dst);
1209      DO_imm_mandr_r("insertps", 201, src, dst);
1210      DO_imm_mandr_r("insertps", 202, src, dst);
1211      DO_imm_mandr_r("insertps", 203, src, dst);
1212      DO_imm_mandr_r("insertps", 204, src, dst);
1213      DO_imm_mandr_r("insertps", 205, src, dst);
1214      DO_imm_mandr_r("insertps", 206, src, dst);
1215      DO_imm_mandr_r("insertps", 207, src, dst);
1216      DO_imm_mandr_r("insertps", 208, src, dst);
1217      DO_imm_mandr_r("insertps", 209, src, dst);
1218      DO_imm_mandr_r("insertps", 210, src, dst);
1219      DO_imm_mandr_r("insertps", 211, src, dst);
1220      DO_imm_mandr_r("insertps", 212, src, dst);
1221      DO_imm_mandr_r("insertps", 213, src, dst);
1222      DO_imm_mandr_r("insertps", 214, src, dst);
1223      DO_imm_mandr_r("insertps", 215, src, dst);
1224      DO_imm_mandr_r("insertps", 216, src, dst);
1225      DO_imm_mandr_r("insertps", 217, src, dst);
1226      DO_imm_mandr_r("insertps", 218, src, dst);
1227      DO_imm_mandr_r("insertps", 219, src, dst);
1228      DO_imm_mandr_r("insertps", 220, src, dst);
1229      DO_imm_mandr_r("insertps", 221, src, dst);
1230      DO_imm_mandr_r("insertps", 222, src, dst);
1231      DO_imm_mandr_r("insertps", 223, src, dst);
1232      DO_imm_mandr_r("insertps", 224, src, dst);
1233      DO_imm_mandr_r("insertps", 225, src, dst);
1234      DO_imm_mandr_r("insertps", 226, src, dst);
1235      DO_imm_mandr_r("insertps", 227, src, dst);
1236      DO_imm_mandr_r("insertps", 228, src, dst);
1237      DO_imm_mandr_r("insertps", 229, src, dst);
1238      DO_imm_mandr_r("insertps", 230, src, dst);
1239      DO_imm_mandr_r("insertps", 231, src, dst);
1240      DO_imm_mandr_r("insertps", 232, src, dst);
1241      DO_imm_mandr_r("insertps", 233, src, dst);
1242      DO_imm_mandr_r("insertps", 234, src, dst);
1243      DO_imm_mandr_r("insertps", 235, src, dst);
1244      DO_imm_mandr_r("insertps", 236, src, dst);
1245      DO_imm_mandr_r("insertps", 237, src, dst);
1246      DO_imm_mandr_r("insertps", 238, src, dst);
1247      DO_imm_mandr_r("insertps", 239, src, dst);
1248      DO_imm_mandr_r("insertps", 240, src, dst);
1249      DO_imm_mandr_r("insertps", 241, src, dst);
1250      DO_imm_mandr_r("insertps", 242, src, dst);
1251      DO_imm_mandr_r("insertps", 243, src, dst);
1252      DO_imm_mandr_r("insertps", 244, src, dst);
1253      DO_imm_mandr_r("insertps", 245, src, dst);
1254      DO_imm_mandr_r("insertps", 246, src, dst);
1255      DO_imm_mandr_r("insertps", 247, src, dst);
1256      DO_imm_mandr_r("insertps", 248, src, dst);
1257      DO_imm_mandr_r("insertps", 249, src, dst);
1258      DO_imm_mandr_r("insertps", 250, src, dst);
1259      DO_imm_mandr_r("insertps", 251, src, dst);
1260      DO_imm_mandr_r("insertps", 252, src, dst);
1261      DO_imm_mandr_r("insertps", 253, src, dst);
1262      DO_imm_mandr_r("insertps", 254, src, dst);
1263      DO_imm_mandr_r("insertps", 255, src, dst);
1264   }
1265}
1266
1267void test_MPSADBW ( void )
1268{
1269   V128 src, dst;
1270   Int i;
1271   for (i = 0; i < 50; i++) {
1272      randV128(&src);
1273      randV128(&dst);
1274      DO_imm_mandr_r("mpsadbw", 0, src, dst);
1275      DO_imm_mandr_r("mpsadbw", 1, src, dst);
1276      DO_imm_mandr_r("mpsadbw", 2, src, dst);
1277      DO_imm_mandr_r("mpsadbw", 3, src, dst);
1278      DO_imm_mandr_r("mpsadbw", 4, src, dst);
1279      DO_imm_mandr_r("mpsadbw", 5, src, dst);
1280      DO_imm_mandr_r("mpsadbw", 6, src, dst);
1281      DO_imm_mandr_r("mpsadbw", 7, src, dst);
1282   }
1283}
1284
1285void test_PACKUSDW ( void )
1286{
1287   V128 src, dst;
1288   Int i;
1289   for (i = 0; i < 10; i++) {
1290      if (i < 9) {
1291         randV128(&src);
1292         randV128(&dst);
1293      } else {
1294         memset(&src, 0, sizeof(src));
1295         memset(&dst, 0, sizeof(src));
1296         src[0] = 0x11; src[1] = 0x22;
1297         src[4] = 0x33; src[5] = 0x44;
1298         src[8] = 0x55; src[9] = 0x66;
1299         src[12] = 0x77; src[13] = 0x88;
1300         dst[0] = 0xaa; dst[1] = 0xbb;
1301         dst[4] = 0xcc; dst[5] = 0xdd;
1302         dst[8] = 0xee; dst[9] = 0xff;
1303         dst[12] = 0xa1; dst[13] = 0xb2;
1304      }
1305      DO_mandr_r("packusdw", src, dst);
1306   }
1307}
1308
1309void test_PBLENDW ( void )
1310{
1311   V128 src, dst;
1312   randV128(&src);
1313   randV128(&dst);
1314   {
1315      DO_imm_mandr_r("pblendw", 0, src, dst);
1316      DO_imm_mandr_r("pblendw", 1, src, dst);
1317      DO_imm_mandr_r("pblendw", 2, src, dst);
1318      DO_imm_mandr_r("pblendw", 3, src, dst);
1319      DO_imm_mandr_r("pblendw", 4, src, dst);
1320      DO_imm_mandr_r("pblendw", 5, src, dst);
1321      DO_imm_mandr_r("pblendw", 6, src, dst);
1322      DO_imm_mandr_r("pblendw", 7, src, dst);
1323      DO_imm_mandr_r("pblendw", 8, src, dst);
1324      DO_imm_mandr_r("pblendw", 9, src, dst);
1325      DO_imm_mandr_r("pblendw", 10, src, dst);
1326      DO_imm_mandr_r("pblendw", 11, src, dst);
1327      DO_imm_mandr_r("pblendw", 12, src, dst);
1328      DO_imm_mandr_r("pblendw", 13, src, dst);
1329      DO_imm_mandr_r("pblendw", 14, src, dst);
1330      DO_imm_mandr_r("pblendw", 15, src, dst);
1331      DO_imm_mandr_r("pblendw", 16, src, dst);
1332      DO_imm_mandr_r("pblendw", 17, src, dst);
1333      DO_imm_mandr_r("pblendw", 18, src, dst);
1334      DO_imm_mandr_r("pblendw", 19, src, dst);
1335      DO_imm_mandr_r("pblendw", 20, src, dst);
1336      DO_imm_mandr_r("pblendw", 21, src, dst);
1337      DO_imm_mandr_r("pblendw", 22, src, dst);
1338      DO_imm_mandr_r("pblendw", 23, src, dst);
1339      DO_imm_mandr_r("pblendw", 24, src, dst);
1340      DO_imm_mandr_r("pblendw", 25, src, dst);
1341      DO_imm_mandr_r("pblendw", 26, src, dst);
1342      DO_imm_mandr_r("pblendw", 27, src, dst);
1343      DO_imm_mandr_r("pblendw", 28, src, dst);
1344      DO_imm_mandr_r("pblendw", 29, src, dst);
1345      DO_imm_mandr_r("pblendw", 30, src, dst);
1346      DO_imm_mandr_r("pblendw", 31, src, dst);
1347      DO_imm_mandr_r("pblendw", 32, src, dst);
1348      DO_imm_mandr_r("pblendw", 33, src, dst);
1349      DO_imm_mandr_r("pblendw", 34, src, dst);
1350      DO_imm_mandr_r("pblendw", 35, src, dst);
1351      DO_imm_mandr_r("pblendw", 36, src, dst);
1352      DO_imm_mandr_r("pblendw", 37, src, dst);
1353      DO_imm_mandr_r("pblendw", 38, src, dst);
1354      DO_imm_mandr_r("pblendw", 39, src, dst);
1355      DO_imm_mandr_r("pblendw", 40, src, dst);
1356      DO_imm_mandr_r("pblendw", 41, src, dst);
1357      DO_imm_mandr_r("pblendw", 42, src, dst);
1358      DO_imm_mandr_r("pblendw", 43, src, dst);
1359      DO_imm_mandr_r("pblendw", 44, src, dst);
1360      DO_imm_mandr_r("pblendw", 45, src, dst);
1361      DO_imm_mandr_r("pblendw", 46, src, dst);
1362      DO_imm_mandr_r("pblendw", 47, src, dst);
1363      DO_imm_mandr_r("pblendw", 48, src, dst);
1364      DO_imm_mandr_r("pblendw", 49, src, dst);
1365      DO_imm_mandr_r("pblendw", 50, src, dst);
1366      DO_imm_mandr_r("pblendw", 51, src, dst);
1367      DO_imm_mandr_r("pblendw", 52, src, dst);
1368      DO_imm_mandr_r("pblendw", 53, src, dst);
1369      DO_imm_mandr_r("pblendw", 54, src, dst);
1370      DO_imm_mandr_r("pblendw", 55, src, dst);
1371      DO_imm_mandr_r("pblendw", 56, src, dst);
1372      DO_imm_mandr_r("pblendw", 57, src, dst);
1373      DO_imm_mandr_r("pblendw", 58, src, dst);
1374      DO_imm_mandr_r("pblendw", 59, src, dst);
1375      DO_imm_mandr_r("pblendw", 60, src, dst);
1376      DO_imm_mandr_r("pblendw", 61, src, dst);
1377      DO_imm_mandr_r("pblendw", 62, src, dst);
1378      DO_imm_mandr_r("pblendw", 63, src, dst);
1379      DO_imm_mandr_r("pblendw", 64, src, dst);
1380      DO_imm_mandr_r("pblendw", 65, src, dst);
1381      DO_imm_mandr_r("pblendw", 66, src, dst);
1382      DO_imm_mandr_r("pblendw", 67, src, dst);
1383      DO_imm_mandr_r("pblendw", 68, src, dst);
1384      DO_imm_mandr_r("pblendw", 69, src, dst);
1385      DO_imm_mandr_r("pblendw", 70, src, dst);
1386      DO_imm_mandr_r("pblendw", 71, src, dst);
1387      DO_imm_mandr_r("pblendw", 72, src, dst);
1388      DO_imm_mandr_r("pblendw", 73, src, dst);
1389      DO_imm_mandr_r("pblendw", 74, src, dst);
1390      DO_imm_mandr_r("pblendw", 75, src, dst);
1391      DO_imm_mandr_r("pblendw", 76, src, dst);
1392      DO_imm_mandr_r("pblendw", 77, src, dst);
1393      DO_imm_mandr_r("pblendw", 78, src, dst);
1394      DO_imm_mandr_r("pblendw", 79, src, dst);
1395      DO_imm_mandr_r("pblendw", 80, src, dst);
1396      DO_imm_mandr_r("pblendw", 81, src, dst);
1397      DO_imm_mandr_r("pblendw", 82, src, dst);
1398      DO_imm_mandr_r("pblendw", 83, src, dst);
1399      DO_imm_mandr_r("pblendw", 84, src, dst);
1400      DO_imm_mandr_r("pblendw", 85, src, dst);
1401      DO_imm_mandr_r("pblendw", 86, src, dst);
1402      DO_imm_mandr_r("pblendw", 87, src, dst);
1403      DO_imm_mandr_r("pblendw", 88, src, dst);
1404      DO_imm_mandr_r("pblendw", 89, src, dst);
1405      DO_imm_mandr_r("pblendw", 90, src, dst);
1406      DO_imm_mandr_r("pblendw", 91, src, dst);
1407      DO_imm_mandr_r("pblendw", 92, src, dst);
1408      DO_imm_mandr_r("pblendw", 93, src, dst);
1409      DO_imm_mandr_r("pblendw", 94, src, dst);
1410      DO_imm_mandr_r("pblendw", 95, src, dst);
1411      DO_imm_mandr_r("pblendw", 96, src, dst);
1412      DO_imm_mandr_r("pblendw", 97, src, dst);
1413      DO_imm_mandr_r("pblendw", 98, src, dst);
1414      DO_imm_mandr_r("pblendw", 99, src, dst);
1415      DO_imm_mandr_r("pblendw", 100, src, dst);
1416      DO_imm_mandr_r("pblendw", 101, src, dst);
1417      DO_imm_mandr_r("pblendw", 102, src, dst);
1418      DO_imm_mandr_r("pblendw", 103, src, dst);
1419      DO_imm_mandr_r("pblendw", 104, src, dst);
1420      DO_imm_mandr_r("pblendw", 105, src, dst);
1421      DO_imm_mandr_r("pblendw", 106, src, dst);
1422      DO_imm_mandr_r("pblendw", 107, src, dst);
1423      DO_imm_mandr_r("pblendw", 108, src, dst);
1424      DO_imm_mandr_r("pblendw", 109, src, dst);
1425      DO_imm_mandr_r("pblendw", 110, src, dst);
1426      DO_imm_mandr_r("pblendw", 111, src, dst);
1427      DO_imm_mandr_r("pblendw", 112, src, dst);
1428      DO_imm_mandr_r("pblendw", 113, src, dst);
1429      DO_imm_mandr_r("pblendw", 114, src, dst);
1430      DO_imm_mandr_r("pblendw", 115, src, dst);
1431      DO_imm_mandr_r("pblendw", 116, src, dst);
1432      DO_imm_mandr_r("pblendw", 117, src, dst);
1433      DO_imm_mandr_r("pblendw", 118, src, dst);
1434      DO_imm_mandr_r("pblendw", 119, src, dst);
1435      DO_imm_mandr_r("pblendw", 120, src, dst);
1436      DO_imm_mandr_r("pblendw", 121, src, dst);
1437      DO_imm_mandr_r("pblendw", 122, src, dst);
1438      DO_imm_mandr_r("pblendw", 123, src, dst);
1439      DO_imm_mandr_r("pblendw", 124, src, dst);
1440      DO_imm_mandr_r("pblendw", 125, src, dst);
1441      DO_imm_mandr_r("pblendw", 126, src, dst);
1442      DO_imm_mandr_r("pblendw", 127, src, dst);
1443      DO_imm_mandr_r("pblendw", 128, src, dst);
1444      DO_imm_mandr_r("pblendw", 129, src, dst);
1445      DO_imm_mandr_r("pblendw", 130, src, dst);
1446      DO_imm_mandr_r("pblendw", 131, src, dst);
1447      DO_imm_mandr_r("pblendw", 132, src, dst);
1448      DO_imm_mandr_r("pblendw", 133, src, dst);
1449      DO_imm_mandr_r("pblendw", 134, src, dst);
1450      DO_imm_mandr_r("pblendw", 135, src, dst);
1451      DO_imm_mandr_r("pblendw", 136, src, dst);
1452      DO_imm_mandr_r("pblendw", 137, src, dst);
1453      DO_imm_mandr_r("pblendw", 138, src, dst);
1454      DO_imm_mandr_r("pblendw", 139, src, dst);
1455      DO_imm_mandr_r("pblendw", 140, src, dst);
1456      DO_imm_mandr_r("pblendw", 141, src, dst);
1457      DO_imm_mandr_r("pblendw", 142, src, dst);
1458      DO_imm_mandr_r("pblendw", 143, src, dst);
1459      DO_imm_mandr_r("pblendw", 144, src, dst);
1460      DO_imm_mandr_r("pblendw", 145, src, dst);
1461      DO_imm_mandr_r("pblendw", 146, src, dst);
1462      DO_imm_mandr_r("pblendw", 147, src, dst);
1463      DO_imm_mandr_r("pblendw", 148, src, dst);
1464      DO_imm_mandr_r("pblendw", 149, src, dst);
1465      DO_imm_mandr_r("pblendw", 150, src, dst);
1466      DO_imm_mandr_r("pblendw", 151, src, dst);
1467      DO_imm_mandr_r("pblendw", 152, src, dst);
1468      DO_imm_mandr_r("pblendw", 153, src, dst);
1469      DO_imm_mandr_r("pblendw", 154, src, dst);
1470      DO_imm_mandr_r("pblendw", 155, src, dst);
1471      DO_imm_mandr_r("pblendw", 156, src, dst);
1472      DO_imm_mandr_r("pblendw", 157, src, dst);
1473      DO_imm_mandr_r("pblendw", 158, src, dst);
1474      DO_imm_mandr_r("pblendw", 159, src, dst);
1475      DO_imm_mandr_r("pblendw", 160, src, dst);
1476      DO_imm_mandr_r("pblendw", 161, src, dst);
1477      DO_imm_mandr_r("pblendw", 162, src, dst);
1478      DO_imm_mandr_r("pblendw", 163, src, dst);
1479      DO_imm_mandr_r("pblendw", 164, src, dst);
1480      DO_imm_mandr_r("pblendw", 165, src, dst);
1481      DO_imm_mandr_r("pblendw", 166, src, dst);
1482      DO_imm_mandr_r("pblendw", 167, src, dst);
1483      DO_imm_mandr_r("pblendw", 168, src, dst);
1484      DO_imm_mandr_r("pblendw", 169, src, dst);
1485      DO_imm_mandr_r("pblendw", 170, src, dst);
1486      DO_imm_mandr_r("pblendw", 171, src, dst);
1487      DO_imm_mandr_r("pblendw", 172, src, dst);
1488      DO_imm_mandr_r("pblendw", 173, src, dst);
1489      DO_imm_mandr_r("pblendw", 174, src, dst);
1490      DO_imm_mandr_r("pblendw", 175, src, dst);
1491      DO_imm_mandr_r("pblendw", 176, src, dst);
1492      DO_imm_mandr_r("pblendw", 177, src, dst);
1493      DO_imm_mandr_r("pblendw", 178, src, dst);
1494      DO_imm_mandr_r("pblendw", 179, src, dst);
1495      DO_imm_mandr_r("pblendw", 180, src, dst);
1496      DO_imm_mandr_r("pblendw", 181, src, dst);
1497      DO_imm_mandr_r("pblendw", 182, src, dst);
1498      DO_imm_mandr_r("pblendw", 183, src, dst);
1499      DO_imm_mandr_r("pblendw", 184, src, dst);
1500      DO_imm_mandr_r("pblendw", 185, src, dst);
1501      DO_imm_mandr_r("pblendw", 186, src, dst);
1502      DO_imm_mandr_r("pblendw", 187, src, dst);
1503      DO_imm_mandr_r("pblendw", 188, src, dst);
1504      DO_imm_mandr_r("pblendw", 189, src, dst);
1505      DO_imm_mandr_r("pblendw", 190, src, dst);
1506      DO_imm_mandr_r("pblendw", 191, src, dst);
1507      DO_imm_mandr_r("pblendw", 192, src, dst);
1508      DO_imm_mandr_r("pblendw", 193, src, dst);
1509      DO_imm_mandr_r("pblendw", 194, src, dst);
1510      DO_imm_mandr_r("pblendw", 195, src, dst);
1511      DO_imm_mandr_r("pblendw", 196, src, dst);
1512      DO_imm_mandr_r("pblendw", 197, src, dst);
1513      DO_imm_mandr_r("pblendw", 198, src, dst);
1514      DO_imm_mandr_r("pblendw", 199, src, dst);
1515      DO_imm_mandr_r("pblendw", 200, src, dst);
1516      DO_imm_mandr_r("pblendw", 201, src, dst);
1517      DO_imm_mandr_r("pblendw", 202, src, dst);
1518      DO_imm_mandr_r("pblendw", 203, src, dst);
1519      DO_imm_mandr_r("pblendw", 204, src, dst);
1520      DO_imm_mandr_r("pblendw", 205, src, dst);
1521      DO_imm_mandr_r("pblendw", 206, src, dst);
1522      DO_imm_mandr_r("pblendw", 207, src, dst);
1523      DO_imm_mandr_r("pblendw", 208, src, dst);
1524      DO_imm_mandr_r("pblendw", 209, src, dst);
1525      DO_imm_mandr_r("pblendw", 210, src, dst);
1526      DO_imm_mandr_r("pblendw", 211, src, dst);
1527      DO_imm_mandr_r("pblendw", 212, src, dst);
1528      DO_imm_mandr_r("pblendw", 213, src, dst);
1529      DO_imm_mandr_r("pblendw", 214, src, dst);
1530      DO_imm_mandr_r("pblendw", 215, src, dst);
1531      DO_imm_mandr_r("pblendw", 216, src, dst);
1532      DO_imm_mandr_r("pblendw", 217, src, dst);
1533      DO_imm_mandr_r("pblendw", 218, src, dst);
1534      DO_imm_mandr_r("pblendw", 219, src, dst);
1535      DO_imm_mandr_r("pblendw", 220, src, dst);
1536      DO_imm_mandr_r("pblendw", 221, src, dst);
1537      DO_imm_mandr_r("pblendw", 222, src, dst);
1538      DO_imm_mandr_r("pblendw", 223, src, dst);
1539      DO_imm_mandr_r("pblendw", 224, src, dst);
1540      DO_imm_mandr_r("pblendw", 225, src, dst);
1541      DO_imm_mandr_r("pblendw", 226, src, dst);
1542      DO_imm_mandr_r("pblendw", 227, src, dst);
1543      DO_imm_mandr_r("pblendw", 228, src, dst);
1544      DO_imm_mandr_r("pblendw", 229, src, dst);
1545      DO_imm_mandr_r("pblendw", 230, src, dst);
1546      DO_imm_mandr_r("pblendw", 231, src, dst);
1547      DO_imm_mandr_r("pblendw", 232, src, dst);
1548      DO_imm_mandr_r("pblendw", 233, src, dst);
1549      DO_imm_mandr_r("pblendw", 234, src, dst);
1550      DO_imm_mandr_r("pblendw", 235, src, dst);
1551      DO_imm_mandr_r("pblendw", 236, src, dst);
1552      DO_imm_mandr_r("pblendw", 237, src, dst);
1553      DO_imm_mandr_r("pblendw", 238, src, dst);
1554      DO_imm_mandr_r("pblendw", 239, src, dst);
1555      DO_imm_mandr_r("pblendw", 240, src, dst);
1556      DO_imm_mandr_r("pblendw", 241, src, dst);
1557      DO_imm_mandr_r("pblendw", 242, src, dst);
1558      DO_imm_mandr_r("pblendw", 243, src, dst);
1559      DO_imm_mandr_r("pblendw", 244, src, dst);
1560      DO_imm_mandr_r("pblendw", 245, src, dst);
1561      DO_imm_mandr_r("pblendw", 246, src, dst);
1562      DO_imm_mandr_r("pblendw", 247, src, dst);
1563      DO_imm_mandr_r("pblendw", 248, src, dst);
1564      DO_imm_mandr_r("pblendw", 249, src, dst);
1565      DO_imm_mandr_r("pblendw", 250, src, dst);
1566      DO_imm_mandr_r("pblendw", 251, src, dst);
1567      DO_imm_mandr_r("pblendw", 252, src, dst);
1568      DO_imm_mandr_r("pblendw", 253, src, dst);
1569      DO_imm_mandr_r("pblendw", 254, src, dst);
1570      DO_imm_mandr_r("pblendw", 255, src, dst);
1571   }
1572}
1573
1574
1575void test_PCMPEQQ ( void )
1576{
1577   V128 src, dst;
1578   Int i;
1579   for (i = 0; i < 10; i++) {
1580      randV128(&src);
1581      randV128(&dst);
1582      switch (i - 6) {
1583         case 0: memset(&src[0], 0x55, 8);
1584                 memset(&dst[0], 0x55, 8); break;
1585         case 1: memset(&src[8], 0x55, 8);
1586                 memset(&dst[8], 0x55, 8); break;
1587         default:
1588            break;
1589      }
1590      DO_mandr_r("pcmpeqq", src, dst);
1591   }
1592}
1593
1594
1595void test_PEXTRB ( void )
1596{
1597   V128 src;
1598   randV128(&src);
1599   DO_imm_r_to_mandrscalar("pextrb", 0, src, "d");
1600   DO_imm_r_to_mandrscalar("pextrb", 1, src, "d");
1601   DO_imm_r_to_mandrscalar("pextrb", 2, src, "d");
1602   DO_imm_r_to_mandrscalar("pextrb", 3, src, "d");
1603   DO_imm_r_to_mandrscalar("pextrb", 4, src, "d");
1604   DO_imm_r_to_mandrscalar("pextrb", 5, src, "d");
1605   DO_imm_r_to_mandrscalar("pextrb", 6, src, "d");
1606   DO_imm_r_to_mandrscalar("pextrb", 7, src, "d");
1607   DO_imm_r_to_mandrscalar("pextrb", 8, src, "d");
1608   DO_imm_r_to_mandrscalar("pextrb", 9, src, "d");
1609   DO_imm_r_to_mandrscalar("pextrb", 10, src, "d");
1610   DO_imm_r_to_mandrscalar("pextrb", 11, src, "d");
1611   DO_imm_r_to_mandrscalar("pextrb", 12, src, "d");
1612   DO_imm_r_to_mandrscalar("pextrb", 13, src, "d");
1613   DO_imm_r_to_mandrscalar("pextrb", 14, src, "d");
1614   DO_imm_r_to_mandrscalar("pextrb", 15, src, "d");
1615}
1616
1617void test_PINSRB ( void )
1618{
1619   ULong src;
1620   src = randULong();
1621   DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d");
1622   src = randULong();
1623   DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d");
1624   src = randULong();
1625   DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d");
1626   src = randULong();
1627   DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d");
1628   src = randULong();
1629   DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d");
1630   src = randULong();
1631   DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d");
1632   src = randULong();
1633   DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d");
1634   src = randULong();
1635   DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d");
1636   src = randULong();
1637   DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d");
1638   src = randULong();
1639   DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d");
1640   src = randULong();
1641   DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d");
1642   src = randULong();
1643   DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d");
1644   src = randULong();
1645   DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d");
1646   src = randULong();
1647   DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d");
1648   src = randULong();
1649   DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d");
1650   src = randULong();
1651   DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d");
1652}
1653
1654
1655void test_PEXTRW ( void )
1656{
1657   V128 src;
1658   randV128(&src);
1659   DO_imm_r_to_mandrscalar("pextrw", 0, src, "d");
1660   DO_imm_r_to_mandrscalar("pextrw", 1, src, "d");
1661   DO_imm_r_to_mandrscalar("pextrw", 2, src, "d");
1662   DO_imm_r_to_mandrscalar("pextrw", 3, src, "d");
1663   DO_imm_r_to_mandrscalar("pextrw", 4, src, "d");
1664   DO_imm_r_to_mandrscalar("pextrw", 5, src, "d");
1665   DO_imm_r_to_mandrscalar("pextrw", 6, src, "d");
1666   DO_imm_r_to_mandrscalar("pextrw", 7, src, "d");
1667}
1668
1669void test_PINSRW ( void )
1670{
1671   ULong src;
1672   src = randULong();
1673   DO_imm_mandrscalar_to_r("pinsrw", 0, src, "d");
1674   src = randULong();
1675   DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d");
1676   src = randULong();
1677   DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d");
1678   src = randULong();
1679   DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d");
1680   src = randULong();
1681   DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d");
1682   src = randULong();
1683   DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d");
1684   src = randULong();
1685   DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d");
1686   src = randULong();
1687   DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d");
1688}
1689
1690
1691void test_PEXTRD ( void )
1692{
1693   V128 src;
1694   randV128(&src);
1695   DO_imm_r_to_mandrscalar("pextrd", 0, src, "d");
1696   DO_imm_r_to_mandrscalar("pextrd", 1, src, "d");
1697   DO_imm_r_to_mandrscalar("pextrd", 2, src, "d");
1698   DO_imm_r_to_mandrscalar("pextrd", 3, src, "d");
1699}
1700
1701void test_PINSRD ( void )
1702{
1703   ULong src;
1704   src = randULong();
1705   DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d");
1706   src = randULong();
1707   DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d");
1708   src = randULong();
1709   DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d");
1710   src = randULong();
1711   DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d");
1712}
1713
1714
1715void test_PEXTRQ ( void )
1716{
1717   V128 src;
1718   randV128(&src);
1719   DO_imm_r_to_mandrscalar("pextrq", 0, src, "");
1720   DO_imm_r_to_mandrscalar("pextrq", 1, src, "");
1721}
1722
1723void test_PINSRQ ( void )
1724{
1725   ULong src;
1726   src = randULong();
1727   DO_imm_mandrscalar_to_r("pinsrq", 0, src, "");
1728   src = randULong();
1729   DO_imm_mandrscalar_to_r("pinsrq", 1, src, "");
1730}
1731
1732
1733void test_EXTRACTPS ( void )
1734{
1735   V128 src;
1736   randV128(&src);
1737   DO_imm_r_to_mandrscalar("extractps", 0, src, "d");
1738   DO_imm_r_to_mandrscalar("extractps", 1, src, "d");
1739   DO_imm_r_to_mandrscalar("extractps", 2, src, "d");
1740   DO_imm_r_to_mandrscalar("extractps", 3, src, "d");
1741}
1742
1743
1744void test_PHMINPOSUW ( void )
1745{
1746   V128 src, dst;
1747   Int i;
1748   for (i = 0; i < 20; i++) {
1749      randV128(&src);
1750      randV128(&dst);
1751      DO_mandr_r("phminposuw", src, dst);
1752   }
1753   memset(src, 0x55, sizeof(src));
1754   memset(dst, 0xAA, sizeof(dst));
1755   DO_mandr_r("phminposuw", src, dst);
1756}
1757
1758void test_PMAXSB ( void )
1759{
1760   V128 src, dst;
1761   Int i;
1762   for (i = 0; i < 10; i++) {
1763      randV128(&src);
1764      randV128(&dst);
1765      DO_mandr_r("pmaxsb", src, dst);
1766   }
1767}
1768
1769void test_PMAXSD ( void )
1770{
1771   V128 src, dst;
1772   Int i;
1773   for (i = 0; i < 10; i++) {
1774      randV128(&src);
1775      randV128(&dst);
1776      DO_mandr_r("pmaxsd", src, dst);
1777   }
1778}
1779
1780void test_PMAXUD ( void )
1781{
1782   V128 src, dst;
1783   Int i;
1784   for (i = 0; i < 10; i++) {
1785      randV128(&src);
1786      randV128(&dst);
1787      DO_mandr_r("pmaxud", src, dst);
1788   }
1789}
1790
1791void test_PMAXUW ( void )
1792{
1793   V128 src, dst;
1794   Int i;
1795   for (i = 0; i < 10; i++) {
1796      randV128(&src);
1797      randV128(&dst);
1798      DO_mandr_r("pmaxuw", src, dst);
1799   }
1800}
1801
1802void test_PMINSB ( void )
1803{
1804   V128 src, dst;
1805   Int i;
1806   for (i = 0; i < 10; i++) {
1807      randV128(&src);
1808      randV128(&dst);
1809      DO_mandr_r("pminsb", src, dst);
1810   }
1811}
1812
1813void test_PMINSD ( void )
1814{
1815   V128 src, dst;
1816   Int i;
1817   for (i = 0; i < 10; i++) {
1818      randV128(&src);
1819      randV128(&dst);
1820      DO_mandr_r("pminsd", src, dst);
1821   }
1822}
1823
1824void test_PMINUD ( void )
1825{
1826   V128 src, dst;
1827   Int i;
1828   for (i = 0; i < 10; i++) {
1829      randV128(&src);
1830      randV128(&dst);
1831      DO_mandr_r("pminud", src, dst);
1832   }
1833}
1834
1835void test_PMINUW ( void )
1836{
1837   V128 src, dst;
1838   Int i;
1839   for (i = 0; i < 10; i++) {
1840      randV128(&src);
1841      randV128(&dst);
1842      DO_mandr_r("pminuw", src, dst);
1843   }
1844}
1845
1846void test_PMOVSXBW ( void )
1847{
1848   V128 src, dst;
1849   Int i;
1850   for (i = 0; i < 10; i++) {
1851      randV128(&src);
1852      randV128(&dst);
1853      DO_mandr_r("pmovsxbw", src, dst);
1854   }
1855}
1856
1857void test_PMOVSXBD ( void )
1858{
1859   V128 src, dst;
1860   Int i;
1861   for (i = 0; i < 10; i++) {
1862      randV128(&src);
1863      randV128(&dst);
1864      DO_mandr_r("pmovsxbd", src, dst);
1865   }
1866}
1867
1868void test_PMOVSXBQ ( void )
1869{
1870   V128 src, dst;
1871   Int i;
1872   for (i = 0; i < 10; i++) {
1873      randV128(&src);
1874      randV128(&dst);
1875      DO_mandr_r("pmovsxbq", src, dst);
1876   }
1877}
1878
1879void test_PMOVSXWD ( void )
1880{
1881   V128 src, dst;
1882   Int i;
1883   for (i = 0; i < 10; i++) {
1884      randV128(&src);
1885      randV128(&dst);
1886      DO_mandr_r("pmovsxwd", src, dst);
1887   }
1888}
1889
1890void test_PMOVSXWQ ( void )
1891{
1892   V128 src, dst;
1893   Int i;
1894   for (i = 0; i < 10; i++) {
1895      randV128(&src);
1896      randV128(&dst);
1897      DO_mandr_r("pmovsxwq", src, dst);
1898   }
1899}
1900
1901void test_PMOVSXDQ ( void )
1902{
1903   V128 src, dst;
1904   Int i;
1905   for (i = 0; i < 10; i++) {
1906      randV128(&src);
1907      randV128(&dst);
1908      DO_mandr_r("pmovsxdq", src, dst);
1909   }
1910}
1911
1912void test_PMOVZXBW ( void )
1913{
1914   V128 src, dst;
1915   Int i;
1916   for (i = 0; i < 10; i++) {
1917      randV128(&src);
1918      randV128(&dst);
1919      DO_mandr_r("pmovzxbw", src, dst);
1920   }
1921}
1922
1923void test_PMOVZXBD ( void )
1924{
1925   V128 src, dst;
1926   Int i;
1927   for (i = 0; i < 10; i++) {
1928      randV128(&src);
1929      randV128(&dst);
1930      DO_mandr_r("pmovzxbd", src, dst);
1931   }
1932}
1933
1934void test_PMOVZXBQ ( void )
1935{
1936   V128 src, dst;
1937   Int i;
1938   for (i = 0; i < 10; i++) {
1939      randV128(&src);
1940      randV128(&dst);
1941      DO_mandr_r("pmovzxbq", src, dst);
1942   }
1943}
1944
1945void test_PMOVZXWD ( void )
1946{
1947   V128 src, dst;
1948   Int i;
1949   for (i = 0; i < 10; i++) {
1950      randV128(&src);
1951      randV128(&dst);
1952      DO_mandr_r("pmovzxwd", src, dst);
1953   }
1954}
1955
1956void test_PMOVZXWQ ( void )
1957{
1958   V128 src, dst;
1959   Int i;
1960   for (i = 0; i < 10; i++) {
1961      randV128(&src);
1962      randV128(&dst);
1963      DO_mandr_r("pmovzxwq", src, dst);
1964   }
1965}
1966
1967void test_PMOVZXDQ ( void )
1968{
1969   V128 src, dst;
1970   Int i;
1971   for (i = 0; i < 10; i++) {
1972      randV128(&src);
1973      randV128(&dst);
1974      DO_mandr_r("pmovzxdq", src, dst);
1975   }
1976}
1977
1978void test_PMULDQ ( void )
1979{
1980   V128 src, dst;
1981   Int i;
1982   for (i = 0; i < 10; i++) {
1983      randV128(&src);
1984      randV128(&dst);
1985      DO_mandr_r("pmuldq", src, dst);
1986   }
1987}
1988
1989
1990void test_PMULLD ( void )
1991{
1992   V128 src, dst;
1993   Int i;
1994   for (i = 0; i < 10; i++) {
1995      randV128(&src);
1996      randV128(&dst);
1997      DO_mandr_r("pmulld", src, dst);
1998   }
1999}
2000
2001
2002void test_POPCNTQ ( void )
2003{
2004   ULong block[4];
2005   Int i;
2006   ULong oszacp_mask = 0x8D5;
2007   for (i = 0; i < 10; i++) {
2008      block[0] = i == 0 ? 0 : randULong();
2009      block[1] = randULong();
2010      block[2] = randULong();
2011      block[3] = randULong();
2012      __asm__ __volatile__(
2013         "movq %0,       %%rax"  "\n\t"
2014         "movq 0(%%rax), %%rdi"  "\n\t"
2015         "movq 8(%%rax), %%r11"  "\n\t"
2016#ifndef VGP_amd64_darwin
2017         "popcntq %%rdi, %%r11"  "\n\t"
2018#else
2019         "popcnt  %%rdi, %%r11"  "\n\t"
2020#endif
2021         "movq %%r11, 16(%%rax)"  "\n\t"
2022         "pushfq"                 "\n\t"
2023         "popq %%r12"             "\n\t"
2024         "movq %%r12, 24(%%rax)"  "\n"
2025         : /*out*/
2026         : /*in*/"r"(&block[0])
2027         : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2028      );
2029      printf("r popcntq  %016llx %016llx  %016llx %016llx\n",
2030             block[0], block[1], block[2], block[3] & oszacp_mask);
2031
2032      block[0] = i == 0 ? 0 : randULong();
2033      block[1] = randULong();
2034      block[2] = randULong();
2035      block[3] = randULong();
2036      __asm__ __volatile__(
2037         "movq %0,       %%rax"  "\n\t"
2038         "movq 8(%%rax), %%r11"  "\n\t"
2039#ifndef VGP_amd64_darwin
2040         "popcntq 0(%%rax), %%r11"  "\n\t"
2041#else
2042         "popcnt  0(%%rax), %%r11"  "\n\t"
2043#endif
2044         "movq %%r11, 16(%%rax)"  "\n\t"
2045         "pushfq"                 "\n\t"
2046         "popq %%r12"             "\n\t"
2047         "movq %%r12, 24(%%rax)"  "\n"
2048         : /*out*/
2049         : /*in*/"r"(&block[0])
2050         : /*trash*/ "cc", "memory", "r11", "r12"
2051      );
2052      printf("m popcntq  %016llx %016llx  %016llx %016llx\n",
2053             block[0], block[1], block[2], block[3] & oszacp_mask);
2054   }
2055}
2056
2057
2058void test_POPCNTL ( void )
2059{
2060   ULong block[4];
2061   Int i;
2062   ULong oszacp_mask = 0x8D5;
2063   for (i = 0; i < 10; i++) {
2064      block[0] = i == 0 ? 0 : randULong();
2065      block[1] = randULong();
2066      block[2] = randULong();
2067      block[3] = randULong();
2068      __asm__ __volatile__(
2069         "movq %0,       %%rax"  "\n\t"
2070         "movq 0(%%rax), %%rdi"  "\n\t"
2071         "movq 8(%%rax), %%r11"  "\n\t"
2072#ifndef VGP_amd64_darwin
2073         "popcntl %%edi, %%r11d"  "\n\t"
2074#else
2075         "popcnt  %%edi, %%r11d"  "\n\t"
2076#endif
2077         "movq %%r11, 16(%%rax)"  "\n\t"
2078         "pushfq"                 "\n\t"
2079         "popq %%r12"             "\n\t"
2080         "movq %%r12, 24(%%rax)"  "\n"
2081         : /*out*/
2082         : /*in*/"r"(&block[0])
2083         : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2084      );
2085      printf("r popcntl  %016llx %016llx  %016llx %016llx\n",
2086             block[0], block[1], block[2], block[3] & oszacp_mask);
2087
2088      block[0] = i == 0 ? 0 : randULong();
2089      block[1] = randULong();
2090      block[2] = randULong();
2091      block[3] = randULong();
2092      __asm__ __volatile__(
2093         "movq %0,       %%rax"  "\n\t"
2094         "movq 8(%%rax), %%r11"  "\n\t"
2095#ifndef VGP_amd64_darwin
2096         "popcntl 0(%%rax), %%r11d"  "\n\t"
2097#else
2098         "popcnt  0(%%rax), %%r11d"  "\n\t"
2099#endif
2100         "movq %%r11, 16(%%rax)"  "\n\t"
2101         "pushfq"                 "\n\t"
2102         "popq %%r12"             "\n\t"
2103         "movq %%r12, 24(%%rax)"  "\n"
2104         : /*out*/
2105         : /*in*/"r"(&block[0])
2106         : /*trash*/ "cc", "memory", "r11", "r12"
2107      );
2108      printf("m popcntl  %016llx %016llx  %016llx %016llx\n",
2109             block[0], block[1], block[2], block[3] & oszacp_mask);
2110   }
2111}
2112
2113
2114void test_POPCNTW ( void )
2115{
2116   ULong block[4];
2117   Int i;
2118   ULong oszacp_mask = 0x8D5;
2119   for (i = 0; i < 10; i++) {
2120      block[0] = i == 0 ? 0 : randULong();
2121      block[1] = randULong();
2122      block[2] = randULong();
2123      block[3] = randULong();
2124      __asm__ __volatile__(
2125         "movq %0,       %%rax"  "\n\t"
2126         "movq 0(%%rax), %%rdi"  "\n\t"
2127         "movq 8(%%rax), %%r11"  "\n\t"
2128#ifndef VGP_amd64_darwin
2129         "popcntw %%di,  %%r11w"  "\n\t"
2130#else
2131         "popcnt  %%di,  %%r11w"  "\n\t"
2132#endif
2133         "movq %%r11, 16(%%rax)"  "\n\t"
2134         "pushfq"                 "\n\t"
2135         "popq %%r12"             "\n\t"
2136         "movq %%r12, 24(%%rax)"  "\n"
2137         : /*out*/
2138         : /*in*/"r"(&block[0])
2139         : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2140      );
2141      printf("r popcntw  %016llx %016llx  %016llx %016llx\n",
2142             block[0], block[1], block[2], block[3] & oszacp_mask);
2143
2144      block[0] = i == 0 ? 0 : randULong();
2145      block[1] = randULong();
2146      block[2] = randULong();
2147      block[3] = randULong();
2148      __asm__ __volatile__(
2149         "movq %0,       %%rax"  "\n\t"
2150         "movq 8(%%rax), %%r11"  "\n\t"
2151#ifndef VGP_amd64_darwin
2152         "popcntw 0(%%rax), %%r11w"  "\n\t"
2153#else
2154         "popcnt  0(%%rax), %%r11w"  "\n\t"
2155#endif
2156         "movq %%r11, 16(%%rax)"  "\n\t"
2157         "pushfq"                 "\n\t"
2158         "popq %%r12"             "\n\t"
2159         "movq %%r12, 24(%%rax)"  "\n"
2160         : /*out*/
2161         : /*in*/"r"(&block[0])
2162         : /*trash*/ "cc", "memory", "r11", "r12"
2163      );
2164      printf("m popcntw  %016llx %016llx  %016llx %016llx\n",
2165             block[0], block[1], block[2], block[3] & oszacp_mask);
2166   }
2167}
2168
2169
2170void test_PCMPGTQ ( void )
2171{
2172   V128 spec[7];
2173   do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL );
2174   do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL );
2175   do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL );
2176   do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL );
2177   do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL );
2178   do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL );
2179   do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL );
2180
2181   V128 src, dst;
2182   Int i, j;
2183   for (i = 0; i < 10; i++) {
2184      randV128(&src);
2185      randV128(&dst);
2186      DO_mandr_r("pcmpgtq", src, dst);
2187   }
2188   for (i = 0; i < 7; i++) {
2189      for (j = 0; j < 7; j++) {
2190         memcpy(&src, &spec[i], 16);
2191         memcpy(&dst, &spec[j], 16);
2192         DO_mandr_r("pcmpgtq", src, dst);
2193      }
2194   }
2195}
2196
2197/* ------------ ROUNDSD ------------ */
2198
2199void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
2200{
2201   if (mem) {
2202      __asm__ __volatile__(
2203         "movupd  (%1), %%xmm11"       "\n\t"
2204         "roundsd $0, (%0), %%xmm11"   "\n\t"
2205         "movupd  %%xmm11, (%1)"       "\n"
2206         : /*OUT*/
2207         : /*IN*/ "r"(src), "r"(dst)
2208         : /*TRASH*/ "xmm11"
2209      );
2210   } else {
2211      __asm__ __volatile__(
2212         "movupd  (%1), %%xmm11"         "\n\t"
2213         "movupd  (%0), %%xmm2"          "\n\t"
2214         "roundsd $0, %%xmm2, %%xmm11"   "\n\t"
2215         "movupd  %%xmm11, (%1)"         "\n"
2216         : /*OUT*/
2217         : /*IN*/ "r"(src), "r"(dst)
2218         : /*TRASH*/ "xmm11","xmm2"
2219      );
2220   }
2221}
2222
2223void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
2224{
2225   if (mem) {
2226      __asm__ __volatile__(
2227         "movupd  (%1), %%xmm11"       "\n\t"
2228         "roundsd $1, (%0), %%xmm11"   "\n\t"
2229         "movupd  %%xmm11, (%1)"       "\n"
2230         : /*OUT*/
2231         : /*IN*/ "r"(src), "r"(dst)
2232         : /*TRASH*/ "xmm11"
2233      );
2234   } else {
2235      __asm__ __volatile__(
2236         "movupd  (%1), %%xmm11"         "\n\t"
2237         "movupd  (%0), %%xmm2"          "\n\t"
2238         "roundsd $1, %%xmm2, %%xmm11"   "\n\t"
2239         "movupd  %%xmm11, (%1)"         "\n"
2240         : /*OUT*/
2241         : /*IN*/ "r"(src), "r"(dst)
2242         : /*TRASH*/ "xmm11","xmm2"
2243      );
2244   }
2245}
2246
2247void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
2248{
2249   if (mem) {
2250      __asm__ __volatile__(
2251         "movupd  (%1), %%xmm11"       "\n\t"
2252         "roundsd $2, (%0), %%xmm11"   "\n\t"
2253         "movupd  %%xmm11, (%1)"       "\n"
2254         : /*OUT*/
2255         : /*IN*/ "r"(src), "r"(dst)
2256         : /*TRASH*/ "xmm11"
2257      );
2258   } else {
2259      __asm__ __volatile__(
2260         "movupd  (%1), %%xmm11"         "\n\t"
2261         "movupd  (%0), %%xmm2"          "\n\t"
2262         "roundsd $2, %%xmm2, %%xmm11"   "\n\t"
2263         "movupd  %%xmm11, (%1)"         "\n"
2264         : /*OUT*/
2265         : /*IN*/ "r"(src), "r"(dst)
2266         : /*TRASH*/ "xmm11","xmm2"
2267      );
2268   }
2269}
2270
2271void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
2272{
2273   if (mem) {
2274      __asm__ __volatile__(
2275         "movupd  (%1), %%xmm11"       "\n\t"
2276         "roundsd $3, (%0), %%xmm11"   "\n\t"
2277         "movupd  %%xmm11, (%1)"       "\n"
2278         : /*OUT*/
2279         : /*IN*/ "r"(src), "r"(dst)
2280         : /*TRASH*/ "xmm11"
2281      );
2282   } else {
2283      __asm__ __volatile__(
2284         "movupd  (%1), %%xmm11"         "\n\t"
2285         "movupd  (%0), %%xmm2"          "\n\t"
2286         "roundsd $3, %%xmm2, %%xmm11"   "\n\t"
2287         "movupd  %%xmm11, (%1)"         "\n"
2288         : /*OUT*/
2289         : /*IN*/ "r"(src), "r"(dst)
2290         : /*TRASH*/ "xmm11","xmm2"
2291      );
2292   }
2293}
2294
2295void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
2296{
2297   if (mem) {
2298      __asm__ __volatile__(
2299         "movupd  (%1), %%xmm11"       "\n\t"
2300         "roundsd $4, (%0), %%xmm11"   "\n\t"
2301         "movupd  %%xmm11, (%1)"       "\n"
2302         : /*OUT*/
2303         : /*IN*/ "r"(src), "r"(dst)
2304         : /*TRASH*/ "xmm11"
2305      );
2306   } else {
2307      __asm__ __volatile__(
2308         "movupd  (%1), %%xmm11"         "\n\t"
2309         "movupd  (%0), %%xmm2"          "\n\t"
2310         "roundsd $4, %%xmm2, %%xmm11"   "\n\t"
2311         "movupd  %%xmm11, (%1)"         "\n"
2312         : /*OUT*/
2313         : /*IN*/ "r"(src), "r"(dst)
2314         : /*TRASH*/ "xmm11","xmm2"
2315      );
2316   }
2317}
2318
2319void test_ROUNDSD_w_immediate_rounding ( void )
2320{
2321   double vals[22];
2322   Int i = 0;
2323   vals[i++] = 0.0;
2324   vals[i++] = -0.0;
2325   vals[i++] = mkPosInf();
2326   vals[i++] = mkNegInf();
2327   vals[i++] = mkPosNan();
2328   vals[i++] = mkNegNan();
2329   vals[i++] = -1.3;
2330   vals[i++] = -1.1;
2331   vals[i++] = -0.9;
2332   vals[i++] = -0.7;
2333   vals[i++] = -0.50001;
2334   vals[i++] = -0.49999;
2335   vals[i++] = -0.3;
2336   vals[i++] = -0.1;
2337   vals[i++] = 0.1;
2338   vals[i++] = 0.3;
2339   vals[i++] = 0.49999;
2340   vals[i++] = 0.50001;
2341   vals[i++] = 0.7;
2342   vals[i++] = 0.9;
2343   vals[i++] = 1.1;
2344   vals[i++] = 1.3;
2345   assert(i == 22);
2346
2347   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2348      V128 src, dst;
2349
2350      randV128(&src);
2351      randV128(&dst);
2352      memcpy(&src[0], &vals[i], 8);
2353      do_ROUNDSD_000(False/*reg*/, &src, &dst);
2354      printf("r roundsd_000  ");
2355      showV128(&src);
2356      printf(" ");
2357      showV128(&dst);
2358      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2359      printf("\n");
2360
2361      randV128(&src);
2362      randV128(&dst);
2363      memcpy(&src[0], &vals[i], 8);
2364      do_ROUNDSD_000(True/*mem*/, &src, &dst);
2365      printf("m roundsd_000  ");
2366      showV128(&src);
2367      printf(" ");
2368      showV128(&dst);
2369      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2370      printf("\n");
2371
2372
2373      randV128(&src);
2374      randV128(&dst);
2375      memcpy(&src[0], &vals[i], 8);
2376      do_ROUNDSD_001(False/*reg*/, &src, &dst);
2377      printf("r roundsd_001  ");
2378      showV128(&src);
2379      printf(" ");
2380      showV128(&dst);
2381      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2382      printf("\n");
2383
2384      randV128(&src);
2385      randV128(&dst);
2386      memcpy(&src[0], &vals[i], 8);
2387      do_ROUNDSD_001(True/*mem*/, &src, &dst);
2388      printf("m roundsd_001  ");
2389      showV128(&src);
2390      printf(" ");
2391      showV128(&dst);
2392      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2393      printf("\n");
2394
2395
2396      randV128(&src);
2397      randV128(&dst);
2398      memcpy(&src[0], &vals[i], 8);
2399      do_ROUNDSD_010(False/*reg*/, &src, &dst);
2400      printf("r roundsd_010  ");
2401      showV128(&src);
2402      printf(" ");
2403      showV128(&dst);
2404      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2405      printf("\n");
2406
2407      randV128(&src);
2408      randV128(&dst);
2409      memcpy(&src[0], &vals[i], 8);
2410      do_ROUNDSD_010(True/*mem*/, &src, &dst);
2411      printf("m roundsd_010  ");
2412      showV128(&src);
2413      printf(" ");
2414      showV128(&dst);
2415      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2416      printf("\n");
2417
2418
2419      randV128(&src);
2420      randV128(&dst);
2421      memcpy(&src[0], &vals[i], 8);
2422      do_ROUNDSD_011(False/*reg*/, &src, &dst);
2423      printf("r roundsd_011  ");
2424      showV128(&src);
2425      printf(" ");
2426      showV128(&dst);
2427      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2428      printf("\n");
2429
2430      randV128(&src);
2431      randV128(&dst);
2432      memcpy(&src[0], &vals[i], 8);
2433      do_ROUNDSD_011(True/*mem*/, &src, &dst);
2434      printf("m roundsd_011  ");
2435      showV128(&src);
2436      printf(" ");
2437      showV128(&dst);
2438      printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2439      printf("\n");
2440   }
2441}
2442
2443void test_ROUNDSD_w_mxcsr_rounding ( void )
2444{
2445   UInt rm;
2446   double vals[22];
2447   Int i = 0;
2448   vals[i++] = 0.0;
2449   vals[i++] = -0.0;
2450   vals[i++] = mkPosInf();
2451   vals[i++] = mkNegInf();
2452   vals[i++] = mkPosNan();
2453   vals[i++] = mkNegNan();
2454   vals[i++] = -1.3;
2455   vals[i++] = -1.1;
2456   vals[i++] = -0.9;
2457   vals[i++] = -0.7;
2458   vals[i++] = -0.50001;
2459   vals[i++] = -0.49999;
2460   vals[i++] = -0.3;
2461   vals[i++] = -0.1;
2462   vals[i++] = 0.1;
2463   vals[i++] = 0.3;
2464   vals[i++] = 0.49999;
2465   vals[i++] = 0.50001;
2466   vals[i++] = 0.7;
2467   vals[i++] = 0.9;
2468   vals[i++] = 1.1;
2469   vals[i++] = 1.3;
2470   assert(i == 22);
2471
2472   rm = get_sse_roundingmode();
2473   assert(rm == 0); // 0 == RN == default
2474
2475   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2476      V128 src, dst;
2477
2478      for (rm = 0; rm <= 3; rm++) {
2479         set_sse_roundingmode(rm);
2480
2481         randV128(&src);
2482         randV128(&dst);
2483         memcpy(&src[0], &vals[i], 8);
2484         do_ROUNDSD_1XX(False/*reg*/, &src, &dst);
2485         printf("r (rm=%u) roundsd_1XX  ", rm);
2486         showV128(&src);
2487         printf(" ");
2488         showV128(&dst);
2489         printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2490         printf("\n");
2491
2492         randV128(&src);
2493         randV128(&dst);
2494         memcpy(&src[0], &vals[i], 8);
2495         do_ROUNDSD_1XX(True/*mem*/, &src, &dst);
2496         printf("m (rm=%u) roundsd_1XX  ", rm);
2497         showV128(&src);
2498         printf(" ");
2499         showV128(&dst);
2500         printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
2501         printf("\n");
2502      }
2503   }
2504
2505   rm = get_sse_roundingmode();
2506   assert(rm == 3);
2507   set_sse_roundingmode(0);
2508   rm = get_sse_roundingmode();
2509   assert(rm == 0); // 0 == RN == default
2510}
2511
2512
2513/* ------------ ROUNDSS ------------ */
2514
2515void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
2516{
2517   if (mem) {
2518      __asm__ __volatile__(
2519         "movupd  (%1), %%xmm11"       "\n\t"
2520         "roundss $0, (%0), %%xmm11"   "\n\t"
2521         "movupd  %%xmm11, (%1)"       "\n"
2522         : /*OUT*/
2523         : /*IN*/ "r"(src), "r"(dst)
2524         : /*TRASH*/ "xmm11"
2525      );
2526   } else {
2527      __asm__ __volatile__(
2528         "movupd  (%1), %%xmm11"         "\n\t"
2529         "movupd  (%0), %%xmm2"          "\n\t"
2530         "roundss $0, %%xmm2, %%xmm11"   "\n\t"
2531         "movupd  %%xmm11, (%1)"         "\n"
2532         : /*OUT*/
2533         : /*IN*/ "r"(src), "r"(dst)
2534         : /*TRASH*/ "xmm11","xmm2"
2535      );
2536   }
2537}
2538
2539void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
2540{
2541   if (mem) {
2542      __asm__ __volatile__(
2543         "movupd  (%1), %%xmm11"       "\n\t"
2544         "roundss $1, (%0), %%xmm11"   "\n\t"
2545         "movupd  %%xmm11, (%1)"       "\n"
2546         : /*OUT*/
2547         : /*IN*/ "r"(src), "r"(dst)
2548         : /*TRASH*/ "xmm11"
2549      );
2550   } else {
2551      __asm__ __volatile__(
2552         "movupd  (%1), %%xmm11"         "\n\t"
2553         "movupd  (%0), %%xmm2"          "\n\t"
2554         "roundss $1, %%xmm2, %%xmm11"   "\n\t"
2555         "movupd  %%xmm11, (%1)"         "\n"
2556         : /*OUT*/
2557         : /*IN*/ "r"(src), "r"(dst)
2558         : /*TRASH*/ "xmm11","xmm2"
2559      );
2560   }
2561}
2562
2563void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
2564{
2565   if (mem) {
2566      __asm__ __volatile__(
2567         "movupd  (%1), %%xmm11"       "\n\t"
2568         "roundss $2, (%0), %%xmm11"   "\n\t"
2569         "movupd  %%xmm11, (%1)"       "\n"
2570         : /*OUT*/
2571         : /*IN*/ "r"(src), "r"(dst)
2572         : /*TRASH*/ "xmm11"
2573      );
2574   } else {
2575      __asm__ __volatile__(
2576         "movupd  (%1), %%xmm11"         "\n\t"
2577         "movupd  (%0), %%xmm2"          "\n\t"
2578         "roundss $2, %%xmm2, %%xmm11"   "\n\t"
2579         "movupd  %%xmm11, (%1)"         "\n"
2580         : /*OUT*/
2581         : /*IN*/ "r"(src), "r"(dst)
2582         : /*TRASH*/ "xmm11","xmm2"
2583      );
2584   }
2585}
2586
2587void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
2588{
2589   if (mem) {
2590      __asm__ __volatile__(
2591         "movupd  (%1), %%xmm11"       "\n\t"
2592         "roundss $3, (%0), %%xmm11"   "\n\t"
2593         "movupd  %%xmm11, (%1)"       "\n"
2594         : /*OUT*/
2595         : /*IN*/ "r"(src), "r"(dst)
2596         : /*TRASH*/ "xmm11"
2597      );
2598   } else {
2599      __asm__ __volatile__(
2600         "movupd  (%1), %%xmm11"         "\n\t"
2601         "movupd  (%0), %%xmm2"          "\n\t"
2602         "roundss $3, %%xmm2, %%xmm11"   "\n\t"
2603         "movupd  %%xmm11, (%1)"         "\n"
2604         : /*OUT*/
2605         : /*IN*/ "r"(src), "r"(dst)
2606         : /*TRASH*/ "xmm11","xmm2"
2607      );
2608   }
2609}
2610
2611void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
2612{
2613   if (mem) {
2614      __asm__ __volatile__(
2615         "movupd  (%1), %%xmm11"       "\n\t"
2616         "roundss $4, (%0), %%xmm11"   "\n\t"
2617         "movupd  %%xmm11, (%1)"       "\n"
2618         : /*OUT*/
2619         : /*IN*/ "r"(src), "r"(dst)
2620         : /*TRASH*/ "xmm11"
2621      );
2622   } else {
2623      __asm__ __volatile__(
2624         "movupd  (%1), %%xmm11"         "\n\t"
2625         "movupd  (%0), %%xmm2"          "\n\t"
2626         "roundss $4, %%xmm2, %%xmm11"   "\n\t"
2627         "movupd  %%xmm11, (%1)"         "\n"
2628         : /*OUT*/
2629         : /*IN*/ "r"(src), "r"(dst)
2630         : /*TRASH*/ "xmm11","xmm2"
2631      );
2632   }
2633}
2634
2635void test_ROUNDSS_w_immediate_rounding ( void )
2636{
2637   float vals[22];
2638   Int i = 0;
2639   vals[i++] = 0.0;
2640   vals[i++] = -0.0;
2641   vals[i++] = mkPosInf();
2642   vals[i++] = mkNegInf();
2643   vals[i++] = mkPosNan();
2644   vals[i++] = mkNegNan();
2645   vals[i++] = -1.3;
2646   vals[i++] = -1.1;
2647   vals[i++] = -0.9;
2648   vals[i++] = -0.7;
2649   vals[i++] = -0.50001;
2650   vals[i++] = -0.49999;
2651   vals[i++] = -0.3;
2652   vals[i++] = -0.1;
2653   vals[i++] = 0.1;
2654   vals[i++] = 0.3;
2655   vals[i++] = 0.49999;
2656   vals[i++] = 0.50001;
2657   vals[i++] = 0.7;
2658   vals[i++] = 0.9;
2659   vals[i++] = 1.1;
2660   vals[i++] = 1.3;
2661   assert(i == 22);
2662
2663   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2664      V128 src, dst;
2665
2666      randV128(&src);
2667      randV128(&dst);
2668      memcpy(&src[0], &vals[i], 4);
2669      do_ROUNDSS_000(False/*reg*/, &src, &dst);
2670      printf("r roundss_000  ");
2671      showV128(&src);
2672      printf(" ");
2673      showV128(&dst);
2674      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2675      printf("\n");
2676
2677      randV128(&src);
2678      randV128(&dst);
2679      memcpy(&src[0], &vals[i], 4);
2680      do_ROUNDSS_000(True/*mem*/, &src, &dst);
2681      printf("m roundss_000  ");
2682      showV128(&src);
2683      printf(" ");
2684      showV128(&dst);
2685      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2686      printf("\n");
2687
2688
2689      randV128(&src);
2690      randV128(&dst);
2691      memcpy(&src[0], &vals[i], 4);
2692      do_ROUNDSS_001(False/*reg*/, &src, &dst);
2693      printf("r roundss_001  ");
2694      showV128(&src);
2695      printf(" ");
2696      showV128(&dst);
2697      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2698      printf("\n");
2699
2700      randV128(&src);
2701      randV128(&dst);
2702      memcpy(&src[0], &vals[i], 4);
2703      do_ROUNDSS_001(True/*mem*/, &src, &dst);
2704      printf("m roundss_001  ");
2705      showV128(&src);
2706      printf(" ");
2707      showV128(&dst);
2708      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2709      printf("\n");
2710
2711
2712      randV128(&src);
2713      randV128(&dst);
2714      memcpy(&src[0], &vals[i], 4);
2715      do_ROUNDSS_010(False/*reg*/, &src, &dst);
2716      printf("r roundss_010  ");
2717      showV128(&src);
2718      printf(" ");
2719      showV128(&dst);
2720      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2721      printf("\n");
2722
2723      randV128(&src);
2724      randV128(&dst);
2725      memcpy(&src[0], &vals[i], 4);
2726      do_ROUNDSS_010(True/*mem*/, &src, &dst);
2727      printf("m roundss_010  ");
2728      showV128(&src);
2729      printf(" ");
2730      showV128(&dst);
2731      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2732      printf("\n");
2733
2734
2735      randV128(&src);
2736      randV128(&dst);
2737      memcpy(&src[0], &vals[i], 4);
2738      do_ROUNDSS_011(False/*reg*/, &src, &dst);
2739      printf("r roundss_011  ");
2740      showV128(&src);
2741      printf(" ");
2742      showV128(&dst);
2743      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2744      printf("\n");
2745
2746      randV128(&src);
2747      randV128(&dst);
2748      memcpy(&src[0], &vals[i], 4);
2749      do_ROUNDSS_011(True/*mem*/, &src, &dst);
2750      printf("m roundss_011  ");
2751      showV128(&src);
2752      printf(" ");
2753      showV128(&dst);
2754      printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2755      printf("\n");
2756   }
2757}
2758
2759void test_ROUNDSS_w_mxcsr_rounding ( void )
2760{
2761   UInt rm;
2762   float vals[22];
2763   Int i = 0;
2764   vals[i++] = 0.0;
2765   vals[i++] = -0.0;
2766   vals[i++] = mkPosInf();
2767   vals[i++] = mkNegInf();
2768   vals[i++] = mkPosNan();
2769   vals[i++] = mkNegNan();
2770   vals[i++] = -1.3;
2771   vals[i++] = -1.1;
2772   vals[i++] = -0.9;
2773   vals[i++] = -0.7;
2774   vals[i++] = -0.50001;
2775   vals[i++] = -0.49999;
2776   vals[i++] = -0.3;
2777   vals[i++] = -0.1;
2778   vals[i++] = 0.1;
2779   vals[i++] = 0.3;
2780   vals[i++] = 0.49999;
2781   vals[i++] = 0.50001;
2782   vals[i++] = 0.7;
2783   vals[i++] = 0.9;
2784   vals[i++] = 1.1;
2785   vals[i++] = 1.3;
2786   assert(i == 22);
2787
2788   rm = get_sse_roundingmode();
2789   assert(rm == 0); // 0 == RN == default
2790
2791   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2792      V128 src, dst;
2793
2794      for (rm = 0; rm <= 3; rm++) {
2795         set_sse_roundingmode(rm);
2796
2797         randV128(&src);
2798         randV128(&dst);
2799         memcpy(&src[0], &vals[i], 4);
2800         do_ROUNDSS_1XX(False/*reg*/, &src, &dst);
2801         printf("r (rm=%u) roundss_1XX  ", rm);
2802         showV128(&src);
2803         printf(" ");
2804         showV128(&dst);
2805         printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2806         printf("\n");
2807
2808         randV128(&src);
2809         randV128(&dst);
2810         memcpy(&src[0], &vals[i], 4);
2811         do_ROUNDSS_1XX(True/*mem*/, &src, &dst);
2812         printf("m (rm=%u) roundss_1XX  ", rm);
2813         showV128(&src);
2814         printf(" ");
2815         showV128(&dst);
2816         printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2817         printf("\n");
2818      }
2819   }
2820
2821   rm = get_sse_roundingmode();
2822   assert(rm == 3);
2823   set_sse_roundingmode(0);
2824   rm = get_sse_roundingmode();
2825   assert(rm == 0); // 0 == RN == default
2826}
2827
2828/* ------------ ROUNDPD ------------ */
2829
2830void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
2831{
2832   if (mem) {
2833      __asm__ __volatile__(
2834         "movupd  (%1), %%xmm11"       "\n\t"
2835         "roundpd $0, (%0), %%xmm11"   "\n\t"
2836         "movupd  %%xmm11, (%1)"       "\n"
2837         : /*OUT*/
2838         : /*IN*/ "r"(src), "r"(dst)
2839         : /*TRASH*/ "xmm11"
2840      );
2841   } else {
2842      __asm__ __volatile__(
2843         "movupd  (%1), %%xmm11"         "\n\t"
2844         "movupd  (%0), %%xmm2"          "\n\t"
2845         "roundpd $0, %%xmm2, %%xmm11"   "\n\t"
2846         "movupd  %%xmm11, (%1)"         "\n"
2847         : /*OUT*/
2848         : /*IN*/ "r"(src), "r"(dst)
2849         : /*TRASH*/ "xmm11","xmm2"
2850      );
2851   }
2852}
2853
2854void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
2855{
2856   if (mem) {
2857      __asm__ __volatile__(
2858         "movupd  (%1), %%xmm11"       "\n\t"
2859         "roundpd $1, (%0), %%xmm11"   "\n\t"
2860         "movupd  %%xmm11, (%1)"       "\n"
2861         : /*OUT*/
2862         : /*IN*/ "r"(src), "r"(dst)
2863         : /*TRASH*/ "xmm11"
2864      );
2865   } else {
2866      __asm__ __volatile__(
2867         "movupd  (%1), %%xmm11"         "\n\t"
2868         "movupd  (%0), %%xmm2"          "\n\t"
2869         "roundpd $1, %%xmm2, %%xmm11"   "\n\t"
2870         "movupd  %%xmm11, (%1)"         "\n"
2871         : /*OUT*/
2872         : /*IN*/ "r"(src), "r"(dst)
2873         : /*TRASH*/ "xmm11","xmm2"
2874      );
2875   }
2876}
2877
2878void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
2879{
2880   if (mem) {
2881      __asm__ __volatile__(
2882         "movupd  (%1), %%xmm11"       "\n\t"
2883         "roundpd $2, (%0), %%xmm11"   "\n\t"
2884         "movupd  %%xmm11, (%1)"       "\n"
2885         : /*OUT*/
2886         : /*IN*/ "r"(src), "r"(dst)
2887         : /*TRASH*/ "xmm11"
2888      );
2889   } else {
2890      __asm__ __volatile__(
2891         "movupd  (%1), %%xmm11"         "\n\t"
2892         "movupd  (%0), %%xmm2"          "\n\t"
2893         "roundpd $2, %%xmm2, %%xmm11"   "\n\t"
2894         "movupd  %%xmm11, (%1)"         "\n"
2895         : /*OUT*/
2896         : /*IN*/ "r"(src), "r"(dst)
2897         : /*TRASH*/ "xmm11","xmm2"
2898      );
2899   }
2900}
2901
2902void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
2903{
2904   if (mem) {
2905      __asm__ __volatile__(
2906         "movupd  (%1), %%xmm11"       "\n\t"
2907         "roundpd $3, (%0), %%xmm11"   "\n\t"
2908         "movupd  %%xmm11, (%1)"       "\n"
2909         : /*OUT*/
2910         : /*IN*/ "r"(src), "r"(dst)
2911         : /*TRASH*/ "xmm11"
2912      );
2913   } else {
2914      __asm__ __volatile__(
2915         "movupd  (%1), %%xmm11"         "\n\t"
2916         "movupd  (%0), %%xmm2"          "\n\t"
2917         "roundpd $3, %%xmm2, %%xmm11"   "\n\t"
2918         "movupd  %%xmm11, (%1)"         "\n"
2919         : /*OUT*/
2920         : /*IN*/ "r"(src), "r"(dst)
2921         : /*TRASH*/ "xmm11","xmm2"
2922      );
2923   }
2924}
2925
2926void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
2927{
2928   if (mem) {
2929      __asm__ __volatile__(
2930         "movupd  (%1), %%xmm11"       "\n\t"
2931         "roundpd $4, (%0), %%xmm11"   "\n\t"
2932         "movupd  %%xmm11, (%1)"       "\n"
2933         : /*OUT*/
2934         : /*IN*/ "r"(src), "r"(dst)
2935         : /*TRASH*/ "xmm11"
2936      );
2937   } else {
2938      __asm__ __volatile__(
2939         "movupd  (%1), %%xmm11"         "\n\t"
2940         "movupd  (%0), %%xmm2"          "\n\t"
2941         "roundpd $4, %%xmm2, %%xmm11"   "\n\t"
2942         "movupd  %%xmm11, (%1)"         "\n"
2943         : /*OUT*/
2944         : /*IN*/ "r"(src), "r"(dst)
2945         : /*TRASH*/ "xmm11","xmm2"
2946      );
2947   }
2948}
2949
2950void test_ROUNDPD_w_immediate_rounding ( void )
2951{
2952   double vals[22];
2953   Int i = 0;
2954   vals[i++] = 0.0;
2955   vals[i++] = -0.0;
2956   vals[i++] = mkPosInf();
2957   vals[i++] = mkNegInf();
2958   vals[i++] = mkPosNan();
2959   vals[i++] = mkNegNan();
2960   vals[i++] = -1.3;
2961   vals[i++] = -1.1;
2962   vals[i++] = -0.9;
2963   vals[i++] = -0.7;
2964   vals[i++] = -0.50001;
2965   vals[i++] = -0.49999;
2966   vals[i++] = -0.3;
2967   vals[i++] = -0.1;
2968   vals[i++] = 0.1;
2969   vals[i++] = 0.3;
2970   vals[i++] = 0.49999;
2971   vals[i++] = 0.50001;
2972   vals[i++] = 0.7;
2973   vals[i++] = 0.9;
2974   vals[i++] = 1.1;
2975   vals[i++] = 1.3;
2976   assert(i == 22);
2977
2978   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2979      V128 src, dst;
2980
2981      randV128(&src);
2982      randV128(&dst);
2983      memcpy(&src[0], &vals[i], 8);
2984      memcpy(&src[8], &vals[(i+11)%22], 8);
2985      do_ROUNDPD_000(False/*reg*/, &src, &dst);
2986      printf("r roundpd_000  ");
2987      showV128(&src);
2988      printf(" ");
2989      showV128(&dst);
2990      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
2991      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
2992      printf("\n");
2993
2994      randV128(&src);
2995      randV128(&dst);
2996      memcpy(&src[0], &vals[i], 8);
2997      memcpy(&src[8], &vals[(i+11)%22], 8);
2998      do_ROUNDPD_000(True/*mem*/, &src, &dst);
2999      printf("m roundpd_000  ");
3000      showV128(&src);
3001      printf(" ");
3002      showV128(&dst);
3003      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3004      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3005      printf("\n");
3006
3007
3008      randV128(&src);
3009      randV128(&dst);
3010      memcpy(&src[0], &vals[i], 8);
3011      memcpy(&src[8], &vals[(i+11)%22], 8);
3012      do_ROUNDPD_001(False/*reg*/, &src, &dst);
3013      printf("r roundpd_001  ");
3014      showV128(&src);
3015      printf(" ");
3016      showV128(&dst);
3017      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3018      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3019      printf("\n");
3020
3021      randV128(&src);
3022      randV128(&dst);
3023      memcpy(&src[0], &vals[i], 8);
3024      memcpy(&src[8], &vals[(i+11)%22], 8);
3025      do_ROUNDPD_001(True/*mem*/, &src, &dst);
3026      printf("m roundpd_001  ");
3027      showV128(&src);
3028      printf(" ");
3029      showV128(&dst);
3030      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3031      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3032      printf("\n");
3033
3034
3035      randV128(&src);
3036      randV128(&dst);
3037      memcpy(&src[0], &vals[i], 8);
3038      memcpy(&src[8], &vals[(i+11)%22], 8);
3039      do_ROUNDPD_010(False/*reg*/, &src, &dst);
3040      printf("r roundpd_010  ");
3041      showV128(&src);
3042      printf(" ");
3043      showV128(&dst);
3044      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3045      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3046      printf("\n");
3047
3048      randV128(&src);
3049      randV128(&dst);
3050      memcpy(&src[0], &vals[i], 8);
3051      memcpy(&src[8], &vals[(i+11)%22], 8);
3052      do_ROUNDPD_010(True/*mem*/, &src, &dst);
3053      printf("m roundpd_010  ");
3054      showV128(&src);
3055      printf(" ");
3056      showV128(&dst);
3057      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3058      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3059      printf("\n");
3060
3061
3062      randV128(&src);
3063      randV128(&dst);
3064      memcpy(&src[0], &vals[i], 8);
3065      memcpy(&src[8], &vals[(i+11)%22], 8);
3066      do_ROUNDPD_011(False/*reg*/, &src, &dst);
3067      printf("r roundpd_011  ");
3068      showV128(&src);
3069      printf(" ");
3070      showV128(&dst);
3071      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3072      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3073      printf("\n");
3074
3075      randV128(&src);
3076      randV128(&dst);
3077      memcpy(&src[0], &vals[i], 8);
3078      memcpy(&src[8], &vals[(i+11)%22], 8);
3079      do_ROUNDPD_011(True/*mem*/, &src, &dst);
3080      printf("m roundpd_011  ");
3081      showV128(&src);
3082      printf(" ");
3083      showV128(&dst);
3084      printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3085      printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3086      printf("\n");
3087   }
3088}
3089
3090void test_ROUNDPD_w_mxcsr_rounding ( void )
3091{
3092   UInt rm;
3093   double vals[22];
3094   Int i = 0;
3095   vals[i++] = 0.0;
3096   vals[i++] = -0.0;
3097   vals[i++] = mkPosInf();
3098   vals[i++] = mkNegInf();
3099   vals[i++] = mkPosNan();
3100   vals[i++] = mkNegNan();
3101   vals[i++] = -1.3;
3102   vals[i++] = -1.1;
3103   vals[i++] = -0.9;
3104   vals[i++] = -0.7;
3105   vals[i++] = -0.50001;
3106   vals[i++] = -0.49999;
3107   vals[i++] = -0.3;
3108   vals[i++] = -0.1;
3109   vals[i++] = 0.1;
3110   vals[i++] = 0.3;
3111   vals[i++] = 0.49999;
3112   vals[i++] = 0.50001;
3113   vals[i++] = 0.7;
3114   vals[i++] = 0.9;
3115   vals[i++] = 1.1;
3116   vals[i++] = 1.3;
3117   assert(i == 22);
3118
3119   rm = get_sse_roundingmode();
3120   assert(rm == 0); // 0 == RN == default
3121
3122   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3123      V128 src, dst;
3124
3125      for (rm = 0; rm <= 3; rm++) {
3126         set_sse_roundingmode(rm);
3127
3128         randV128(&src);
3129         randV128(&dst);
3130         memcpy(&src[0], &vals[i], 8);
3131         memcpy(&src[8], &vals[(i+11)%22], 8);
3132         do_ROUNDPD_1XX(False/*reg*/, &src, &dst);
3133         printf("r (rm=%u) roundpd_1XX  ", rm);
3134         showV128(&src);
3135         printf(" ");
3136         showV128(&dst);
3137         printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3138         printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3139         printf("\n");
3140
3141         randV128(&src);
3142         randV128(&dst);
3143         memcpy(&src[0], &vals[i], 8);
3144         memcpy(&src[8], &vals[(i+11)%22], 8);
3145         do_ROUNDPD_1XX(True/*mem*/, &src, &dst);
3146         printf("m (rm=%u) roundpd_1XX  ", rm);
3147         showV128(&src);
3148         printf(" ");
3149         showV128(&dst);
3150         printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
3151         printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3152         printf("\n");
3153      }
3154   }
3155
3156   rm = get_sse_roundingmode();
3157   assert(rm == 3);
3158   set_sse_roundingmode(0);
3159   rm = get_sse_roundingmode();
3160   assert(rm == 0); // 0 == RN == default
3161}
3162
3163/* ------------ ROUNDPS ------------ */
3164
3165void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
3166{
3167   if (mem) {
3168      __asm__ __volatile__(
3169         "movupd  (%1), %%xmm11"       "\n\t"
3170         "roundps $0, (%0), %%xmm11"   "\n\t"
3171         "movupd  %%xmm11, (%1)"       "\n"
3172         : /*OUT*/
3173         : /*IN*/ "r"(src), "r"(dst)
3174         : /*TRASH*/ "xmm11"
3175      );
3176   } else {
3177      __asm__ __volatile__(
3178         "movupd  (%1), %%xmm11"         "\n\t"
3179         "movupd  (%0), %%xmm2"          "\n\t"
3180         "roundps $0, %%xmm2, %%xmm11"   "\n\t"
3181         "movupd  %%xmm11, (%1)"         "\n"
3182         : /*OUT*/
3183         : /*IN*/ "r"(src), "r"(dst)
3184         : /*TRASH*/ "xmm11","xmm2"
3185      );
3186   }
3187}
3188
3189void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
3190{
3191   if (mem) {
3192      __asm__ __volatile__(
3193         "movupd  (%1), %%xmm11"       "\n\t"
3194         "roundps $1, (%0), %%xmm11"   "\n\t"
3195         "movupd  %%xmm11, (%1)"       "\n"
3196         : /*OUT*/
3197         : /*IN*/ "r"(src), "r"(dst)
3198         : /*TRASH*/ "xmm11"
3199      );
3200   } else {
3201      __asm__ __volatile__(
3202         "movupd  (%1), %%xmm11"         "\n\t"
3203         "movupd  (%0), %%xmm2"          "\n\t"
3204         "roundps $1, %%xmm2, %%xmm11"   "\n\t"
3205         "movupd  %%xmm11, (%1)"         "\n"
3206         : /*OUT*/
3207         : /*IN*/ "r"(src), "r"(dst)
3208         : /*TRASH*/ "xmm11","xmm2"
3209      );
3210   }
3211}
3212
3213void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
3214{
3215   if (mem) {
3216      __asm__ __volatile__(
3217         "movupd  (%1), %%xmm11"       "\n\t"
3218         "roundps $2, (%0), %%xmm11"   "\n\t"
3219         "movupd  %%xmm11, (%1)"       "\n"
3220         : /*OUT*/
3221         : /*IN*/ "r"(src), "r"(dst)
3222         : /*TRASH*/ "xmm11"
3223      );
3224   } else {
3225      __asm__ __volatile__(
3226         "movupd  (%1), %%xmm11"         "\n\t"
3227         "movupd  (%0), %%xmm2"          "\n\t"
3228         "roundps $2, %%xmm2, %%xmm11"   "\n\t"
3229         "movupd  %%xmm11, (%1)"         "\n"
3230         : /*OUT*/
3231         : /*IN*/ "r"(src), "r"(dst)
3232         : /*TRASH*/ "xmm11","xmm2"
3233      );
3234   }
3235}
3236
3237void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
3238{
3239   if (mem) {
3240      __asm__ __volatile__(
3241         "movupd  (%1), %%xmm11"       "\n\t"
3242         "roundps $3, (%0), %%xmm11"   "\n\t"
3243         "movupd  %%xmm11, (%1)"       "\n"
3244         : /*OUT*/
3245         : /*IN*/ "r"(src), "r"(dst)
3246         : /*TRASH*/ "xmm11"
3247      );
3248   } else {
3249      __asm__ __volatile__(
3250         "movupd  (%1), %%xmm11"         "\n\t"
3251         "movupd  (%0), %%xmm2"          "\n\t"
3252         "roundps $3, %%xmm2, %%xmm11"   "\n\t"
3253         "movupd  %%xmm11, (%1)"         "\n"
3254         : /*OUT*/
3255         : /*IN*/ "r"(src), "r"(dst)
3256         : /*TRASH*/ "xmm11","xmm2"
3257      );
3258   }
3259}
3260
3261void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
3262{
3263   if (mem) {
3264      __asm__ __volatile__(
3265         "movupd  (%1), %%xmm11"       "\n\t"
3266         "roundps $4, (%0), %%xmm11"   "\n\t"
3267         "movupd  %%xmm11, (%1)"       "\n"
3268         : /*OUT*/
3269         : /*IN*/ "r"(src), "r"(dst)
3270         : /*TRASH*/ "xmm11"
3271      );
3272   } else {
3273      __asm__ __volatile__(
3274         "movupd  (%1), %%xmm11"         "\n\t"
3275         "movupd  (%0), %%xmm2"          "\n\t"
3276         "roundps $4, %%xmm2, %%xmm11"   "\n\t"
3277         "movupd  %%xmm11, (%1)"         "\n"
3278         : /*OUT*/
3279         : /*IN*/ "r"(src), "r"(dst)
3280         : /*TRASH*/ "xmm11","xmm2"
3281      );
3282   }
3283}
3284
3285void test_ROUNDPS_w_immediate_rounding ( void )
3286{
3287   float vals[22];
3288   Int i = 0;
3289   vals[i++] = 0.0;
3290   vals[i++] = -0.0;
3291   vals[i++] = mkPosInf();
3292   vals[i++] = mkNegInf();
3293   vals[i++] = mkPosNan();
3294   vals[i++] = mkNegNan();
3295   vals[i++] = -1.3;
3296   vals[i++] = -1.1;
3297   vals[i++] = -0.9;
3298   vals[i++] = -0.7;
3299   vals[i++] = -0.50001;
3300   vals[i++] = -0.49999;
3301   vals[i++] = -0.3;
3302   vals[i++] = -0.1;
3303   vals[i++] = 0.1;
3304   vals[i++] = 0.3;
3305   vals[i++] = 0.49999;
3306   vals[i++] = 0.50001;
3307   vals[i++] = 0.7;
3308   vals[i++] = 0.9;
3309   vals[i++] = 1.1;
3310   vals[i++] = 1.3;
3311   assert(i == 22);
3312
3313   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3314      V128 src, dst;
3315
3316      randV128(&src);
3317      randV128(&dst);
3318      memcpy(&src[0], &vals[i], 4);
3319      memcpy(&src[4], &vals[(i+5)%22], 4);
3320      memcpy(&src[8], &vals[(i+11)%22], 4);
3321      memcpy(&src[12], &vals[(i+17)%22], 4);
3322      do_ROUNDPS_000(False/*reg*/, &src, &dst);
3323      printf("r roundps_000  ");
3324      showV128(&src);
3325      printf(" ");
3326      showV128(&dst);
3327      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3328      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3329      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3330      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3331      printf("\n");
3332
3333      randV128(&src);
3334      randV128(&dst);
3335      memcpy(&src[0], &vals[i], 4);
3336      memcpy(&src[4], &vals[(i+5)%22], 4);
3337      memcpy(&src[8], &vals[(i+11)%22], 4);
3338      memcpy(&src[12], &vals[(i+17)%22], 4);
3339      do_ROUNDPS_000(True/*mem*/, &src, &dst);
3340      printf("m roundps_000  ");
3341      showV128(&src);
3342      printf(" ");
3343      showV128(&dst);
3344      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3345      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3346      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3347      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3348      printf("\n");
3349
3350
3351      randV128(&src);
3352      randV128(&dst);
3353      memcpy(&src[0], &vals[i], 4);
3354      memcpy(&src[4], &vals[(i+5)%22], 4);
3355      memcpy(&src[8], &vals[(i+11)%22], 4);
3356      memcpy(&src[12], &vals[(i+17)%22], 4);
3357      do_ROUNDPS_001(False/*reg*/, &src, &dst);
3358      printf("r roundps_001  ");
3359      showV128(&src);
3360      printf(" ");
3361      showV128(&dst);
3362      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3363      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3364      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3365      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3366      printf("\n");
3367
3368      randV128(&src);
3369      randV128(&dst);
3370      memcpy(&src[0], &vals[i], 4);
3371      memcpy(&src[4], &vals[(i+5)%22], 4);
3372      memcpy(&src[8], &vals[(i+11)%22], 4);
3373      memcpy(&src[12], &vals[(i+17)%22], 4);
3374      do_ROUNDPS_001(True/*mem*/, &src, &dst);
3375      printf("m roundps_001  ");
3376      showV128(&src);
3377      printf(" ");
3378      showV128(&dst);
3379      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3380      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3381      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3382      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3383      printf("\n");
3384
3385
3386      randV128(&src);
3387      randV128(&dst);
3388      memcpy(&src[0], &vals[i], 4);
3389      memcpy(&src[4], &vals[(i+5)%22], 4);
3390      memcpy(&src[8], &vals[(i+11)%22], 4);
3391      memcpy(&src[12], &vals[(i+17)%22], 4);
3392      do_ROUNDPS_010(False/*reg*/, &src, &dst);
3393      printf("r roundps_010  ");
3394      showV128(&src);
3395      printf(" ");
3396      showV128(&dst);
3397      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3398      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3399      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3400      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3401      printf("\n");
3402
3403      randV128(&src);
3404      randV128(&dst);
3405      memcpy(&src[0], &vals[i], 4);
3406      memcpy(&src[4], &vals[(i+5)%22], 4);
3407      memcpy(&src[8], &vals[(i+11)%22], 4);
3408      memcpy(&src[12], &vals[(i+17)%22], 4);
3409      do_ROUNDPS_010(True/*mem*/, &src, &dst);
3410      printf("m roundps_010  ");
3411      showV128(&src);
3412      printf(" ");
3413      showV128(&dst);
3414      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3415      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3416      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3417      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3418      printf("\n");
3419
3420
3421      randV128(&src);
3422      randV128(&dst);
3423      memcpy(&src[0], &vals[i], 4);
3424      memcpy(&src[4], &vals[(i+5)%22], 4);
3425      memcpy(&src[8], &vals[(i+11)%22], 4);
3426      memcpy(&src[12], &vals[(i+17)%22], 4);
3427      do_ROUNDPS_011(False/*reg*/, &src, &dst);
3428      printf("r roundps_011  ");
3429      showV128(&src);
3430      printf(" ");
3431      showV128(&dst);
3432      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3433      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3434      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3435      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3436      printf("\n");
3437
3438      randV128(&src);
3439      randV128(&dst);
3440      memcpy(&src[0], &vals[i], 4);
3441      memcpy(&src[4], &vals[(i+5)%22], 4);
3442      memcpy(&src[8], &vals[(i+11)%22], 4);
3443      memcpy(&src[12], &vals[(i+17)%22], 4);
3444      do_ROUNDPS_011(True/*mem*/, &src, &dst);
3445      printf("m roundps_011  ");
3446      showV128(&src);
3447      printf(" ");
3448      showV128(&dst);
3449      printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3450      printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3451      printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3452      printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3453      printf("\n");
3454   }
3455}
3456
3457void test_ROUNDPS_w_mxcsr_rounding ( void )
3458{
3459   UInt rm;
3460   float vals[22];
3461   Int i = 0;
3462   vals[i++] = 0.0;
3463   vals[i++] = -0.0;
3464   vals[i++] = mkPosInf();
3465   vals[i++] = mkNegInf();
3466   vals[i++] = mkPosNan();
3467   vals[i++] = mkNegNan();
3468   vals[i++] = -1.3;
3469   vals[i++] = -1.1;
3470   vals[i++] = -0.9;
3471   vals[i++] = -0.7;
3472   vals[i++] = -0.50001;
3473   vals[i++] = -0.49999;
3474   vals[i++] = -0.3;
3475   vals[i++] = -0.1;
3476   vals[i++] = 0.1;
3477   vals[i++] = 0.3;
3478   vals[i++] = 0.49999;
3479   vals[i++] = 0.50001;
3480   vals[i++] = 0.7;
3481   vals[i++] = 0.9;
3482   vals[i++] = 1.1;
3483   vals[i++] = 1.3;
3484   assert(i == 22);
3485
3486   rm = get_sse_roundingmode();
3487   assert(rm == 0); // 0 == RN == default
3488
3489   for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3490      V128 src, dst;
3491
3492      for (rm = 0; rm <= 3; rm++) {
3493         set_sse_roundingmode(rm);
3494
3495         randV128(&src);
3496         randV128(&dst);
3497         memcpy(&src[0], &vals[i], 4);
3498         memcpy(&src[4], &vals[(i+5)%22], 4);
3499         memcpy(&src[8], &vals[(i+11)%22], 4);
3500         memcpy(&src[12], &vals[(i+17)%22], 4);
3501         do_ROUNDPS_1XX(False/*reg*/, &src, &dst);
3502         printf("r (rm=%u) roundps_1XX  ", rm);
3503         showV128(&src);
3504         printf(" ");
3505         showV128(&dst);
3506         printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3507         printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3508         printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3509         printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3510         printf("\n");
3511
3512         randV128(&src);
3513         randV128(&dst);
3514         memcpy(&src[0], &vals[i], 4);
3515         memcpy(&src[4], &vals[(i+5)%22], 4);
3516         memcpy(&src[8], &vals[(i+11)%22], 4);
3517         memcpy(&src[12], &vals[(i+17)%22], 4);
3518         do_ROUNDPS_1XX(True/*mem*/, &src, &dst);
3519         printf("m (rm=%u) roundps_1XX  ", rm);
3520         showV128(&src);
3521         printf(" ");
3522         showV128(&dst);
3523         printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3524         printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3525         printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3526         printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3527         printf("\n");
3528      }
3529   }
3530
3531   rm = get_sse_roundingmode();
3532   assert(rm == 3);
3533   set_sse_roundingmode(0);
3534   rm = get_sse_roundingmode();
3535   assert(rm == 0); // 0 == RN == default
3536}
3537
3538/* ------------ PTEST ------------ */
3539
3540void test_PTEST ( void )
3541{
3542   const Int ntests = 8;
3543   V128 spec[ntests];
3544   do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL );
3545   do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL );
3546   do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL );
3547   do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL );
3548   do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL );
3549   do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL );
3550   do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL );
3551   do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL );
3552   V128 block[2];
3553   Int i, j;
3554   ULong flags;
3555   for (i = 0; i < ntests; i++) {
3556      for (j = 0; j < ntests; j++) {
3557         memcpy(&block[0], &spec[i], 16);
3558         memcpy(&block[1], &spec[j], 16);
3559         __asm__ __volatile__(
3560            "subq $256, %%rsp"        "\n\t"
3561            "movupd 0(%1), %%xmm2"    "\n\t"
3562            "ptest 16(%1), %%xmm2"    "\n\t"
3563            "pushfq"                  "\n\t"
3564            "popq %0"                 "\n\t"
3565            "addq $256, %%rsp"        "\n\t"
3566            : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) :
3567            "xmm2", "memory", "cc"
3568         );
3569         printf("r   ptest ");
3570         showV128(&block[0]);
3571         printf(" ");
3572         showV128(&block[1]);
3573         printf(" -> eflags %04x\n", (UInt)flags & 0x8D5);
3574      }
3575   }
3576}
3577
3578/* ------------ PBLENDVB ------------ */
3579
3580void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
3581{
3582   if (mem) {
3583      __asm__ __volatile__(
3584         "movupd   (%2), %%xmm0"         "\n\t"
3585         "movupd   (%1), %%xmm11"        "\n\t"
3586         "pblendvb (%0), %%xmm11"        "\n\t"
3587         "movupd   %%xmm11, (%1)"        "\n"
3588         : /*OUT*/
3589         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3590         : /*TRASH*/ "xmm11","xmm0"
3591      );
3592   } else {
3593      __asm__ __volatile__(
3594         "movupd   (%2), %%xmm0"         "\n\t"
3595         "movupd   (%1), %%xmm11"        "\n\t"
3596         "movupd   (%0), %%xmm2"         "\n\t"
3597         "pblendvb %%xmm2, %%xmm11"      "\n\t"
3598         "movupd   %%xmm11, (%1)"        "\n"
3599         : /*OUT*/
3600         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3601         : /*TRASH*/ "xmm11","xmm2","xmm0"
3602      );
3603   }
3604}
3605
3606void test_PBLENDVB ( void )
3607{
3608   V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3609   Int i;
3610   for (i = 0; i < 10; i++) {
3611      randV128(&t_xmm0);
3612      randV128(&t_src);
3613      randV128(&t_dst);
3614
3615      memcpy(&xmm0, &t_xmm0, 16);
3616      memcpy(&src, &t_src, 16);
3617      memcpy(&dst, &t_dst, 16);
3618      do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst);
3619      printf("r pblendvb  ");
3620      showV128(&t_xmm0);
3621      printf(" ");
3622      showV128(&t_src);
3623      printf(" ");
3624      showV128(&t_dst);
3625      printf(" -> ");
3626      showV128(&dst);
3627      printf("\n");
3628
3629      memcpy(&xmm0, &t_xmm0, 16);
3630      memcpy(&src, &t_src, 16);
3631      memcpy(&dst, &t_dst, 16);
3632      do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst);
3633      printf("m pblendvb  ");
3634      showV128(&t_xmm0);
3635      printf(" ");
3636      showV128(&t_src);
3637      printf(" ");
3638      showV128(&t_dst);
3639      printf(" -> ");
3640      showV128(&dst);
3641      printf("\n");
3642   }
3643}
3644
3645/* ------------ BLENDVPD ------------ */
3646
3647void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
3648{
3649   if (mem) {
3650      __asm__ __volatile__(
3651         "movupd   (%2), %%xmm0"         "\n\t"
3652         "movupd   (%1), %%xmm11"        "\n\t"
3653         "blendvpd (%0), %%xmm11"        "\n\t"
3654         "movupd   %%xmm11, (%1)"        "\n"
3655         : /*OUT*/
3656         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3657         : /*TRASH*/ "xmm11","xmm0"
3658      );
3659   } else {
3660      __asm__ __volatile__(
3661         "movupd   (%2), %%xmm0"         "\n\t"
3662         "movupd   (%1), %%xmm11"        "\n\t"
3663         "movupd   (%0), %%xmm2"         "\n\t"
3664         "blendvpd %%xmm2, %%xmm11"      "\n\t"
3665         "movupd   %%xmm11, (%1)"        "\n"
3666         : /*OUT*/
3667         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3668         : /*TRASH*/ "xmm11","xmm2","xmm0"
3669      );
3670   }
3671}
3672
3673void test_BLENDVPD ( void )
3674{
3675   V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3676   Int i;
3677   for (i = 0; i < 10; i++) {
3678      randV128(&t_xmm0);
3679      randV128(&t_src);
3680      randV128(&t_dst);
3681
3682      memcpy(&xmm0, &t_xmm0, 16);
3683      memcpy(&src, &t_src, 16);
3684      memcpy(&dst, &t_dst, 16);
3685      do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst);
3686      printf("r blendvpd  ");
3687      showV128(&t_xmm0);
3688      printf(" ");
3689      showV128(&t_src);
3690      printf(" ");
3691      showV128(&t_dst);
3692      printf(" -> ");
3693      showV128(&dst);
3694      printf("\n");
3695
3696      memcpy(&xmm0, &t_xmm0, 16);
3697      memcpy(&src, &t_src, 16);
3698      memcpy(&dst, &t_dst, 16);
3699      do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst);
3700      printf("m blendvpd  ");
3701      showV128(&t_xmm0);
3702      printf(" ");
3703      showV128(&t_src);
3704      printf(" ");
3705      showV128(&t_dst);
3706      printf(" -> ");
3707      showV128(&dst);
3708      printf("\n");
3709   }
3710}
3711
3712/* ------------ BLENDVPS ------------ */
3713
3714void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
3715{
3716   if (mem) {
3717      __asm__ __volatile__(
3718         "movupd   (%2), %%xmm0"         "\n\t"
3719         "movupd   (%1), %%xmm11"        "\n\t"
3720         "blendvps (%0), %%xmm11"        "\n\t"
3721         "movupd   %%xmm11, (%1)"        "\n"
3722         : /*OUT*/
3723         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3724         : /*TRASH*/ "xmm11","xmm0"
3725      );
3726   } else {
3727      __asm__ __volatile__(
3728         "movupd   (%2), %%xmm0"         "\n\t"
3729         "movupd   (%1), %%xmm11"        "\n\t"
3730         "movupd   (%0), %%xmm2"         "\n\t"
3731         "blendvps %%xmm2, %%xmm11"      "\n\t"
3732         "movupd   %%xmm11, (%1)"        "\n"
3733         : /*OUT*/
3734         : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3735         : /*TRASH*/ "xmm11","xmm2","xmm0"
3736      );
3737   }
3738}
3739
3740void test_BLENDVPS ( void )
3741{
3742   V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3743   Int i;
3744   for (i = 0; i < 10; i++) {
3745      randV128(&t_xmm0);
3746      randV128(&t_src);
3747      randV128(&t_dst);
3748
3749      memcpy(&xmm0, &t_xmm0, 16);
3750      memcpy(&src, &t_src, 16);
3751      memcpy(&dst, &t_dst, 16);
3752      do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst);
3753      printf("r blendvps  ");
3754      showV128(&t_xmm0);
3755      printf(" ");
3756      showV128(&t_src);
3757      printf(" ");
3758      showV128(&t_dst);
3759      printf(" -> ");
3760      showV128(&dst);
3761      printf("\n");
3762
3763      memcpy(&xmm0, &t_xmm0, 16);
3764      memcpy(&src, &t_src, 16);
3765      memcpy(&dst, &t_dst, 16);
3766      do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst);
3767      printf("m blendvps  ");
3768      showV128(&t_xmm0);
3769      printf(" ");
3770      showV128(&t_src);
3771      printf(" ");
3772      showV128(&t_dst);
3773      printf(" -> ");
3774      showV128(&dst);
3775      printf("\n");
3776   }
3777}
3778
3779void test_MOVNTDQA ( void )
3780{
3781   V128 src, dst;
3782   Int i;
3783   for (i = 0; i < 10; i++) {
3784      randV128(&src);
3785      /* make sure the load actually happens */
3786      randV128(&dst);
3787      DO_m_r("movntdqa", src, dst);
3788   }
3789}
3790
3791/* ------------ main ------------ */
3792
/* Test driver: runs every instruction test in a fixed order and
   returns 0.  The command-line arguments are not used.  Changing the
   "#if 1" below to "#if 0" narrows the run to the single test in the
   #else arm. */
int main ( int argc, char** argv )
{
   (void)argc;
   (void)argv;

#if 1
   /* ------ SSE 4.1 ------ */

   /* blends */
   test_BLENDPD();                        /* done Apr.01.2010 */
   test_BLENDPS();                        /* done Apr.02.2010 */
   test_PBLENDW();
   test_PBLENDVB();
   test_BLENDVPD();
   test_BLENDVPS();

   /* dot products */
   test_DPPD();                           /* done Apr.08.2010 */
   test_DPPS();                           /* done Apr.09.2010 */

   /* extract, insert, compare */
   test_EXTRACTPS();
   test_INSERTPS();                       /* done Apr.01.2010 */
   test_PCMPEQQ();
   test_PEXTRB();                         /* done Apr.15.2010 */
   test_PEXTRD();                         /* done Apr.14.2010 */
   test_PEXTRQ();                         /* done Apr.14.2010 */
   test_PEXTRW();                         /* done Apr.14.2010 */
   test_PINSRQ();                         /* done Apr.16.2010 */
   test_PINSRD();                         /* todo */
   test_PINSRW();                         /* Umm, this is SSE2, not SSE4.  Right? */
   test_PINSRB();                         /* todo */

   /* packed min / max */
   test_PMAXSB();
   test_PMAXSD();                         /* done Apr.09.2010 */
   test_PMAXUD();                         /* done Apr.16.2010 */
   test_PMAXUW();
   test_PMINSB();
   test_PMINSD();                         /* done Apr.09.2010 */
   test_PMINUD();
   test_PMINUW();

   /* sign-extending moves */
   test_PMOVSXBW();                       /* done Apr.02.2010 */
   test_PMOVSXBD();                       /* done Mar.30.2010 */
   test_PMOVSXBQ();                       /* done Mar.30.2010 */
   test_PMOVSXWD();                       /* done Mar.31.2010 */
   test_PMOVSXWQ();                       /* done Mar.31.2010 */
   test_PMOVSXDQ();                       /* done Mar.31.2010 */

   /* zero-extending moves */
   test_PMOVZXBW();                       /* done Mar.28.2010 */
   test_PMOVZXBD();                       /* done Mar.29.2010 */
   test_PMOVZXBQ();                       /* done Mar.29.2010 */
   test_PMOVZXWD();                       /* done Mar.28.2010 */
   test_PMOVZXWQ();                       /* done Mar.29.2010 */
   test_PMOVZXDQ();                       /* done Mar.29.2010 */

   /* population count, multiplies, ptest */
   test_POPCNTW();
   test_POPCNTL();
   test_POPCNTQ();
   test_PMULDQ();
   test_PMULLD();
   test_PTEST();

   /* rounding: mode from the immediate, then mode from MXCSR */
   test_ROUNDSD_w_immediate_rounding();
   test_ROUNDSS_w_immediate_rounding();
   test_ROUNDPD_w_immediate_rounding();
   test_ROUNDPS_w_immediate_rounding();
   test_ROUNDSD_w_mxcsr_rounding();
   test_ROUNDSS_w_mxcsr_rounding();
   test_ROUNDPD_w_mxcsr_rounding();
   test_ROUNDPS_w_mxcsr_rounding();

   /* ------ SSE 4.2 ------ */
   test_PCMPGTQ();
   /* CRC32B,Q: no test is called for these here */
   test_PACKUSDW();
   test_PHMINPOSUW();
   test_MPSADBW();
   test_MOVNTDQA();                       /* not sure whether this is 4.1 or 4.2 */
#else
   /* Single-test run. */
   test_MPSADBW();
#endif

   return 0;
}
3863
3864