pcmpstr64w.c revision eb0bae136f4eeaaf29761dddb148b118fb824632
1
2/* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
3   pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
4   aspect. */
5
6#include <string.h>
7#include <stdio.h>
8#include <assert.h>
9
/* Short names for the basic integer types used throughout this file. */
typedef  unsigned int   UInt;
typedef  signed int     Int;
typedef  unsigned char  UChar;
typedef  unsigned short UShort;
typedef  unsigned long long int ULong;
/* Booleans are carried in a single unsigned char. */
typedef  UChar          Bool;
#define False ((Bool)0)
#define True  ((Bool)1)
18
19//typedef  unsigned char  V128[16];
/* A vector operand, viewable as 16 bytes, 8 halfwords or 4 words.
   uInt and w32 are two names for the same 4-word view. */
typedef
   union {
      UChar  uChar[16];
      UShort uShort[8];
      UInt   uInt[4];
      UInt   w32[4];
   }
   V128;
28
/* Bit positions of the O, S, Z, A, C and P flags within a flags word.
   Taken together the six bits form the 0x8D5 mask that the h_*
   functions apply to the value obtained via pushfq. */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_C   0
#define SHIFT_P   2

/* Single-bit masks corresponding to the positions above. */
#define MASK_O    (1ULL << SHIFT_O)
#define MASK_S    (1ULL << SHIFT_S)
#define MASK_Z    (1ULL << SHIFT_Z)
#define MASK_A    (1ULL << SHIFT_A)
#define MASK_C    (1ULL << SHIFT_C)
#define MASK_P    (1ULL << SHIFT_P)
42
43
44UInt clz32 ( UInt x )
45{
46   Int y, m, n;
47   y = -(x >> 16);
48   m = (y >> 16) & 16;
49   n = 16 - m;
50   x = x >> m;
51   y = x - 0x100;
52   m = (y >> 16) & 8;
53   n = n + m;
54   x = x << m;
55   y = x - 0x1000;
56   m = (y >> 16) & 4;
57   n = n + m;
58   x = x << m;
59   y = x - 0x4000;
60   m = (y >> 16) & 2;
61   n = n + m;
62   x = x << m;
63   y = x >> 14;
64   m = y & ~(y >> 1);
65   return n + 2 - m;
66}
67
68UInt ctz32 ( UInt x )
69{
70   return 32 - clz32((~x) & (x-1));
71}
72
73void expand ( V128* dst, char* summary )
74{
75   Int i;
76   assert( strlen(summary) == 16 );
77   for (i = 0; i < 16; i++) {
78      UChar xx = 0;
79      UChar x = summary[15-i];
80      if      (x >= '0' && x <= '9') { xx = x - '0'; }
81      else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
82      else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
83      else assert(0);
84
85      assert(xx < 16);
86      xx = (xx << 4) | xx;
87      assert(xx < 256);
88      dst->uChar[i] = xx;
89   }
90}
91
92void try_istri ( char* which,
93                 UInt(*h_fn)(V128*,V128*),
94                 UInt(*s_fn)(V128*,V128*),
95                 char* summL, char* summR )
96{
97   assert(strlen(which) == 2);
98   V128 argL, argR;
99   expand(&argL, summL);
100   expand(&argR, summR);
101   UInt h_res = h_fn(&argL, &argR);
102   UInt s_res = s_fn(&argL, &argR);
103   printf("istri %s  %s %s -> %08x %08x %s\n",
104          which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
105}
106
107UInt zmask_from_V128 ( V128* arg )
108{
109   UInt i, res = 0;
110   for (i = 0; i < 8; i++) {
111      res |=  ((arg->uShort[i] == 0) ? 1 : 0) << i;
112   }
113   return res;
114}
115
116//////////////////////////////////////////////////////////
117//                                                      //
118//                       GENERAL                        //
119//                                                      //
120//////////////////////////////////////////////////////////
121
122
123/* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
124   basically), generate an I- or M-format output value, also the new
125   OSZACP flags.  */
126static
127void PCMPxSTRx_WRK_gen_output_fmt_I_wide ( /*OUT*/V128* resV,
128					   /*OUT*/UInt* resOSZACP,
129					   UInt intRes1,
130					   UInt zmaskL, UInt zmaskR,
131					   UInt validL,
132					   UInt pol, UInt idx )
133{
134   assert((pol >> 2) == 0);
135   assert((idx >> 1) == 0);
136
137   UInt intRes2 = 0;
138   switch (pol) {
139      case 0: intRes2 = intRes1;          break; // pol +
140      case 1: intRes2 = ~intRes1;         break; // pol -
141      case 2: intRes2 = intRes1;          break; // pol m+
142      case 3: intRes2 = intRes1 ^ validL; break; // pol m-
143   }
144   intRes2 &= 0xFF;
145
146   // generate I-format output (an index in ECX)
147   // generate ecx value
148   UInt newECX = 0;
149   if (idx) {
150     // index of ms-1-bit
151     newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2));
152   } else {
153     // index of ls-1-bit
154     newECX = intRes2 == 0 ? 8 : ctz32(intRes2);
155   }
156
157   resV->w32[0] = newECX;
158   resV->w32[1] = 0;
159   resV->w32[2] = 0;
160   resV->w32[3] = 0;
161
162   // generate new flags, common to all ISTRI and ISTRM cases
163   *resOSZACP    // A, P are zero
164     = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
165     | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
166     | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
167     | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
168}
169
170/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
171   variants on 16-bit characters.
172
173   For xSTRI variants, the new ECX value is placed in the 32 bits
174   pointed to by *resV, and the top 96 bits are zeroed.  For xSTRM
175   variants, the result is a 128 bit value and is placed at *resV in
176   the obvious way.
177
178   For all variants, the new OSZACP value is placed at *resOSZACP.
179
180   argLV and argRV are the vector args.  The caller must prepare a
181   8-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
   must be 1 for each zero 16-bit element of the respective arg.  For ESTRx
183   variants this is derived from the explicit length indication, and
184   must be 0 in all places except at the bit index corresponding to
185   the valid length (0 .. 8).  If the valid length is 8 then the
186   mask must be all zeroes.  In all cases, bits 31:8 must be zero.
187
   imm8 is the original immediate from the instruction.  isxSTRM
   indicates whether this is a xSTRM or xSTRI variant, which controls
   how much of *resV is written.
191
   If the given imm8 case can be handled, the return value is True.
   If not, False is returned, and neither *resV nor *resOSZACP are
   altered.
195*/
196
197Bool pcmpXstrX_WRK_wide ( /*OUT*/V128* resV,
198			  /*OUT*/UInt* resOSZACP,
199			  V128* argLV,  V128* argRV,
200			  UInt zmaskL, UInt zmaskR,
201			  UInt imm8,   Bool isxSTRM )
202{
203   assert(imm8 < 0x80);
204   assert((zmaskL >> 8) == 0);
205   assert((zmaskR >> 8) == 0);
206
207   /* Explicitly reject any imm8 values that haven't been validated,
208      even if they would probably work.  Life is too short to have
209      unvalidated cases in the code base. */
210   switch (imm8) {
211      case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
212      case 0x13:            case 0x1B:
213                            case 0x39: case 0x3B:
214                 case 0x45:            case 0x4B:
215         break;
216      default:
217         return False;
218   }
219
220   UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
221   UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
222   UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
223   UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
224
225   /*----------------------------------------*/
226   /*-- strcmp on wide data                --*/
227   /*----------------------------------------*/
228
229   if (agg == 2/*equal each, aka strcmp*/
230       && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
231      Int    i;
232      UShort* argL = (UShort*)argLV;
233      UShort* argR = (UShort*)argRV;
234      UInt boolResII = 0;
235      for (i = 7; i >= 0; i--) {
236         UShort cL  = argL[i];
237         UShort cR  = argR[i];
238         boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
239      }
240      UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
241      UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
242
243      // do invalidation, common to all equal-each cases
244      UInt intRes1
245         = (boolResII & validL & validR)  // if both valid, use cmpres
246           | (~ (validL | validR));       // if both invalid, force 1
247                                          // else force 0
248      intRes1 &= 0xFF;
249
250      // generate I-format output
251      PCMPxSTRx_WRK_gen_output_fmt_I_wide(
252         resV, resOSZACP,
253         intRes1, zmaskL, zmaskR, validL, pol, idx
254      );
255
256      return True;
257   }
258
259   /*----------------------------------------*/
260   /*-- set membership on wide data        --*/
261   /*----------------------------------------*/
262
263   if (agg == 0/*equal any, aka find chars in a set*/
264       && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
265      /* argL: the string,  argR: charset */
266      UInt   si, ci;
267      UShort* argL    = (UShort*)argLV;
268      UShort* argR    = (UShort*)argRV;
269      UInt   boolRes = 0;
270      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
271      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
272
273      for (si = 0; si < 8; si++) {
274         if ((validL & (1 << si)) == 0)
275            // run off the end of the string.
276            break;
277         UInt m = 0;
278         for (ci = 0; ci < 8; ci++) {
279            if ((validR & (1 << ci)) == 0) break;
280            if (argR[ci] == argL[si]) { m = 1; break; }
281         }
282         boolRes |= (m << si);
283      }
284
285      // boolRes is "pre-invalidated"
286      UInt intRes1 = boolRes & 0xFF;
287
288      // generate I-format output
289      PCMPxSTRx_WRK_gen_output_fmt_I_wide(
290         resV, resOSZACP,
291         intRes1, zmaskL, zmaskR, validL, pol, idx
292      );
293
294      return True;
295   }
296
297   /*----------------------------------------*/
298   /*-- substring search on wide data      --*/
299   /*----------------------------------------*/
300
301   if (agg == 3/*equal ordered, aka substring search*/
302       && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
303
304      /* argL: haystack,  argR: needle */
305      UInt   ni, hi;
306      UShort* argL    = (UShort*)argLV;
307      UShort* argR    = (UShort*)argRV;
308      UInt   boolRes = 0;
309      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
310      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
311      for (hi = 0; hi < 8; hi++) {
312         UInt m = 1;
313         for (ni = 0; ni < 8; ni++) {
314            if ((validR & (1 << ni)) == 0) break;
315            UInt i = ni + hi;
316            if (i >= 8) break;
317            if (argL[i] != argR[ni]) { m = 0; break; }
318         }
319         boolRes |= (m << hi);
320         if ((validL & (1 << hi)) == 0)
321            // run off the end of the haystack
322            break;
323      }
324
325      // boolRes is "pre-invalidated"
326      UInt intRes1 = boolRes & 0xFF;
327
328      // generate I-format output
329      PCMPxSTRx_WRK_gen_output_fmt_I_wide(
330         resV, resOSZACP,
331         intRes1, zmaskL, zmaskR, validL, pol, idx
332      );
333
334      return True;
335   }
336
337   /*----------------------------------------*/
338   /*-- ranges, unsigned wide data         --*/
339   /*----------------------------------------*/
340
341   if (agg == 1/*ranges*/
342       && fmt == 1/*uw*/) {
343
344      /* argL: string,  argR: range-pairs */
345      UInt   ri, si;
346      UShort* argL    = (UShort*)argLV;
347      UShort* argR    = (UShort*)argRV;
348      UInt   boolRes = 0;
349      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
350      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
351      for (si = 0; si < 8; si++) {
352         if ((validL & (1 << si)) == 0)
353            // run off the end of the string
354            break;
355         UInt m = 0;
356         for (ri = 0; ri < 8; ri += 2) {
357            if ((validR & (3 << ri)) != (3 << ri)) break;
358            if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
359               m = 1; break;
360            }
361         }
362         boolRes |= (m << si);
363      }
364
365      // boolRes is "pre-invalidated"
366      UInt intRes1 = boolRes & 0xFF;
367
368      // generate I-format output
369      PCMPxSTRx_WRK_gen_output_fmt_I_wide(
370         resV, resOSZACP,
371         intRes1, zmaskL, zmaskR, validL, pol, idx
372      );
373
374      return True;
375   }
376
377   return False;
378}
379
380//////////////////////////////////////////////////////////
381//                                                      //
382//                       ISTRI_4B                       //
383//                                                      //
384//////////////////////////////////////////////////////////
385
386UInt h_pcmpistri_4B ( V128* argL, V128* argR )
387{
388   V128 block[2];
389   memcpy(&block[0], argL, sizeof(V128));
390   memcpy(&block[1], argR, sizeof(V128));
391   ULong res, flags;
392   __asm__ __volatile__(
393      "subq      $1024,  %%rsp"             "\n\t"
394      "movdqu    0(%2),  %%xmm2"            "\n\t"
395      "movdqu    16(%2), %%xmm11"           "\n\t"
396      "pcmpistri $0x4B,  %%xmm2, %%xmm11"   "\n\t"
397      "pushfq"                              "\n\t"
398      "popq      %%rdx"                     "\n\t"
399      "movq      %%rcx,  %0"                "\n\t"
400      "movq      %%rdx,  %1"                "\n\t"
401      "addq      $1024,  %%rsp"             "\n\t"
402      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
403      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
404   );
405   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
406}
407
408UInt s_pcmpistri_4B ( V128* argLU, V128* argRU )
409{
410   V128 resV;
411   UInt resOSZACP, resECX;
412   Bool ok
413      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
414			    zmask_from_V128(argLU),
415			    zmask_from_V128(argRU),
416			    0x4B, False/*!isSTRM*/
417        );
418   assert(ok);
419   resECX = resV.uInt[0];
420   return (resOSZACP << 16) | resECX;
421}
422
423void istri_4B ( void )
424{
425   char* wot = "4B";
426   UInt(*h)(V128*,V128*) = h_pcmpistri_4B;
427   UInt(*s)(V128*,V128*) = s_pcmpistri_4B;
428
429   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
430
431   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
432   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
433   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
434   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
435
436   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
437   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
438   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
439
440   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
441   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
442   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
443   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
444
445   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
446   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
447   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
448
449   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
450
451   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
452   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
453   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
454
455   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
456   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
457   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
458
459   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
460   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
461   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
462
463   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
464   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
465   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
466
467   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
468   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
469}
470
471//////////////////////////////////////////////////////////
472//                                                      //
473//                       ISTRI_3B                       //
474//                                                      //
475//////////////////////////////////////////////////////////
476
477UInt h_pcmpistri_3B ( V128* argL, V128* argR )
478{
479   V128 block[2];
480   memcpy(&block[0], argL, sizeof(V128));
481   memcpy(&block[1], argR, sizeof(V128));
482   ULong res, flags;
483   __asm__ __volatile__(
484      "subq      $1024,  %%rsp"             "\n\t"
485      "movdqu    0(%2),  %%xmm2"            "\n\t"
486      "movdqu    16(%2), %%xmm11"           "\n\t"
487      "pcmpistri $0x3B,  %%xmm2, %%xmm11"   "\n\t"
488      "pushfq"                              "\n\t"
489      "popq      %%rdx"                     "\n\t"
490      "movq      %%rcx,  %0"                "\n\t"
491      "movq      %%rdx,  %1"                "\n\t"
492      "addq      $1024,  %%rsp"             "\n\t"
493      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
494      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
495   );
496   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
497}
498
499UInt s_pcmpistri_3B ( V128* argLU, V128* argRU )
500{
501   V128 resV;
502   UInt resOSZACP, resECX;
503   Bool ok
504      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
505			    zmask_from_V128(argLU),
506			    zmask_from_V128(argRU),
507			    0x3B, False/*!isSTRM*/
508        );
509   assert(ok);
510   resECX = resV.uInt[0];
511   return (resOSZACP << 16) | resECX;
512}
513
514void istri_3B ( void )
515{
516   char* wot = "3B";
517   UInt(*h)(V128*,V128*) = h_pcmpistri_3B;
518   UInt(*s)(V128*,V128*) = s_pcmpistri_3B;
519
520   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
521
522   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
523   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
524   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
525   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
526
527   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
528   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
529   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
530
531   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
532   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
533   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
534   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
535
536   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
537   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
538   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
539
540   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
541
542   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
543   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
544   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
545
546   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
547   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
548   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
549
550   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
551   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
552   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
553
554   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
555   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
556   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
557
558   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
559   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
560}
561
562
563
564//////////////////////////////////////////////////////////
565//                                                      //
566//                       ISTRI_0D                       //
567//                                                      //
568//////////////////////////////////////////////////////////
569
570__attribute__((noinline))
571UInt h_pcmpistri_0D ( V128* argL, V128* argR )
572{
573   V128 block[2];
574   memcpy(&block[0], argL, sizeof(V128));
575   memcpy(&block[1], argR, sizeof(V128));
576   ULong res = 0, flags = 0;
577   __asm__ __volatile__(
578      "movdqu    0(%2),  %%xmm2"            "\n\t"
579      "movdqu    16(%2), %%xmm11"           "\n\t"
580      "pcmpistri $0x0D,  %%xmm2, %%xmm11"   "\n\t"
581      //"pcmpistrm $0x0D,  %%xmm2, %%xmm11"   "\n\t"
582      //"movd %%xmm0, %%ecx" "\n\t"
583      "pushfq"                              "\n\t"
584      "popq      %%rdx"                     "\n\t"
585      "movq      %%rcx,  %0"                "\n\t"
586      "movq      %%rdx,  %1"                "\n\t"
587      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
588      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
589   );
590   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
591}
592
593UInt s_pcmpistri_0D ( V128* argLU, V128* argRU )
594{
595   V128 resV;
596   UInt resOSZACP, resECX;
597   Bool ok
598      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
599			    zmask_from_V128(argLU),
600			    zmask_from_V128(argRU),
601			    0x0D, False/*!isSTRM*/
602        );
603   assert(ok);
604   resECX = resV.uInt[0];
605   return (resOSZACP << 16) | resECX;
606}
607
608void istri_0D ( void )
609{
610   char* wot = "0D";
611   UInt(*h)(V128*,V128*) = h_pcmpistri_0D;
612   UInt(*s)(V128*,V128*) = s_pcmpistri_0D;
613
614   try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
615
616   try_istri(wot,h,s, "11111111abcdef11", "00abcdef00abcdef");
617
618   try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
619   try_istri(wot,h,s, "1111111111abcdef", "0000000000abcdef");
620   try_istri(wot,h,s, "111111111111abcd", "0000000000abcdef");
621
622   try_istri(wot,h,s, "1111abcd11abcd11", "000000000000abcd");
623
624   try_istri(wot,h,s, "11abcd1111abcd11", "000000000000abcd");
625   try_istri(wot,h,s, "abcd111111abcd11", "000000000000abcd");
626   try_istri(wot,h,s, "cd11111111abcd11", "000000000000abcd");
627
628   try_istri(wot,h,s, "01abcd11abcd1111", "000000000000abcd");
629   try_istri(wot,h,s, "00abcd11abcd1111", "000000000000abcd");
630   try_istri(wot,h,s, "0000cd11abcd1111", "000000000000abcd");
631
632   try_istri(wot,h,s, "00abcd1100abcd11", "000000000000abcd");
633   try_istri(wot,h,s, "00abcd110000cd11", "000000000000abcd");
634
635   try_istri(wot,h,s, "1111111111111234", "0000000000000000");
636   try_istri(wot,h,s, "1111111111111234", "0000000000000011");
637   try_istri(wot,h,s, "1111111111111234", "0000000000001111");
638
639   try_istri(wot,h,s, "1111111111111234", "1111111111111234");
640   try_istri(wot,h,s, "0a11111111111111", "000000000000000a");
641   try_istri(wot,h,s, "0b11111111111111", "000000000000000a");
642
643   try_istri(wot,h,s, "b111111111111111", "0000000000000000");
644   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
645   try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
646   try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
647}
648
649
650//////////////////////////////////////////////////////////
651//                                                      //
652//                       ISTRI_09                       //
653//                                                      //
654//////////////////////////////////////////////////////////
655
656UInt h_pcmpistri_09 ( V128* argL, V128* argR )
657{
658   V128 block[2];
659   memcpy(&block[0], argL, sizeof(V128));
660   memcpy(&block[1], argR, sizeof(V128));
661   ULong res, flags;
662   __asm__ __volatile__(
663      "subq      $1024,  %%rsp"             "\n\t"
664      "movdqu    0(%2),  %%xmm2"            "\n\t"
665      "movdqu    16(%2), %%xmm11"           "\n\t"
666      "pcmpistri $0x09,  %%xmm2, %%xmm11"   "\n\t"
667      "pushfq"                              "\n\t"
668      "popq      %%rdx"                     "\n\t"
669      "movq      %%rcx,  %0"                "\n\t"
670      "movq      %%rdx,  %1"                "\n\t"
671      "addq      $1024,  %%rsp"             "\n\t"
672      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
673      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
674   );
675   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
676}
677
678UInt s_pcmpistri_09 ( V128* argLU, V128* argRU )
679{
680   V128 resV;
681   UInt resOSZACP, resECX;
682   Bool ok
683      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
684			    zmask_from_V128(argLU),
685			    zmask_from_V128(argRU),
686			    0x09, False/*!isSTRM*/
687        );
688   assert(ok);
689   resECX = resV.uInt[0];
690   return (resOSZACP << 16) | resECX;
691}
692
693void istri_09 ( void )
694{
695   char* wot = "09";
696   UInt(*h)(V128*,V128*) = h_pcmpistri_09;
697   UInt(*s)(V128*,V128*) = s_pcmpistri_09;
698
699   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
700
701   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
702   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
703   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
704   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
705
706   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
707   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
708   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
709
710   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
711   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
712   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
713   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
714
715   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
716   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
717   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
718
719   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
720
721   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
722   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
723   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
724
725   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
726   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
727   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
728
729   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
730   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
731   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
732
733   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
734   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
735   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
736
737   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
738   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
739}
740
741
742
743//////////////////////////////////////////////////////////
744//                                                      //
745//                       ISTRI_1B                       //
746//                                                      //
747//////////////////////////////////////////////////////////
748
749UInt h_pcmpistri_1B ( V128* argL, V128* argR )
750{
751   V128 block[2];
752   memcpy(&block[0], argL, sizeof(V128));
753   memcpy(&block[1], argR, sizeof(V128));
754   ULong res, flags;
755   __asm__ __volatile__(
756      "subq      $1024,  %%rsp"             "\n\t"
757      "movdqu    0(%2),  %%xmm2"            "\n\t"
758      "movdqu    16(%2), %%xmm11"           "\n\t"
759      "pcmpistri $0x1B,  %%xmm2, %%xmm11"   "\n\t"
760      "pushfq"                              "\n\t"
761      "popq      %%rdx"                     "\n\t"
762      "movq      %%rcx,  %0"                "\n\t"
763      "movq      %%rdx,  %1"                "\n\t"
764      "addq      $1024,  %%rsp"             "\n\t"
765      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
766      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
767   );
768   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
769}
770
771UInt s_pcmpistri_1B ( V128* argLU, V128* argRU )
772{
773   V128 resV;
774   UInt resOSZACP, resECX;
775   Bool ok
776      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
777			    zmask_from_V128(argLU),
778			    zmask_from_V128(argRU),
779			    0x1B, False/*!isSTRM*/
780        );
781   assert(ok);
782   resECX = resV.uInt[0];
783   return (resOSZACP << 16) | resECX;
784}
785
786void istri_1B ( void )
787{
788   char* wot = "1B";
789   UInt(*h)(V128*,V128*) = h_pcmpistri_1B;
790   UInt(*s)(V128*,V128*) = s_pcmpistri_1B;
791
792   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
793
794   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
795   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
796   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
797   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
798
799   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
800   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
801   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
802
803   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
804   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
805   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
806   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
807
808   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
809   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
810   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
811
812   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
813
814   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
815   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
816   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
817
818   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
819   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
820   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
821
822   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
823   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
824   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
825
826   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
827   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
828   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
829
830   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
831   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
832}
833
834
835
836//////////////////////////////////////////////////////////
837//                                                      //
838//                       ISTRI_03                       //
839//                                                      //
840//////////////////////////////////////////////////////////
841
842UInt h_pcmpistri_03 ( V128* argL, V128* argR )
843{
844   V128 block[2];
845   memcpy(&block[0], argL, sizeof(V128));
846   memcpy(&block[1], argR, sizeof(V128));
847   ULong res, flags;
848   __asm__ __volatile__(
849      "subq      $1024,  %%rsp"             "\n\t"
850      "movdqu    0(%2),  %%xmm2"            "\n\t"
851      "movdqu    16(%2), %%xmm11"           "\n\t"
852      "pcmpistri $0x03,  %%xmm2, %%xmm11"   "\n\t"
853//"pcmpistrm $0x03, %%xmm2, %%xmm11"   "\n\t"
854//"movd %%xmm0, %%ecx" "\n\t"
855      "pushfq"                              "\n\t"
856      "popq      %%rdx"                     "\n\t"
857      "movq      %%rcx,  %0"                "\n\t"
858      "movq      %%rdx,  %1"                "\n\t"
859      "addq      $1024,  %%rsp"             "\n\t"
860      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
861      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
862   );
863   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
864}
865
866UInt s_pcmpistri_03 ( V128* argLU, V128* argRU )
867{
868   V128 resV;
869   UInt resOSZACP, resECX;
870   Bool ok
871      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
872			    zmask_from_V128(argLU),
873			    zmask_from_V128(argRU),
874			    0x03, False/*!isSTRM*/
875        );
876   assert(ok);
877   resECX = resV.uInt[0];
878   return (resOSZACP << 16) | resECX;
879}
880
881void istri_03 ( void )
882{
883   char* wot = "03";
884   UInt(*h)(V128*,V128*) = h_pcmpistri_03;
885   UInt(*s)(V128*,V128*) = s_pcmpistri_03;
886
887   try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
888   try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
889   try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
890   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
891
892   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
893   try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
894   try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
895   try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
896   try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
897
898   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
899   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
900   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
901   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
902
903   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
904   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
905
906   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
907   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
908   try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
909   try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
910
911   try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
912
913   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
914   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
915}
916
917
918//////////////////////////////////////////////////////////
919//                                                      //
920//                       ISTRI_13                       //
921//                                                      //
922//////////////////////////////////////////////////////////
923
924UInt h_pcmpistri_13 ( V128* argL, V128* argR )
925{
926   V128 block[2];
927   memcpy(&block[0], argL, sizeof(V128));
928   memcpy(&block[1], argR, sizeof(V128));
929   ULong res, flags;
930   __asm__ __volatile__(
931      "subq      $1024,  %%rsp"             "\n\t"
932      "movdqu    0(%2),  %%xmm2"            "\n\t"
933      "movdqu    16(%2), %%xmm11"           "\n\t"
934      "pcmpistri $0x13,  %%xmm2, %%xmm11"   "\n\t"
935//"pcmpistrm $0x13, %%xmm2, %%xmm11"   "\n\t"
936//"movd %%xmm0, %%ecx" "\n\t"
937      "pushfq"                              "\n\t"
938      "popq      %%rdx"                     "\n\t"
939      "movq      %%rcx,  %0"                "\n\t"
940      "movq      %%rdx,  %1"                "\n\t"
941      "addq      $1024,  %%rsp"             "\n\t"
942      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
943      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
944   );
945   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
946}
947
948UInt s_pcmpistri_13 ( V128* argLU, V128* argRU )
949{
950   V128 resV;
951   UInt resOSZACP, resECX;
952   Bool ok
953      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
954			    zmask_from_V128(argLU),
955			    zmask_from_V128(argRU),
956			    0x13, False/*!isSTRM*/
957        );
958   assert(ok);
959   resECX = resV.uInt[0];
960   return (resOSZACP << 16) | resECX;
961}
962
963void istri_13 ( void )
964{
965   char* wot = "13";
966   UInt(*h)(V128*,V128*) = h_pcmpistri_13;
967   UInt(*s)(V128*,V128*) = s_pcmpistri_13;
968
969   try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
970   try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
971   try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
972   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
973
974   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
975   try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
976   try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
977   try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
978   try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
979
980   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
981   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
982   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
983   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
984
985   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
986   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
987
988   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
989   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
990   try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
991   try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
992
993   try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
994
995   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
996   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
997}
998
999
1000
1001//////////////////////////////////////////////////////////
1002//                                                      //
1003//                       ISTRI_45                       //
1004//                                                      //
1005//////////////////////////////////////////////////////////
1006
1007UInt h_pcmpistri_45 ( V128* argL, V128* argR )
1008{
1009   V128 block[2];
1010   memcpy(&block[0], argL, sizeof(V128));
1011   memcpy(&block[1], argR, sizeof(V128));
1012   ULong res, flags;
1013   __asm__ __volatile__(
1014      "subq      $1024,  %%rsp"             "\n\t"
1015      "movdqu    0(%2),  %%xmm2"            "\n\t"
1016      "movdqu    16(%2), %%xmm11"           "\n\t"
1017      "pcmpistri $0x45,  %%xmm2, %%xmm11"   "\n\t"
1018//"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
1019//"movd %%xmm0, %%ecx" "\n\t"
1020      "pushfq"                              "\n\t"
1021      "popq      %%rdx"                     "\n\t"
1022      "movq      %%rcx,  %0"                "\n\t"
1023      "movq      %%rdx,  %1"                "\n\t"
1024      "addq      $1024,  %%rsp"             "\n\t"
1025      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1026      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1027   );
1028   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1029}
1030
1031UInt s_pcmpistri_45 ( V128* argLU, V128* argRU )
1032{
1033   V128 resV;
1034   UInt resOSZACP, resECX;
1035   Bool ok
1036      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
1037			    zmask_from_V128(argLU),
1038			    zmask_from_V128(argRU),
1039			    0x45, False/*!isSTRM*/
1040        );
1041   assert(ok);
1042   resECX = resV.uInt[0];
1043   return (resOSZACP << 16) | resECX;
1044}
1045
1046void istri_45 ( void )
1047{
1048   char* wot = "45";
1049   UInt(*h)(V128*,V128*) = h_pcmpistri_45;
1050   UInt(*s)(V128*,V128*) = s_pcmpistri_45;
1051
1052   try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000bbcc");
1053   try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000ccbb");
1054   try_istri(wot,h,s, "baaabbbbccccdddd", "000000000000ccbb");
1055   try_istri(wot,h,s, "baaabbbbccccdddc", "000000000000ccbb");
1056
1057   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
1058   try_istri(wot,h,s, "bbbbbbbb00bbbbbb", "000000000000ccbb");
1059   try_istri(wot,h,s, "bbbbbbbbbbbb00bb", "000000000000ccbb");
1060   try_istri(wot,h,s, "bbbbbbbbbbbbbb00", "000000000000ccbb");
1061   try_istri(wot,h,s, "0000000000000000", "000000000000ccbb");
1062
1063   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1064
1065   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
1066   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000bb");
1067   try_istri(wot,h,s, "bb44bb44bb44bb44", "000000006622ccbb");
1068
1069   try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000022ccbb");
1070   try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000000ccbb");
1071   try_istri(wot,h,s, "bb44bb44bb44bb44", "00000000000000bb");
1072
1073   try_istri(wot,h,s, "0011223344556677", "0000997755442211");
1074   try_istri(wot,h,s, "1122334455667711", "0000997755442211");
1075
1076   try_istri(wot,h,s, "0011223344556677", "0000aa8866553322");
1077   try_istri(wot,h,s, "1122334455667711", "0000aa8866553322");
1078}
1079
1080
1081//////////////////////////////////////////////////////////
1082//                                                      //
1083//                       ISTRI_01                       //
1084//                                                      //
1085//////////////////////////////////////////////////////////
1086
1087UInt h_pcmpistri_01 ( V128* argL, V128* argR )
1088{
1089   V128 block[2];
1090   memcpy(&block[0], argL, sizeof(V128));
1091   memcpy(&block[1], argR, sizeof(V128));
1092   ULong res, flags;
1093   __asm__ __volatile__(
1094      "subq      $1024,  %%rsp"             "\n\t"
1095      "movdqu    0(%2),  %%xmm2"            "\n\t"
1096      "movdqu    16(%2), %%xmm11"           "\n\t"
1097      "pcmpistri $0x01,  %%xmm2, %%xmm11"   "\n\t"
1098//"pcmpistrm $0x01, %%xmm2, %%xmm11"   "\n\t"
1099//"movd %%xmm0, %%ecx" "\n\t"
1100      "pushfq"                              "\n\t"
1101      "popq      %%rdx"                     "\n\t"
1102      "movq      %%rcx,  %0"                "\n\t"
1103      "movq      %%rdx,  %1"                "\n\t"
1104      "addq      $1024,  %%rsp"             "\n\t"
1105      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1106      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1107   );
1108   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1109}
1110
1111UInt s_pcmpistri_01 ( V128* argLU, V128* argRU )
1112{
1113   V128 resV;
1114   UInt resOSZACP, resECX;
1115   Bool ok
1116      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
1117			    zmask_from_V128(argLU),
1118			    zmask_from_V128(argRU),
1119			    0x01, False/*!isSTRM*/
1120        );
1121   assert(ok);
1122   resECX = resV.uInt[0];
1123   return (resOSZACP << 16) | resECX;
1124}
1125
1126void istri_01 ( void )
1127{
1128   char* wot = "01";
1129   UInt(*h)(V128*,V128*) = h_pcmpistri_01;
1130   UInt(*s)(V128*,V128*) = s_pcmpistri_01;
1131
1132   try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
1133   try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
1134   try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
1135   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
1136
1137   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
1138   try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
1139   try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
1140   try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
1141   try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
1142
1143   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
1144   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
1145   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
1146   try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
1147
1148   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1149   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1150
1151   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
1152   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
1153   try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
1154   try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
1155
1156   try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
1157
1158   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
1159   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
1160}
1161
1162
1163//////////////////////////////////////////////////////////
1164//                                                      //
1165//                       ISTRI_39                       //
1166//                                                      //
1167//////////////////////////////////////////////////////////
1168
1169UInt h_pcmpistri_39 ( V128* argL, V128* argR )
1170{
1171   V128 block[2];
1172   memcpy(&block[0], argL, sizeof(V128));
1173   memcpy(&block[1], argR, sizeof(V128));
1174   ULong res, flags;
1175   __asm__ __volatile__(
1176      "subq      $1024,  %%rsp"             "\n\t"
1177      "movdqu    0(%2),  %%xmm2"            "\n\t"
1178      "movdqu    16(%2), %%xmm11"           "\n\t"
1179      "pcmpistri $0x39,  %%xmm2, %%xmm11"   "\n\t"
1180      "pushfq"                              "\n\t"
1181      "popq      %%rdx"                     "\n\t"
1182      "movq      %%rcx,  %0"                "\n\t"
1183      "movq      %%rdx,  %1"                "\n\t"
1184      "addq      $1024,  %%rsp"             "\n\t"
1185      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1186      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1187   );
1188   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1189}
1190
1191UInt s_pcmpistri_39 ( V128* argLU, V128* argRU )
1192{
1193   V128 resV;
1194   UInt resOSZACP, resECX;
1195   Bool ok
1196      = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
1197			    zmask_from_V128(argLU),
1198			    zmask_from_V128(argRU),
1199			    0x39, False/*!isSTRM*/
1200        );
1201   assert(ok);
1202   resECX = resV.uInt[0];
1203   return (resOSZACP << 16) | resECX;
1204}
1205
1206void istri_39 ( void )
1207{
1208   char* wot = "39";
1209   UInt(*h)(V128*,V128*) = h_pcmpistri_39;
1210   UInt(*s)(V128*,V128*) = s_pcmpistri_39;
1211
1212   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1213
1214   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1215   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1216   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
1217   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
1218
1219   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
1220   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
1221   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
1222
1223   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1224   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1225   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1226   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1227
1228   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1229   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
1230   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
1231
1232   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1233
1234   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
1235   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
1236   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
1237
1238   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
1239   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
1240   try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
1241
1242   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
1243   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
1244   try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
1245
1246   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
1247   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
1248   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
1249
1250   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
1251   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
1252}
1253
1254
1255
1256//////////////////////////////////////////////////////////
1257//                                                      //
1258//                         main                         //
1259//                                                      //
1260//////////////////////////////////////////////////////////
1261
1262int main ( void )
1263{
1264   istri_4B();
1265   istri_3B();
1266   istri_09();
1267   istri_1B();
1268   istri_03();
1269   istri_0D();
1270   istri_13();
1271   istri_45();
1272   istri_01();
1273   istri_39();
1274   return 0;
1275}
1276