pcmpstr64.c revision b32f58018498ea2225959b0ba11c18f0c433deef
1
2/* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
3   pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
4   aspect. */
5
6#include <string.h>
7#include <stdio.h>
8#include <assert.h>
9
/* Short names for the basic integer types used throughout this file. */
typedef unsigned            UInt;
typedef int                 Int;
typedef unsigned char       UChar;
typedef unsigned long long  ULong;

/* Boolean carried in a single byte. */
typedef unsigned char       Bool;
#define False ((Bool)0)
#define True  ((Bool)1)

/* A 128-bit vector value, viewable either as 16 bytes or as four
   32-bit words.  Both views overlay the same storage. */
typedef union { UChar uChar[16]; UInt uInt[4]; } V128;
25
/* Bit positions of the six condition flags (O, S, Z, A, P, C) within
   the flags word, listed from the highest bit down. */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_P   2
#define SHIFT_C   0

/* Single-bit masks for the same flags, derived from the positions
   above.  Each mask has type unsigned long long. */
#define MASK_C    (1ULL << SHIFT_C)
#define MASK_P    (1ULL << SHIFT_P)
#define MASK_A    (1ULL << SHIFT_A)
#define MASK_Z    (1ULL << SHIFT_Z)
#define MASK_S    (1ULL << SHIFT_S)
#define MASK_O    (1ULL << SHIFT_O)
39
40
41UInt clz32 ( UInt x )
42{
43   Int y, m, n;
44   y = -(x >> 16);
45   m = (y >> 16) & 16;
46   n = 16 - m;
47   x = x >> m;
48   y = x - 0x100;
49   m = (y >> 16) & 8;
50   n = n + m;
51   x = x << m;
52   y = x - 0x1000;
53   m = (y >> 16) & 4;
54   n = n + m;
55   x = x << m;
56   y = x - 0x4000;
57   m = (y >> 16) & 2;
58   n = n + m;
59   x = x << m;
60   y = x >> 14;
61   m = y & ~(y >> 1);
62   return n + 2 - m;
63}
64
65UInt ctz32 ( UInt x )
66{
67   return 32 - clz32((~x) & (x-1));
68}
69
70void expand ( V128* dst, char* summary )
71{
72   Int i;
73   assert( strlen(summary) == 16 );
74   for (i = 0; i < 16; i++) {
75      UChar xx = 0;
76      UChar x = summary[15-i];
77      if      (x >= '0' && x <= '9') { xx = x - '0'; }
78      else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
79      else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
80      else assert(0);
81
82      assert(xx < 16);
83      xx = (xx << 4) | xx;
84      assert(xx < 256);
85      dst->uChar[i] = xx;
86   }
87}
88
89void try_istri ( char* which,
90                 UInt(*h_fn)(V128*,V128*),
91                 UInt(*s_fn)(V128*,V128*),
92                 char* summL, char* summR )
93{
94   assert(strlen(which) == 2);
95   V128 argL, argR;
96   expand(&argL, summL);
97   expand(&argR, summR);
98   UInt h_res = h_fn(&argL, &argR);
99   UInt s_res = s_fn(&argL, &argR);
100   printf("istri %s  %s %s -> %08x %08x %s\n",
101          which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
102}
103
104UInt zmask_from_V128 ( V128* arg )
105{
106   UInt i, res = 0;
107   for (i = 0; i < 16; i++) {
108      res |=  ((arg->uChar[i] == 0) ? 1 : 0) << i;
109   }
110   return res;
111}
112
113//////////////////////////////////////////////////////////
114//                                                      //
115//                       GENERAL                        //
116//                                                      //
117//////////////////////////////////////////////////////////
118
119
120/* Given partial results from a pcmpXstrX operation (intRes1,
121   basically), generate an I format (index value for ECX) output, and
122   also the new OSZACP flags.
123*/
124static
125void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
126                                    /*OUT*/UInt* resOSZACP,
127                                    UInt intRes1,
128                                    UInt zmaskL, UInt zmaskR,
129                                    UInt validL,
130                                    UInt pol, UInt idx )
131{
132   assert((pol >> 2) == 0);
133   assert((idx >> 1) == 0);
134
135   UInt intRes2 = 0;
136   switch (pol) {
137      case 0: intRes2 = intRes1;          break; // pol +
138      case 1: intRes2 = ~intRes1;         break; // pol -
139      case 2: intRes2 = intRes1;          break; // pol m+
140      case 3: intRes2 = intRes1 ^ validL; break; // pol m-
141   }
142   intRes2 &= 0xFFFF;
143
144   // generate ecx value
145   UInt newECX = 0;
146   if (idx) {
147     // index of ms-1-bit
148     newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
149   } else {
150     // index of ls-1-bit
151     newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
152   }
153
154   *(UInt*)(&resV[0]) = newECX;
155
156   // generate new flags, common to all ISTRI and ISTRM cases
157   *resOSZACP    // A, P are zero
158     = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
159     | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
160     | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
161     | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
162}
163
164
165/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
166   variants.
167
168   For xSTRI variants, the new ECX value is placed in the 32 bits
169   pointed to by *resV.  For xSTRM variants, the result is a 128 bit
170   value and is placed at *resV in the obvious way.
171
172   For all variants, the new OSZACP value is placed at *resOSZACP.
173
174   argLV and argRV are the vector args.  The caller must prepare a
175   16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
176   must be 1 for each zero byte of of the respective arg.  For ESTRx
177   variants this is derived from the explicit length indication, and
178   must be 0 in all places except at the bit index corresponding to
179   the valid length (0 .. 16).  If the valid length is 16 then the
180   mask must be all zeroes.  In all cases, bits 31:16 must be zero.
181
182   imm8 is the original immediate from the instruction.  isSTRM
183   indicates whether this is a xSTRM or xSTRI variant, which controls
184   how much of *res is written.
185
186   If the given imm8 case can be handled, the return value is True.
187   If not, False is returned, and neither *res not *resOSZACP are
188   altered.
189*/
190
191Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
192                     /*OUT*/UInt* resOSZACP,
193                     V128* argLV,  V128* argRV,
194                     UInt zmaskL, UInt zmaskR,
195                     UInt imm8,   Bool isSTRM )
196{
197   assert(imm8 < 0x80);
198   assert((zmaskL >> 16) == 0);
199   assert((zmaskR >> 16) == 0);
200
201   /* Explicitly reject any imm8 values that haven't been validated,
202      even if they would probably work.  Life is too short to have
203      unvalidated cases in the code base. */
204   switch (imm8) {
205      case 0x00:
206      case 0x02: case 0x08: case 0x0C: case 0x12: case 0x1A:
207      case 0x38: case 0x3A: case 0x44: case 0x4A:
208         break;
209      default:
210         return False;
211   }
212
213   UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
214   UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
215   UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
216   UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
217
218   /*----------------------------------------*/
219   /*-- strcmp on byte data                --*/
220   /*----------------------------------------*/
221
222   if (agg == 2/*equal each, aka strcmp*/
223       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
224       && !isSTRM) {
225      Int    i;
226      UChar* argL = (UChar*)argLV;
227      UChar* argR = (UChar*)argRV;
228      UInt boolResII = 0;
229      for (i = 15; i >= 0; i--) {
230         UChar cL  = argL[i];
231         UChar cR  = argR[i];
232         boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
233      }
234      UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
235      UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
236
237      // do invalidation, common to all equal-each cases
238      UInt intRes1
239         = (boolResII & validL & validR)  // if both valid, use cmpres
240           | (~ (validL | validR));       // if both invalid, force 1
241                                          // else force 0
242      intRes1 &= 0xFFFF;
243
244      // generate I-format output
245      pcmpXstrX_WRK_gen_output_fmt_I(
246         resV, resOSZACP,
247         intRes1, zmaskL, zmaskR, validL, pol, idx
248      );
249
250      return True;
251   }
252
253   /*----------------------------------------*/
254   /*-- set membership on byte data        --*/
255   /*----------------------------------------*/
256
257   if (agg == 0/*equal any, aka find chars in a set*/
258       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
259       && !isSTRM) {
260      /* argL: the string,  argR: charset */
261      UInt   si, ci;
262      UChar* argL    = (UChar*)argLV;
263      UChar* argR    = (UChar*)argRV;
264      UInt   boolRes = 0;
265      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
266      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
267
268      for (si = 0; si < 16; si++) {
269         if ((validL & (1 << si)) == 0)
270            // run off the end of the string.
271            break;
272         UInt m = 0;
273         for (ci = 0; ci < 16; ci++) {
274            if ((validR & (1 << ci)) == 0) break;
275            if (argR[ci] == argL[si]) { m = 1; break; }
276         }
277         boolRes |= (m << si);
278      }
279
280      // boolRes is "pre-invalidated"
281      UInt intRes1 = boolRes & 0xFFFF;
282
283      // generate I-format output
284      pcmpXstrX_WRK_gen_output_fmt_I(
285         resV, resOSZACP,
286         intRes1, zmaskL, zmaskR, validL, pol, idx
287      );
288
289      return True;
290   }
291
292   /*----------------------------------------*/
293   /*-- substring search on byte data      --*/
294   /*----------------------------------------*/
295
296   if (agg == 3/*equal ordered, aka substring search*/
297       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
298       && !isSTRM) {
299
300      /* argL: haystack,  argR: needle */
301      UInt   ni, hi;
302      UChar* argL    = (UChar*)argLV;
303      UChar* argR    = (UChar*)argRV;
304      UInt   boolRes = 0;
305      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
306      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
307      for (hi = 0; hi < 16; hi++) {
308         if ((validL & (1 << hi)) == 0)
309            // run off the end of the haystack
310            break;
311         UInt m = 1;
312         for (ni = 0; ni < 16; ni++) {
313            if ((validR & (1 << ni)) == 0) break;
314            UInt i = ni + hi;
315            if (i >= 16) break;
316            if (argL[i] != argR[ni]) { m = 0; break; }
317         }
318         boolRes |= (m << hi);
319      }
320
321      // boolRes is "pre-invalidated"
322      UInt intRes1 = boolRes & 0xFFFF;
323
324      // generate I-format output
325      pcmpXstrX_WRK_gen_output_fmt_I(
326         resV, resOSZACP,
327         intRes1, zmaskL, zmaskR, validL, pol, idx
328      );
329
330      return True;
331   }
332
333   /*----------------------------------------*/
334   /*-- ranges, unsigned byte data         --*/
335   /*----------------------------------------*/
336
337   if (agg == 1/*ranges*/
338       && fmt == 0/*ub*/
339       && !isSTRM) {
340
341      /* argL: string,  argR: range-pairs */
342      UInt   ri, si;
343      UChar* argL    = (UChar*)argLV;
344      UChar* argR    = (UChar*)argRV;
345      UInt   boolRes = 0;
346      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
347      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
348      for (si = 0; si < 16; si++) {
349         if ((validL & (1 << si)) == 0)
350            // run off the end of the string
351            break;
352         UInt m = 0;
353         for (ri = 0; ri < 16; ri += 2) {
354            if ((validR & (3 << ri)) != (3 << ri)) break;
355            if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
356               m = 1; break;
357            }
358         }
359         boolRes |= (m << si);
360      }
361
362      // boolRes is "pre-invalidated"
363      UInt intRes1 = boolRes & 0xFFFF;
364
365      // generate I-format output
366      pcmpXstrX_WRK_gen_output_fmt_I(
367         resV, resOSZACP,
368         intRes1, zmaskL, zmaskR, validL, pol, idx
369      );
370
371      return True;
372   }
373
374   return False;
375}
376
377
378//////////////////////////////////////////////////////////
379//                                                      //
380//                       ISTRI_4A                       //
381//                                                      //
382//////////////////////////////////////////////////////////
383
384UInt h_pcmpistri_4A ( V128* argL, V128* argR )
385{
386   V128 block[2];
387   memcpy(&block[0], argL, sizeof(V128));
388   memcpy(&block[1], argR, sizeof(V128));
389   ULong res, flags;
390   __asm__ __volatile__(
391      "subq      $1024,  %%rsp"             "\n\t"
392      "movdqu    0(%2),  %%xmm2"            "\n\t"
393      "movdqu    16(%2), %%xmm11"           "\n\t"
394      "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
395      "pushfq"                              "\n\t"
396      "popq      %%rdx"                     "\n\t"
397      "movq      %%rcx,  %0"                "\n\t"
398      "movq      %%rdx,  %1"                "\n\t"
399      "addq      $1024,  %%rsp"             "\n\t"
400      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
401      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
402   );
403   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
404}
405
406UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
407{
408   V128 resV;
409   UInt resOSZACP, resECX;
410   Bool ok
411      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
412                       zmask_from_V128(argLU),
413                       zmask_from_V128(argRU),
414                       0x4A, False/*!isSTRM*/
415        );
416   assert(ok);
417   resECX = resV.uInt[0];
418   return (resOSZACP << 16) | resECX;
419}
420
421void istri_4A ( void )
422{
423   char* wot = "4A";
424   UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
425   UInt(*s)(V128*,V128*) = s_pcmpistri_4A;
426
427   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
428
429   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
430   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
431   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
432   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
433
434   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
435   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
436   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
437
438   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
439   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
440   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
441   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
442
443   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
444   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
445   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
446
447   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
448
449   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
450   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
451   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
452
453   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
454   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
455   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
456
457   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
458   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
459   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
460
461   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
462   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
463   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
464
465   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
466   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
467}
468
469//////////////////////////////////////////////////////////
470//                                                      //
471//                       ISTRI_3A                       //
472//                                                      //
473//////////////////////////////////////////////////////////
474
475UInt h_pcmpistri_3A ( V128* argL, V128* argR )
476{
477   V128 block[2];
478   memcpy(&block[0], argL, sizeof(V128));
479   memcpy(&block[1], argR, sizeof(V128));
480   ULong res, flags;
481   __asm__ __volatile__(
482      "subq      $1024,  %%rsp"             "\n\t"
483      "movdqu    0(%2),  %%xmm2"            "\n\t"
484      "movdqu    16(%2), %%xmm11"           "\n\t"
485      "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
486      "pushfq"                              "\n\t"
487      "popq      %%rdx"                     "\n\t"
488      "movq      %%rcx,  %0"                "\n\t"
489      "movq      %%rdx,  %1"                "\n\t"
490      "addq      $1024,  %%rsp"             "\n\t"
491      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
492      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
493   );
494   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
495}
496
497UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
498{
499   V128 resV;
500   UInt resOSZACP, resECX;
501   Bool ok
502      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
503                       zmask_from_V128(argLU),
504                       zmask_from_V128(argRU),
505                       0x3A, False/*!isSTRM*/
506        );
507   assert(ok);
508   resECX = resV.uInt[0];
509   return (resOSZACP << 16) | resECX;
510}
511
512void istri_3A ( void )
513{
514   char* wot = "3A";
515   UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
516   UInt(*s)(V128*,V128*) = s_pcmpistri_3A;
517
518   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
519
520   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
521   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
522   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
523   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
524
525   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
526   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
527   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
528
529   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
530   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
531   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
532   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
533
534   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
535   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
536   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
537
538   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
539
540   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
541   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
542   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
543
544   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
545   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
546   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
547
548   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
549   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
550   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
551
552   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
553   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
554   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
555
556   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
557   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
558}
559
560
561
562//////////////////////////////////////////////////////////
563//                                                      //
564//                       ISTRI_0C                       //
565//                                                      //
566//////////////////////////////////////////////////////////
567
568__attribute__((noinline))
569UInt h_pcmpistri_0C ( V128* argL, V128* argR )
570{
571   V128 block[2];
572   memcpy(&block[0], argL, sizeof(V128));
573   memcpy(&block[1], argR, sizeof(V128));
574   ULong res = 0, flags = 0;
575   __asm__ __volatile__(
576      "movdqa    0(%2),  %%xmm2"            "\n\t"
577      "movdqa    16(%2), %%xmm11"           "\n\t"
578      "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
579      //"pcmpistrm $0x0C,  %%xmm2, %%xmm11"   "\n\t"
580      //"movd %%xmm0, %%ecx" "\n\t"
581      "pushfq"                              "\n\t"
582      "popq      %%rdx"                     "\n\t"
583      "movq      %%rcx,  %0"                "\n\t"
584      "movq      %%rdx,  %1"                "\n\t"
585      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
586      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
587   );
588   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
589}
590
591UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
592{
593   V128 resV;
594   UInt resOSZACP, resECX;
595   Bool ok
596      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
597                       zmask_from_V128(argLU),
598                       zmask_from_V128(argRU),
599                       0x0C, False/*!isSTRM*/
600        );
601   assert(ok);
602   resECX = resV.uInt[0];
603   return (resOSZACP << 16) | resECX;
604}
605
606void istri_0C ( void )
607{
608   char* wot = "0C";
609   UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
610   UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
611
612   try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
613
614   try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
615
616   try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
617   try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
618   try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
619
620   try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
621
622   try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
623   try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
624   try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
625   try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
626   try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
627
628   try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
629   try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
630   try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
631
632   try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
633   try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
634
635   try_istri(wot,h,s, "1111111111111234", "0000000000000000");
636   try_istri(wot,h,s, "1111111111111234", "0000000000000001");
637   try_istri(wot,h,s, "1111111111111234", "0000000000000011");
638
639   try_istri(wot,h,s, "1111111111111234", "1111111111111234");
640   try_istri(wot,h,s, "a111111111111111", "000000000000000a");
641   try_istri(wot,h,s, "b111111111111111", "000000000000000a");
642}
643
644
645//////////////////////////////////////////////////////////
646//                                                      //
647//                       ISTRI_08                       //
648//                                                      //
649//////////////////////////////////////////////////////////
650
651UInt h_pcmpistri_08 ( V128* argL, V128* argR )
652{
653   V128 block[2];
654   memcpy(&block[0], argL, sizeof(V128));
655   memcpy(&block[1], argR, sizeof(V128));
656   ULong res, flags;
657   __asm__ __volatile__(
658      "subq      $1024,  %%rsp"             "\n\t"
659      "movdqu    0(%2),  %%xmm2"            "\n\t"
660      "movdqu    16(%2), %%xmm11"           "\n\t"
661      "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
662      "pushfq"                              "\n\t"
663      "popq      %%rdx"                     "\n\t"
664      "movq      %%rcx,  %0"                "\n\t"
665      "movq      %%rdx,  %1"                "\n\t"
666      "addq      $1024,  %%rsp"             "\n\t"
667      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
668      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
669   );
670   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
671}
672
673UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
674{
675   V128 resV;
676   UInt resOSZACP, resECX;
677   Bool ok
678      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
679                       zmask_from_V128(argLU),
680                       zmask_from_V128(argRU),
681                       0x08, False/*!isSTRM*/
682        );
683   assert(ok);
684   resECX = resV.uInt[0];
685   return (resOSZACP << 16) | resECX;
686}
687
688void istri_08 ( void )
689{
690   char* wot = "08";
691   UInt(*h)(V128*,V128*) = h_pcmpistri_08;
692   UInt(*s)(V128*,V128*) = s_pcmpistri_08;
693
694   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
695
696   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
697   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
698   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
699   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
700
701   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
702   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
703   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
704
705   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
706   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
707   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
708   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
709
710   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
711   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
712   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
713
714   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
715
716   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
717   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
718   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
719
720   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
721   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
722   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
723
724   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
725   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
726   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
727
728   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
729   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
730   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
731
732   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
733   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
734}
735
736
737
738//////////////////////////////////////////////////////////
739//                                                      //
740//                       ISTRI_1A                       //
741//                                                      //
742//////////////////////////////////////////////////////////
743
744UInt h_pcmpistri_1A ( V128* argL, V128* argR )
745{
746   V128 block[2];
747   memcpy(&block[0], argL, sizeof(V128));
748   memcpy(&block[1], argR, sizeof(V128));
749   ULong res, flags;
750   __asm__ __volatile__(
751      "subq      $1024,  %%rsp"             "\n\t"
752      "movdqu    0(%2),  %%xmm2"            "\n\t"
753      "movdqu    16(%2), %%xmm11"           "\n\t"
754      "pcmpistri $0x1A,  %%xmm2, %%xmm11"   "\n\t"
755      "pushfq"                              "\n\t"
756      "popq      %%rdx"                     "\n\t"
757      "movq      %%rcx,  %0"                "\n\t"
758      "movq      %%rdx,  %1"                "\n\t"
759      "addq      $1024,  %%rsp"             "\n\t"
760      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
761      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
762   );
763   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
764}
765
766UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
767{
768   V128 resV;
769   UInt resOSZACP, resECX;
770   Bool ok
771      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
772                       zmask_from_V128(argLU),
773                       zmask_from_V128(argRU),
774                       0x1A, False/*!isSTRM*/
775        );
776   assert(ok);
777   resECX = resV.uInt[0];
778   return (resOSZACP << 16) | resECX;
779}
780
781void istri_1A ( void )
782{
783   char* wot = "1A";
784   UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
785   UInt(*s)(V128*,V128*) = s_pcmpistri_1A;
786
787   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
788
789   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
790   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
791   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
792   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
793
794   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
795   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
796   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
797
798   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
799   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
800   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
801   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
802
803   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
804   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
805   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
806
807   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
808
809   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
810   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
811   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
812
813   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
814   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
815   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
816
817   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
818   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
819   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
820
821   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
822   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
823   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
824
825   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
826   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
827}
828
829
830
831//////////////////////////////////////////////////////////
832//                                                      //
833//                       ISTRI_02                       //
834//                                                      //
835//////////////////////////////////////////////////////////
836
837UInt h_pcmpistri_02 ( V128* argL, V128* argR )
838{
839   V128 block[2];
840   memcpy(&block[0], argL, sizeof(V128));
841   memcpy(&block[1], argR, sizeof(V128));
842   ULong res, flags;
843   __asm__ __volatile__(
844      "subq      $1024,  %%rsp"             "\n\t"
845      "movdqu    0(%2),  %%xmm2"            "\n\t"
846      "movdqu    16(%2), %%xmm11"           "\n\t"
847      "pcmpistri $0x02,  %%xmm2, %%xmm11"   "\n\t"
848//"pcmpistrm $0x02, %%xmm2, %%xmm11"   "\n\t"
849//"movd %%xmm0, %%ecx" "\n\t"
850      "pushfq"                              "\n\t"
851      "popq      %%rdx"                     "\n\t"
852      "movq      %%rcx,  %0"                "\n\t"
853      "movq      %%rdx,  %1"                "\n\t"
854      "addq      $1024,  %%rsp"             "\n\t"
855      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
856      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
857   );
858   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
859}
860
861UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
862{
863   V128 resV;
864   UInt resOSZACP, resECX;
865   Bool ok
866      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
867                       zmask_from_V128(argLU),
868                       zmask_from_V128(argRU),
869                       0x02, False/*!isSTRM*/
870        );
871   assert(ok);
872   resECX = resV.uInt[0];
873   return (resOSZACP << 16) | resECX;
874}
875
876void istri_02 ( void )
877{
878   char* wot = "02";
879   UInt(*h)(V128*,V128*) = h_pcmpistri_02;
880   UInt(*s)(V128*,V128*) = s_pcmpistri_02;
881
882   try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
883   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
884   try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
885   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
886
887   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
888   try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
889   try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
890   try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
891   try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
892
893   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
894   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
895   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
896   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
897
898   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
899   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
900
901   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
902   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
903   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
904   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
905
906   try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
907
908   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
909   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
910}
911
912
913//////////////////////////////////////////////////////////
914//                                                      //
915//                       ISTRI_12                       //
916//                                                      //
917//////////////////////////////////////////////////////////
918
919UInt h_pcmpistri_12 ( V128* argL, V128* argR )
920{
921   V128 block[2];
922   memcpy(&block[0], argL, sizeof(V128));
923   memcpy(&block[1], argR, sizeof(V128));
924   ULong res, flags;
925   __asm__ __volatile__(
926      "subq      $1024,  %%rsp"             "\n\t"
927      "movdqu    0(%2),  %%xmm2"            "\n\t"
928      "movdqu    16(%2), %%xmm11"           "\n\t"
929      "pcmpistri $0x12,  %%xmm2, %%xmm11"   "\n\t"
930//"pcmpistrm $0x12, %%xmm2, %%xmm11"   "\n\t"
931//"movd %%xmm0, %%ecx" "\n\t"
932      "pushfq"                              "\n\t"
933      "popq      %%rdx"                     "\n\t"
934      "movq      %%rcx,  %0"                "\n\t"
935      "movq      %%rdx,  %1"                "\n\t"
936      "addq      $1024,  %%rsp"             "\n\t"
937      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
938      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
939   );
940   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
941}
942
943UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
944{
945   V128 resV;
946   UInt resOSZACP, resECX;
947   Bool ok
948      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
949                       zmask_from_V128(argLU),
950                       zmask_from_V128(argRU),
951                       0x12, False/*!isSTRM*/
952        );
953   assert(ok);
954   resECX = resV.uInt[0];
955   return (resOSZACP << 16) | resECX;
956}
957
958void istri_12 ( void )
959{
960   char* wot = "12";
961   UInt(*h)(V128*,V128*) = h_pcmpistri_12;
962   UInt(*s)(V128*,V128*) = s_pcmpistri_12;
963
964   try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
965   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
966   try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
967   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
968
969   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
970   try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
971   try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
972   try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
973   try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
974
975   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
976   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
977   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
978   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
979
980   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
981   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
982
983   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
984   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
985   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
986   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
987
988   try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
989
990   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
991   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
992}
993
994
995
996//////////////////////////////////////////////////////////
997//                                                      //
998//                       ISTRI_44                       //
999//                                                      //
1000//////////////////////////////////////////////////////////
1001
1002UInt h_pcmpistri_44 ( V128* argL, V128* argR )
1003{
1004   V128 block[2];
1005   memcpy(&block[0], argL, sizeof(V128));
1006   memcpy(&block[1], argR, sizeof(V128));
1007   ULong res, flags;
1008   __asm__ __volatile__(
1009      "subq      $1024,  %%rsp"             "\n\t"
1010      "movdqu    0(%2),  %%xmm2"            "\n\t"
1011      "movdqu    16(%2), %%xmm11"           "\n\t"
1012      "pcmpistri $0x44,  %%xmm2, %%xmm11"   "\n\t"
1013//"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
1014//"movd %%xmm0, %%ecx" "\n\t"
1015      "pushfq"                              "\n\t"
1016      "popq      %%rdx"                     "\n\t"
1017      "movq      %%rcx,  %0"                "\n\t"
1018      "movq      %%rdx,  %1"                "\n\t"
1019      "addq      $1024,  %%rsp"             "\n\t"
1020      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1021      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1022   );
1023   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1024}
1025
1026UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
1027{
1028   V128 resV;
1029   UInt resOSZACP, resECX;
1030   Bool ok
1031      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1032                       zmask_from_V128(argLU),
1033                       zmask_from_V128(argRU),
1034                       0x44, False/*!isSTRM*/
1035        );
1036   assert(ok);
1037   resECX = resV.uInt[0];
1038   return (resOSZACP << 16) | resECX;
1039}
1040
1041void istri_44 ( void )
1042{
1043   char* wot = "44";
1044   UInt(*h)(V128*,V128*) = h_pcmpistri_44;
1045   UInt(*s)(V128*,V128*) = s_pcmpistri_44;
1046
1047   try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
1048   try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
1049   try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
1050   try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
1051
1052   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1053   try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
1054   try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
1055   try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
1056   try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
1057
1058   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1059
1060   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
1061   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
1062   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
1063
1064   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
1065   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
1066   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
1067
1068   try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
1069   try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
1070
1071   try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
1072   try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
1073}
1074
1075
1076//////////////////////////////////////////////////////////
1077//                                                      //
1078//                       ISTRI_00                       //
1079//                                                      //
1080//////////////////////////////////////////////////////////
1081
1082UInt h_pcmpistri_00 ( V128* argL, V128* argR )
1083{
1084   V128 block[2];
1085   memcpy(&block[0], argL, sizeof(V128));
1086   memcpy(&block[1], argR, sizeof(V128));
1087   ULong res, flags;
1088   __asm__ __volatile__(
1089      "subq      $1024,  %%rsp"             "\n\t"
1090      "movdqu    0(%2),  %%xmm2"            "\n\t"
1091      "movdqu    16(%2), %%xmm11"           "\n\t"
1092      "pcmpistri $0x00,  %%xmm2, %%xmm11"   "\n\t"
1093//"pcmpistrm $0x00, %%xmm2, %%xmm11"   "\n\t"
1094//"movd %%xmm0, %%ecx" "\n\t"
1095      "pushfq"                              "\n\t"
1096      "popq      %%rdx"                     "\n\t"
1097      "movq      %%rcx,  %0"                "\n\t"
1098      "movq      %%rdx,  %1"                "\n\t"
1099      "addq      $1024,  %%rsp"             "\n\t"
1100      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1101      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1102   );
1103   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1104}
1105
1106UInt s_pcmpistri_00 ( V128* argLU, V128* argRU )
1107{
1108   V128 resV;
1109   UInt resOSZACP, resECX;
1110   Bool ok
1111      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1112                       zmask_from_V128(argLU),
1113                       zmask_from_V128(argRU),
1114                       0x00, False/*!isSTRM*/
1115        );
1116   assert(ok);
1117   resECX = resV.uInt[0];
1118   return (resOSZACP << 16) | resECX;
1119}
1120
1121void istri_00 ( void )
1122{
1123   char* wot = "00";
1124   UInt(*h)(V128*,V128*) = h_pcmpistri_00;
1125   UInt(*s)(V128*,V128*) = s_pcmpistri_00;
1126
1127   try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
1128   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
1129   try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
1130   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
1131
1132   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1133   try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
1134   try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
1135   try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
1136   try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
1137
1138   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
1139   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
1140   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
1141   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
1142
1143   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1144   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1145
1146   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
1147   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
1148   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
1149   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
1150
1151   try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
1152
1153   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
1154   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
1155}
1156
1157
1158//////////////////////////////////////////////////////////
1159//                                                      //
1160//                       ISTRI_38                       //
1161//                                                      //
1162//////////////////////////////////////////////////////////
1163
1164UInt h_pcmpistri_38 ( V128* argL, V128* argR )
1165{
1166   V128 block[2];
1167   memcpy(&block[0], argL, sizeof(V128));
1168   memcpy(&block[1], argR, sizeof(V128));
1169   ULong res, flags;
1170   __asm__ __volatile__(
1171      "subq      $1024,  %%rsp"             "\n\t"
1172      "movdqu    0(%2),  %%xmm2"            "\n\t"
1173      "movdqu    16(%2), %%xmm11"           "\n\t"
1174      "pcmpistri $0x38,  %%xmm2, %%xmm11"   "\n\t"
1175      "pushfq"                              "\n\t"
1176      "popq      %%rdx"                     "\n\t"
1177      "movq      %%rcx,  %0"                "\n\t"
1178      "movq      %%rdx,  %1"                "\n\t"
1179      "addq      $1024,  %%rsp"             "\n\t"
1180      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1181      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1182   );
1183   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1184}
1185
1186UInt s_pcmpistri_38 ( V128* argLU, V128* argRU )
1187{
1188   V128 resV;
1189   UInt resOSZACP, resECX;
1190   Bool ok
1191      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1192                       zmask_from_V128(argLU),
1193                       zmask_from_V128(argRU),
1194                       0x38, False/*!isSTRM*/
1195        );
1196   assert(ok);
1197   resECX = resV.uInt[0];
1198   return (resOSZACP << 16) | resECX;
1199}
1200
1201void istri_38 ( void )
1202{
1203   char* wot = "38";
1204   UInt(*h)(V128*,V128*) = h_pcmpistri_38;
1205   UInt(*s)(V128*,V128*) = s_pcmpistri_38;
1206
1207   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1208
1209   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1210   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1211   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
1212   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
1213
1214   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
1215   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
1216   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
1217
1218   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1219   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1220   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1221   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1222
1223   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1224   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
1225   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
1226
1227   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1228
1229   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1230   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1231   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
1232
1233   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
1234   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1235   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
1236
1237   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1238   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
1239   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
1240
1241   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
1242   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
1243   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
1244
1245   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
1246   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
1247}
1248
1249
1250
1251//////////////////////////////////////////////////////////
1252//                                                      //
1253//                         main                         //
1254//                                                      //
1255//////////////////////////////////////////////////////////
1256
1257int main ( void )
1258{
1259   istri_4A();
1260   istri_3A();
1261   istri_08();
1262   istri_1A();
1263   istri_02();
1264   istri_0C();
1265   istri_12();
1266   istri_44();
1267   return 0;
1268}
1269
1270/* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
1271   pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
1272   aspect. */
1273
1274#include <string.h>
1275#include <stdio.h>
1276#include <assert.h>
1277
/* Short aliases for the basic integer types used throughout this
   file. */
typedef  unsigned int   UInt;
typedef  signed int     Int;
typedef  unsigned char  UChar;
typedef  unsigned long long int ULong;
/* Truth values are carried in a single unsigned char. */
typedef  UChar          Bool;
#define False ((Bool)0)
#define True  ((Bool)1)

/* Earlier plain-array form of V128, kept for reference: */
//typedef  unsigned char  V128[16];
/* A vector value viewable either as 16 bytes or as 4 UInt lanes.
   NOTE(review): the two views only cover the same storage if UInt is
   32 bits wide -- confirm for the target. */
typedef
   union {
      UChar uChar[16];
      UInt  uInt[4];
   }
   V128;
1293
/* Bit positions of the O, S, Z, A, C and P flags within a flags word.
   Together they are exactly the bits selected by the 0x8D5 mask that
   the h_pcmpistri_* functions apply to the value captured with
   pushfq. */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_C   0
#define SHIFT_P   2

/* Single-bit masks for the same flags.  Note these have type
   unsigned long long because of the 1ULL. */
#define MASK_O    (1ULL << SHIFT_O)
#define MASK_S    (1ULL << SHIFT_S)
#define MASK_Z    (1ULL << SHIFT_Z)
#define MASK_A    (1ULL << SHIFT_A)
#define MASK_C    (1ULL << SHIFT_C)
#define MASK_P    (1ULL << SHIFT_P)
1307
1308
1309UInt clz32 ( UInt x )
1310{
1311   Int y, m, n;
1312   y = -(x >> 16);
1313   m = (y >> 16) & 16;
1314   n = 16 - m;
1315   x = x >> m;
1316   y = x - 0x100;
1317   m = (y >> 16) & 8;
1318   n = n + m;
1319   x = x << m;
1320   y = x - 0x1000;
1321   m = (y >> 16) & 4;
1322   n = n + m;
1323   x = x << m;
1324   y = x - 0x4000;
1325   m = (y >> 16) & 2;
1326   n = n + m;
1327   x = x << m;
1328   y = x >> 14;
1329   m = y & ~(y >> 1);
1330   return n + 2 - m;
1331}
1332
1333UInt ctz32 ( UInt x )
1334{
1335   return 32 - clz32((~x) & (x-1));
1336}
1337
1338void expand ( V128* dst, char* summary )
1339{
1340   Int i;
1341   assert( strlen(summary) == 16 );
1342   for (i = 0; i < 16; i++) {
1343      UChar xx = 0;
1344      UChar x = summary[15-i];
1345      if      (x >= '0' && x <= '9') { xx = x - '0'; }
1346      else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
1347      else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
1348      else assert(0);
1349
1350      assert(xx < 16);
1351      xx = (xx << 4) | xx;
1352      assert(xx < 256);
1353      dst->uChar[i] = xx;
1354   }
1355}
1356
1357void try_istri ( char* which,
1358                 UInt(*h_fn)(V128*,V128*),
1359                 UInt(*s_fn)(V128*,V128*),
1360                 char* summL, char* summR )
1361{
1362   assert(strlen(which) == 2);
1363   V128 argL, argR;
1364   expand(&argL, summL);
1365   expand(&argR, summR);
1366   UInt h_res = h_fn(&argL, &argR);
1367   UInt s_res = s_fn(&argL, &argR);
1368   printf("istri %s  %s %s -> %08x %08x %s\n",
1369          which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
1370}
1371
1372UInt zmask_from_V128 ( V128* arg )
1373{
1374   UInt i, res = 0;
1375   for (i = 0; i < 16; i++) {
1376      res |=  ((arg->uChar[i] == 0) ? 1 : 0) << i;
1377   }
1378   return res;
1379}
1380
1381//////////////////////////////////////////////////////////
1382//                                                      //
1383//                       GENERAL                        //
1384//                                                      //
1385//////////////////////////////////////////////////////////
1386
1387
1388/* Given partial results from a pcmpXstrX operation (intRes1,
1389   basically), generate an I format (index value for ECX) output, and
1390   also the new OSZACP flags.
1391*/
1392static
1393void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
1394                                    /*OUT*/UInt* resOSZACP,
1395                                    UInt intRes1,
1396                                    UInt zmaskL, UInt zmaskR,
1397                                    UInt validL,
1398                                    UInt pol, UInt idx )
1399{
1400   assert((pol >> 2) == 0);
1401   assert((idx >> 1) == 0);
1402
1403   UInt intRes2 = 0;
1404   switch (pol) {
1405      case 0: intRes2 = intRes1;          break; // pol +
1406      case 1: intRes2 = ~intRes1;         break; // pol -
1407      case 2: intRes2 = intRes1;          break; // pol m+
1408      case 3: intRes2 = intRes1 ^ validL; break; // pol m-
1409   }
1410   intRes2 &= 0xFFFF;
1411
1412   // generate ecx value
1413   UInt newECX = 0;
1414   if (idx) {
1415     // index of ms-1-bit
1416     newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
1417   } else {
1418     // index of ls-1-bit
1419     newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
1420   }
1421
1422   *(UInt*)(&resV[0]) = newECX;
1423
1424   // generate new flags, common to all ISTRI and ISTRM cases
1425   *resOSZACP    // A, P are zero
1426     = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
1427     | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
1428     | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
1429     | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
1430}
1431
1432
1433/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
1434   variants.
1435
1436   For xSTRI variants, the new ECX value is placed in the 32 bits
1437   pointed to by *resV.  For xSTRM variants, the result is a 128 bit
1438   value and is placed at *resV in the obvious way.
1439
1440   For all variants, the new OSZACP value is placed at *resOSZACP.
1441
1442   argLV and argRV are the vector args.  The caller must prepare a
1443   16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
1444   must be 1 for each zero byte of of the respective arg.  For ESTRx
1445   variants this is derived from the explicit length indication, and
1446   must be 0 in all places except at the bit index corresponding to
1447   the valid length (0 .. 16).  If the valid length is 16 then the
1448   mask must be all zeroes.  In all cases, bits 31:16 must be zero.
1449
1450   imm8 is the original immediate from the instruction.  isSTRM
1451   indicates whether this is a xSTRM or xSTRI variant, which controls
1452   how much of *res is written.
1453
1454   If the given imm8 case can be handled, the return value is True.
1455   If not, False is returned, and neither *res not *resOSZACP are
1456   altered.
1457*/
1458
1459Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
1460                     /*OUT*/UInt* resOSZACP,
1461                     V128* argLV,  V128* argRV,
1462                     UInt zmaskL, UInt zmaskR,
1463                     UInt imm8,   Bool isSTRM )
1464{
1465   assert(imm8 < 0x80);
1466   assert((zmaskL >> 16) == 0);
1467   assert((zmaskR >> 16) == 0);
1468
1469   /* Explicitly reject any imm8 values that haven't been validated,
1470      even if they would probably work.  Life is too short to have
1471      unvalidated cases in the code base. */
1472   switch (imm8) {
1473      case 0x02: case 0x08: case 0x0C: case 0x12: case 0x1A:
1474      case 0x3A: case 0x44: case 0x4A:
1475         break;
1476      default:
1477         return False;
1478   }
1479
1480   UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
1481   UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
1482   UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
1483   UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
1484
1485   /*----------------------------------------*/
1486   /*-- strcmp on byte data                --*/
1487   /*----------------------------------------*/
1488
1489   if (agg == 2/*equal each, aka strcmp*/
1490       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
1491       && !isSTRM) {
1492      Int    i;
1493      UChar* argL = (UChar*)argLV;
1494      UChar* argR = (UChar*)argRV;
1495      UInt boolResII = 0;
1496      for (i = 15; i >= 0; i--) {
1497         UChar cL  = argL[i];
1498         UChar cR  = argR[i];
1499         boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
1500      }
1501      UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
1502      UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
1503
1504      // do invalidation, common to all equal-each cases
1505      UInt intRes1
1506         = (boolResII & validL & validR)  // if both valid, use cmpres
1507           | (~ (validL | validR));       // if both invalid, force 1
1508                                          // else force 0
1509      intRes1 &= 0xFFFF;
1510
1511      // generate I-format output
1512      pcmpXstrX_WRK_gen_output_fmt_I(
1513         resV, resOSZACP,
1514         intRes1, zmaskL, zmaskR, validL, pol, idx
1515      );
1516
1517      return True;
1518   }
1519
1520   /*----------------------------------------*/
1521   /*-- set membership on byte data        --*/
1522   /*----------------------------------------*/
1523
1524   if (agg == 0/*equal any, aka find chars in a set*/
1525       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
1526       && !isSTRM) {
1527      /* argL: the string,  argR: charset */
1528      UInt   si, ci;
1529      UChar* argL    = (UChar*)argLV;
1530      UChar* argR    = (UChar*)argRV;
1531      UInt   boolRes = 0;
1532      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
1533      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
1534
1535      for (si = 0; si < 16; si++) {
1536         if ((validL & (1 << si)) == 0)
1537            // run off the end of the string.
1538            break;
1539         UInt m = 0;
1540         for (ci = 0; ci < 16; ci++) {
1541            if ((validR & (1 << ci)) == 0) break;
1542            if (argR[ci] == argL[si]) { m = 1; break; }
1543         }
1544         boolRes |= (m << si);
1545      }
1546
1547      // boolRes is "pre-invalidated"
1548      UInt intRes1 = boolRes & 0xFFFF;
1549
1550      // generate I-format output
1551      pcmpXstrX_WRK_gen_output_fmt_I(
1552         resV, resOSZACP,
1553         intRes1, zmaskL, zmaskR, validL, pol, idx
1554      );
1555
1556      return True;
1557   }
1558
1559   /*----------------------------------------*/
1560   /*-- substring search on byte data      --*/
1561   /*----------------------------------------*/
1562
1563   if (agg == 3/*equal ordered, aka substring search*/
1564       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
1565       && !isSTRM) {
1566
1567      /* argL: haystack,  argR: needle */
1568      UInt   ni, hi;
1569      UChar* argL    = (UChar*)argLV;
1570      UChar* argR    = (UChar*)argRV;
1571      UInt   boolRes = 0;
1572      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
1573      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
1574      for (hi = 0; hi < 16; hi++) {
1575         if ((validL & (1 << hi)) == 0)
1576            // run off the end of the haystack
1577            break;
1578         UInt m = 1;
1579         for (ni = 0; ni < 16; ni++) {
1580            if ((validR & (1 << ni)) == 0) break;
1581            UInt i = ni + hi;
1582            if (i >= 16) break;
1583            if (argL[i] != argR[ni]) { m = 0; break; }
1584         }
1585         boolRes |= (m << hi);
1586      }
1587
1588      // boolRes is "pre-invalidated"
1589      UInt intRes1 = boolRes & 0xFFFF;
1590
1591      // generate I-format output
1592      pcmpXstrX_WRK_gen_output_fmt_I(
1593         resV, resOSZACP,
1594         intRes1, zmaskL, zmaskR, validL, pol, idx
1595      );
1596
1597      return True;
1598   }
1599
1600   /*----------------------------------------*/
1601   /*-- ranges, unsigned byte data         --*/
1602   /*----------------------------------------*/
1603
1604   if (agg == 1/*ranges*/
1605       && fmt == 0/*ub*/
1606       && !isSTRM) {
1607
1608      /* argL: string,  argR: range-pairs */
1609      UInt   ri, si;
1610      UChar* argL    = (UChar*)argLV;
1611      UChar* argR    = (UChar*)argRV;
1612      UInt   boolRes = 0;
1613      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
1614      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
1615      for (si = 0; si < 16; si++) {
1616         if ((validL & (1 << si)) == 0)
1617            // run off the end of the string
1618            break;
1619         UInt m = 0;
1620         for (ri = 0; ri < 16; ri += 2) {
1621            if ((validR & (3 << ri)) != (3 << ri)) break;
1622            if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
1623               m = 1; break;
1624            }
1625         }
1626         boolRes |= (m << si);
1627      }
1628
1629      // boolRes is "pre-invalidated"
1630      UInt intRes1 = boolRes & 0xFFFF;
1631
1632      // generate I-format output
1633      pcmpXstrX_WRK_gen_output_fmt_I(
1634         resV, resOSZACP,
1635         intRes1, zmaskL, zmaskR, validL, pol, idx
1636      );
1637
1638      return True;
1639   }
1640
1641   return False;
1642}
1643
1644
1645//////////////////////////////////////////////////////////
1646//                                                      //
1647//                       ISTRI_4A                       //
1648//                                                      //
1649//////////////////////////////////////////////////////////
1650
1651UInt h_pcmpistri_4A ( V128* argL, V128* argR )
1652{
1653   V128 block[2];
1654   memcpy(&block[0], argL, sizeof(V128));
1655   memcpy(&block[1], argR, sizeof(V128));
1656   ULong res, flags;
1657   __asm__ __volatile__(
1658      "subq      $1024,  %%rsp"             "\n\t"
1659      "movdqu    0(%2),  %%xmm2"            "\n\t"
1660      "movdqu    16(%2), %%xmm11"           "\n\t"
1661      "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
1662      "pushfq"                              "\n\t"
1663      "popq      %%rdx"                     "\n\t"
1664      "movq      %%rcx,  %0"                "\n\t"
1665      "movq      %%rdx,  %1"                "\n\t"
1666      "addq      $1024,  %%rsp"             "\n\t"
1667      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1668      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1669   );
1670   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1671}
1672
1673UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
1674{
1675   V128 resV;
1676   UInt resOSZACP, resECX;
1677   Bool ok
1678      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1679                       zmask_from_V128(argLU),
1680                       zmask_from_V128(argRU),
1681                       0x4A, False/*!isSTRM*/
1682        );
1683   assert(ok);
1684   resECX = resV.uInt[0];
1685   return (resOSZACP << 16) | resECX;
1686}
1687
1688void istri_4A ( void )
1689{
1690   char* wot = "4A";
1691   UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
1692   UInt(*s)(V128*,V128*) = s_pcmpistri_4A;
1693
1694   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1695
1696   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1697   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1698   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
1699   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
1700
1701   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
1702   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
1703   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
1704
1705   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1706   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1707   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1708   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1709
1710   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1711   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
1712   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
1713
1714   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1715
1716   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1717   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1718   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
1719
1720   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
1721   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1722   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
1723
1724   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1725   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
1726   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
1727
1728   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
1729   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
1730   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
1731
1732   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
1733   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
1734}
1735
1736//////////////////////////////////////////////////////////
1737//                                                      //
1738//                       ISTRI_3A                       //
1739//                                                      //
1740//////////////////////////////////////////////////////////
1741
1742UInt h_pcmpistri_3A ( V128* argL, V128* argR )
1743{
1744   V128 block[2];
1745   memcpy(&block[0], argL, sizeof(V128));
1746   memcpy(&block[1], argR, sizeof(V128));
1747   ULong res, flags;
1748   __asm__ __volatile__(
1749      "subq      $1024,  %%rsp"             "\n\t"
1750      "movdqu    0(%2),  %%xmm2"            "\n\t"
1751      "movdqu    16(%2), %%xmm11"           "\n\t"
1752      "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
1753      "pushfq"                              "\n\t"
1754      "popq      %%rdx"                     "\n\t"
1755      "movq      %%rcx,  %0"                "\n\t"
1756      "movq      %%rdx,  %1"                "\n\t"
1757      "addq      $1024,  %%rsp"             "\n\t"
1758      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1759      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1760   );
1761   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1762}
1763
1764UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
1765{
1766   V128 resV;
1767   UInt resOSZACP, resECX;
1768   Bool ok
1769      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1770                       zmask_from_V128(argLU),
1771                       zmask_from_V128(argRU),
1772                       0x3A, False/*!isSTRM*/
1773        );
1774   assert(ok);
1775   resECX = resV.uInt[0];
1776   return (resOSZACP << 16) | resECX;
1777}
1778
1779void istri_3A ( void )
1780{
1781   char* wot = "3A";
1782   UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
1783   UInt(*s)(V128*,V128*) = s_pcmpistri_3A;
1784
1785   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1786
1787   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1788   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1789   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
1790   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
1791
1792   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
1793   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
1794   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
1795
1796   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1797   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1798   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1799   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1800
1801   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1802   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
1803   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
1804
1805   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1806
1807   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1808   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1809   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
1810
1811   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
1812   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1813   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
1814
1815   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1816   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
1817   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
1818
1819   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
1820   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
1821   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
1822
1823   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
1824   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
1825}
1826
1827
1828
1829//////////////////////////////////////////////////////////
1830//                                                      //
1831//                       ISTRI_0C                       //
1832//                                                      //
1833//////////////////////////////////////////////////////////
1834
1835__attribute__((noinline))
1836UInt h_pcmpistri_0C ( V128* argL, V128* argR )
1837{
1838   V128 block[2];
1839   memcpy(&block[0], argL, sizeof(V128));
1840   memcpy(&block[1], argR, sizeof(V128));
1841   ULong res = 0, flags = 0;
1842   __asm__ __volatile__(
1843      "movdqa    0(%2),  %%xmm2"            "\n\t"
1844      "movdqa    16(%2), %%xmm11"           "\n\t"
1845      "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
1846      //"pcmpistrm $0x0C,  %%xmm2, %%xmm11"   "\n\t"
1847      //"movd %%xmm0, %%ecx" "\n\t"
1848      "pushfq"                              "\n\t"
1849      "popq      %%rdx"                     "\n\t"
1850      "movq      %%rcx,  %0"                "\n\t"
1851      "movq      %%rdx,  %1"                "\n\t"
1852      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1853      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1854   );
1855   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1856}
1857
1858UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
1859{
1860   V128 resV;
1861   UInt resOSZACP, resECX;
1862   Bool ok
1863      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1864                       zmask_from_V128(argLU),
1865                       zmask_from_V128(argRU),
1866                       0x0C, False/*!isSTRM*/
1867        );
1868   assert(ok);
1869   resECX = resV.uInt[0];
1870   return (resOSZACP << 16) | resECX;
1871}
1872
1873void istri_0C ( void )
1874{
1875   char* wot = "0C";
1876   UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
1877   UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
1878
1879   try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
1880
1881   try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
1882
1883   try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
1884   try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
1885   try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
1886
1887   try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
1888
1889   try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
1890   try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
1891   try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
1892   try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
1893   try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
1894
1895   try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
1896   try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
1897   try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
1898
1899   try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
1900   try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
1901
1902   try_istri(wot,h,s, "1111111111111234", "0000000000000000");
1903   try_istri(wot,h,s, "1111111111111234", "0000000000000001");
1904   try_istri(wot,h,s, "1111111111111234", "0000000000000011");
1905
1906   try_istri(wot,h,s, "1111111111111234", "1111111111111234");
1907   try_istri(wot,h,s, "a111111111111111", "000000000000000a");
1908   try_istri(wot,h,s, "b111111111111111", "000000000000000a");
1909}
1910
1911
1912//////////////////////////////////////////////////////////
1913//                                                      //
1914//                       ISTRI_08                       //
1915//                                                      //
1916//////////////////////////////////////////////////////////
1917
1918UInt h_pcmpistri_08 ( V128* argL, V128* argR )
1919{
1920   V128 block[2];
1921   memcpy(&block[0], argL, sizeof(V128));
1922   memcpy(&block[1], argR, sizeof(V128));
1923   ULong res, flags;
1924   __asm__ __volatile__(
1925      "subq      $1024,  %%rsp"             "\n\t"
1926      "movdqu    0(%2),  %%xmm2"            "\n\t"
1927      "movdqu    16(%2), %%xmm11"           "\n\t"
1928      "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
1929      "pushfq"                              "\n\t"
1930      "popq      %%rdx"                     "\n\t"
1931      "movq      %%rcx,  %0"                "\n\t"
1932      "movq      %%rdx,  %1"                "\n\t"
1933      "addq      $1024,  %%rsp"             "\n\t"
1934      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
1935      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
1936   );
1937   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
1938}
1939
1940UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
1941{
1942   V128 resV;
1943   UInt resOSZACP, resECX;
1944   Bool ok
1945      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
1946                       zmask_from_V128(argLU),
1947                       zmask_from_V128(argRU),
1948                       0x08, False/*!isSTRM*/
1949        );
1950   assert(ok);
1951   resECX = resV.uInt[0];
1952   return (resOSZACP << 16) | resECX;
1953}
1954
1955void istri_08 ( void )
1956{
1957   char* wot = "08";
1958   UInt(*h)(V128*,V128*) = h_pcmpistri_08;
1959   UInt(*s)(V128*,V128*) = s_pcmpistri_08;
1960
1961   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
1962
1963   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1964   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1965   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
1966   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
1967
1968   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
1969   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
1970   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
1971
1972   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1973   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1974   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1975   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1976
1977   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1978   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
1979   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
1980
1981   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
1982
1983   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1984   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1985   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
1986
1987   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
1988   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
1989   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
1990
1991   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
1992   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
1993   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
1994
1995   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
1996   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
1997   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
1998
1999   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
2000   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
2001}
2002
2003
2004
2005//////////////////////////////////////////////////////////
2006//                                                      //
2007//                       ISTRI_1A                       //
2008//                                                      //
2009//////////////////////////////////////////////////////////
2010
2011UInt h_pcmpistri_1A ( V128* argL, V128* argR )
2012{
2013   V128 block[2];
2014   memcpy(&block[0], argL, sizeof(V128));
2015   memcpy(&block[1], argR, sizeof(V128));
2016   ULong res, flags;
2017   __asm__ __volatile__(
2018      "subq      $1024,  %%rsp"             "\n\t"
2019      "movdqu    0(%2),  %%xmm2"            "\n\t"
2020      "movdqu    16(%2), %%xmm11"           "\n\t"
2021      "pcmpistri $0x1A,  %%xmm2, %%xmm11"   "\n\t"
2022      "pushfq"                              "\n\t"
2023      "popq      %%rdx"                     "\n\t"
2024      "movq      %%rcx,  %0"                "\n\t"
2025      "movq      %%rdx,  %1"                "\n\t"
2026      "addq      $1024,  %%rsp"             "\n\t"
2027      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
2028      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
2029   );
2030   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
2031}
2032
2033UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
2034{
2035   V128 resV;
2036   UInt resOSZACP, resECX;
2037   Bool ok
2038      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
2039                       zmask_from_V128(argLU),
2040                       zmask_from_V128(argRU),
2041                       0x1A, False/*!isSTRM*/
2042        );
2043   assert(ok);
2044   resECX = resV.uInt[0];
2045   return (resOSZACP << 16) | resECX;
2046}
2047
2048void istri_1A ( void )
2049{
2050   char* wot = "1A";
2051   UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
2052   UInt(*s)(V128*,V128*) = s_pcmpistri_1A;
2053
2054   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
2055
2056   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2057   try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2058   try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
2059   try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
2060
2061   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
2062   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
2063   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
2064
2065   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2066   try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2067   try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2068   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2069
2070   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2071   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
2072   try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
2073
2074   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2075
2076   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
2077   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
2078   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
2079
2080   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
2081   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
2082   try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
2083
2084   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
2085   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
2086   try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
2087
2088   try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
2089   try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
2090   try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
2091
2092   try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
2093   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
2094}
2095
2096
2097
2098//////////////////////////////////////////////////////////
2099//                                                      //
2100//                       ISTRI_02                       //
2101//                                                      //
2102//////////////////////////////////////////////////////////
2103
2104UInt h_pcmpistri_02 ( V128* argL, V128* argR )
2105{
2106   V128 block[2];
2107   memcpy(&block[0], argL, sizeof(V128));
2108   memcpy(&block[1], argR, sizeof(V128));
2109   ULong res, flags;
2110   __asm__ __volatile__(
2111      "subq      $1024,  %%rsp"             "\n\t"
2112      "movdqu    0(%2),  %%xmm2"            "\n\t"
2113      "movdqu    16(%2), %%xmm11"           "\n\t"
2114      "pcmpistri $0x02,  %%xmm2, %%xmm11"   "\n\t"
2115//"pcmpistrm $0x02, %%xmm2, %%xmm11"   "\n\t"
2116//"movd %%xmm0, %%ecx" "\n\t"
2117      "pushfq"                              "\n\t"
2118      "popq      %%rdx"                     "\n\t"
2119      "movq      %%rcx,  %0"                "\n\t"
2120      "movq      %%rdx,  %1"                "\n\t"
2121      "addq      $1024,  %%rsp"             "\n\t"
2122      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
2123      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
2124   );
2125   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
2126}
2127
2128UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
2129{
2130   V128 resV;
2131   UInt resOSZACP, resECX;
2132   Bool ok
2133      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
2134                       zmask_from_V128(argLU),
2135                       zmask_from_V128(argRU),
2136                       0x02, False/*!isSTRM*/
2137        );
2138   assert(ok);
2139   resECX = resV.uInt[0];
2140   return (resOSZACP << 16) | resECX;
2141}
2142
2143void istri_02 ( void )
2144{
2145   char* wot = "02";
2146   UInt(*h)(V128*,V128*) = h_pcmpistri_02;
2147   UInt(*s)(V128*,V128*) = s_pcmpistri_02;
2148
2149   try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
2150   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
2151   try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
2152   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
2153
2154   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
2155   try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
2156   try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
2157   try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
2158   try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
2159
2160   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
2161   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
2162   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
2163   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
2164
2165   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
2166   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2167
2168   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
2169   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
2170   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
2171   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
2172
2173   try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
2174
2175   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
2176   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
2177}
2178
2179
2180//////////////////////////////////////////////////////////
2181//                                                      //
2182//                       ISTRI_12                       //
2183//                                                      //
2184//////////////////////////////////////////////////////////
2185
2186UInt h_pcmpistri_12 ( V128* argL, V128* argR )
2187{
2188   V128 block[2];
2189   memcpy(&block[0], argL, sizeof(V128));
2190   memcpy(&block[1], argR, sizeof(V128));
2191   ULong res, flags;
2192   __asm__ __volatile__(
2193      "subq      $1024,  %%rsp"             "\n\t"
2194      "movdqu    0(%2),  %%xmm2"            "\n\t"
2195      "movdqu    16(%2), %%xmm11"           "\n\t"
2196      "pcmpistri $0x12,  %%xmm2, %%xmm11"   "\n\t"
2197//"pcmpistrm $0x12, %%xmm2, %%xmm11"   "\n\t"
2198//"movd %%xmm0, %%ecx" "\n\t"
2199      "pushfq"                              "\n\t"
2200      "popq      %%rdx"                     "\n\t"
2201      "movq      %%rcx,  %0"                "\n\t"
2202      "movq      %%rdx,  %1"                "\n\t"
2203      "addq      $1024,  %%rsp"             "\n\t"
2204      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
2205      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
2206   );
2207   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
2208}
2209
2210UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
2211{
2212   V128 resV;
2213   UInt resOSZACP, resECX;
2214   Bool ok
2215      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
2216                       zmask_from_V128(argLU),
2217                       zmask_from_V128(argRU),
2218                       0x12, False/*!isSTRM*/
2219        );
2220   assert(ok);
2221   resECX = resV.uInt[0];
2222   return (resOSZACP << 16) | resECX;
2223}
2224
2225void istri_12 ( void )
2226{
2227   char* wot = "12";
2228   UInt(*h)(V128*,V128*) = h_pcmpistri_12;
2229   UInt(*s)(V128*,V128*) = s_pcmpistri_12;
2230
2231   try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
2232   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
2233   try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
2234   try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
2235
2236   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
2237   try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
2238   try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
2239   try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
2240   try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
2241
2242   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
2243   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
2244   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
2245   try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
2246
2247   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
2248   try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
2249
2250   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
2251   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
2252   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
2253   try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
2254
2255   try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
2256
2257   try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
2258   try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
2259}
2260
2261
2262
2263//////////////////////////////////////////////////////////
2264//                                                      //
2265//                       ISTRI_44                       //
2266//                                                      //
2267//////////////////////////////////////////////////////////
2268
2269UInt h_pcmpistri_44 ( V128* argL, V128* argR )
2270{
2271   V128 block[2];
2272   memcpy(&block[0], argL, sizeof(V128));
2273   memcpy(&block[1], argR, sizeof(V128));
2274   ULong res, flags;
2275   __asm__ __volatile__(
2276      "subq      $1024,  %%rsp"             "\n\t"
2277      "movdqu    0(%2),  %%xmm2"            "\n\t"
2278      "movdqu    16(%2), %%xmm11"           "\n\t"
2279      "pcmpistri $0x44,  %%xmm2, %%xmm11"   "\n\t"
2280//"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
2281//"movd %%xmm0, %%ecx" "\n\t"
2282      "pushfq"                              "\n\t"
2283      "popq      %%rdx"                     "\n\t"
2284      "movq      %%rcx,  %0"                "\n\t"
2285      "movq      %%rdx,  %1"                "\n\t"
2286      "addq      $1024,  %%rsp"             "\n\t"
2287      : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
2288      : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
2289   );
2290   return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
2291}
2292
2293UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
2294{
2295   V128 resV;
2296   UInt resOSZACP, resECX;
2297   Bool ok
2298      = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
2299                       zmask_from_V128(argLU),
2300                       zmask_from_V128(argRU),
2301                       0x44, False/*!isSTRM*/
2302        );
2303   assert(ok);
2304   resECX = resV.uInt[0];
2305   return (resOSZACP << 16) | resECX;
2306}
2307
2308void istri_44 ( void )
2309{
2310   char* wot = "44";
2311   UInt(*h)(V128*,V128*) = h_pcmpistri_44;
2312   UInt(*s)(V128*,V128*) = s_pcmpistri_44;
2313
2314   try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
2315   try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
2316   try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
2317   try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
2318
2319   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
2320   try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
2321   try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
2322   try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
2323   try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
2324
2325   try_istri(wot,h,s, "0000000000000000", "0000000000000000");
2326
2327   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
2328   try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
2329   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
2330
2331   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
2332   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
2333   try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
2334
2335   try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
2336   try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
2337
2338   try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
2339   try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
2340}
2341
2342
2343
2344
2345
2346//////////////////////////////////////////////////////////
2347//                                                      //
2348//                         main                         //
2349//                                                      //
2350//////////////////////////////////////////////////////////
2351
2352int main ( void )
2353{
2354   istri_4A();
2355   istri_3A();
2356   istri_08();
2357   istri_1A();
2358   istri_02();
2359   istri_0C();
2360   istri_12();
2361   istri_44();
2362   istri_00();
2363   istri_38();
2364   return 0;
2365}
2366