1
2/*---------------------------------------------------------------*/
3/*--- begin                               guest_generic_x87.c ---*/
4/*---------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2017 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36/* This file contains functions for doing some x87-specific
37   operations.  Both the amd64 and x86 front ends (guests) indirectly
38   call these functions via guest helper calls.  By putting them here,
39   code duplication is avoided.  Some of these functions are tricky
40   and hard to verify, so there is much to be said for only having one
41   copy thereof.
42*/
43
44#include "libvex_basictypes.h"
45
46#include "main_util.h"
47#include "guest_generic_x87.h"
48
49
50/* 80 and 64-bit floating point formats:
51
52   80-bit:
53
54    S  0       0-------0      zero
55    S  0       0X------X      denormals
56    S  1-7FFE  1X------X      normals (all normals have leading 1)
57    S  7FFF    10------0      infinity
58    S  7FFF    10X-----X      snan
59    S  7FFF    11X-----X      qnan
60
61   S is the sign bit.  For runs X----X, at least one of the Xs must be
62   nonzero.  Exponent is 15 bits, fractional part is 63 bits, and
63   there is an explicitly represented leading 1, and a sign bit,
64   giving 80 in total.
65
66   64-bit avoids the confusion of an explicitly represented leading 1
67   and so is simpler:
68
69    S  0      0------0   zero
70    S  0      X------X   denormals
71    S  1-7FE  any        normals
72    S  7FF    0------0   infinity
73    S  7FF    0X-----X   snan
74    S  7FF    1X-----X   qnan
75
76   Exponent is 11 bits, fractional part is 52 bits, and there is a
77   sign bit, giving 64 in total.
78*/
79
80
81static inline UInt read_bit_array ( UChar* arr, UInt n )
82{
83   UChar c = arr[n >> 3];
84   c >>= (n&7);
85   return c & 1;
86}
87
88static inline void write_bit_array ( UChar* arr, UInt n, UInt b )
89{
90   UChar c = arr[n >> 3];
91   c = toUChar( c & ~(1 << (n&7)) );
92   c = toUChar( c | ((b&1) << (n&7)) );
93   arr[n >> 3] = c;
94}
95
96/* Convert an IEEE754 double (64-bit) into an x87 extended double
97   (80-bit), mimicing the hardware fairly closely.  Both numbers are
98   stored little-endian.  Limitations, all of which could be fixed,
99   given some level of hassle:
100
101   * Identity of NaNs is not preserved.
102
103   See comments in the code for more details.
104*/
105void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 )
106{
107   Bool  mantissaIsZero;
108   Int   bexp, i, j, shift;
109   UChar sign;
110
111   sign = toUChar( (f64[7] >> 7) & 1 );
112   bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
113   bexp &= 0x7FF;
114
115   mantissaIsZero = False;
116   if (bexp == 0 || bexp == 0x7FF) {
117      /* We'll need to know whether or not the mantissa (bits 51:0) is
118         all zeroes in order to handle these cases.  So figure it
119         out. */
120      mantissaIsZero
121         = toBool(
122              (f64[6] & 0x0F) == 0
123              && f64[5] == 0 && f64[4] == 0 && f64[3] == 0
124              && f64[2] == 0 && f64[1] == 0 && f64[0] == 0
125           );
126   }
127
128   /* If the exponent is zero, either we have a zero or a denormal.
129      Produce a zero.  This is a hack in that it forces denormals to
130      zero.  Could do better. */
131   if (bexp == 0) {
132      f80[9] = toUChar( sign << 7 );
133      f80[8] = f80[7] = f80[6] = f80[5] = f80[4]
134             = f80[3] = f80[2] = f80[1] = f80[0] = 0;
135
136      if (mantissaIsZero)
137         /* It really is zero, so that's all we can do. */
138         return;
139
140      /* There is at least one 1-bit in the mantissa.  So it's a
141         potentially denormalised double -- but we can produce a
142         normalised long double.  Count the leading zeroes in the
143         mantissa so as to decide how much to bump the exponent down
144         by.  Note, this is SLOW. */
145      shift = 0;
146      for (i = 51; i >= 0; i--) {
147        if (read_bit_array(f64, i))
148           break;
149        shift++;
150      }
151
152      /* and copy into place as many bits as we can get our hands on. */
153      j = 63;
154      for (i = 51 - shift; i >= 0; i--) {
155         write_bit_array( f80, j,
156     	 read_bit_array( f64, i ) );
157         j--;
158      }
159
160      /* Set the exponent appropriately, and we're done. */
161      bexp -= shift;
162      bexp += (16383 - 1023);
163      f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
164      f80[8] = toUChar( bexp & 0xFF );
165      return;
166   }
167
168   /* If the exponent is 7FF, this is either an Infinity, a SNaN or
169      QNaN, as determined by examining bits 51:0, thus:
170          0  ... 0    Inf
171          0X ... X    SNaN
172          1X ... X    QNaN
173      where at least one of the Xs is not zero.
174   */
175   if (bexp == 0x7FF) {
176      if (mantissaIsZero) {
177         /* Produce an appropriately signed infinity:
178            S 1--1 (15)  1  0--0 (63)
179         */
180         f80[9] = toUChar( (sign << 7) | 0x7F );
181         f80[8] = 0xFF;
182         f80[7] = 0x80;
183         f80[6] = f80[5] = f80[4] = f80[3]
184                = f80[2] = f80[1] = f80[0] = 0;
185         return;
186      }
187      /* So it's either a QNaN or SNaN.  Distinguish by considering
188         bit 51.  Note, this destroys all the trailing bits
189         (identity?) of the NaN.  IEEE754 doesn't require preserving
190         these (it only requires that there be one QNaN value and one
191         SNaN value), but x87 does seem to have some ability to
192         preserve them.  Anyway, here, the NaN's identity is
193         destroyed.  Could be improved. */
194      if (f64[6] & 8) {
195         /* QNaN.  Make a canonical QNaN:
196            S 1--1 (15)  1 1  0--0 (62)
197         */
198         f80[9] = toUChar( (sign << 7) | 0x7F );
199         f80[8] = 0xFF;
200         f80[7] = 0xC0;
201         f80[6] = f80[5] = f80[4] = f80[3]
202                = f80[2] = f80[1] = f80[0] = 0x00;
203      } else {
204         /* SNaN.  Make a SNaN:
205            S 1--1 (15)  1 0  1--1 (62)
206         */
207         f80[9] = toUChar( (sign << 7) | 0x7F );
208         f80[8] = 0xFF;
209         f80[7] = 0xBF;
210         f80[6] = f80[5] = f80[4] = f80[3]
211                = f80[2] = f80[1] = f80[0] = 0xFF;
212      }
213      return;
214   }
215
216   /* It's not a zero, denormal, infinity or nan.  So it must be a
217      normalised number.  Rebias the exponent and build the new
218      number.  */
219   bexp += (16383 - 1023);
220
221   f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
222   f80[8] = toUChar( bexp & 0xFF );
223   f80[7] = toUChar( (1 << 7) | ((f64[6] << 3) & 0x78)
224                              | ((f64[5] >> 5) & 7) );
225   f80[6] = toUChar( ((f64[5] << 3) & 0xF8) | ((f64[4] >> 5) & 7) );
226   f80[5] = toUChar( ((f64[4] << 3) & 0xF8) | ((f64[3] >> 5) & 7) );
227   f80[4] = toUChar( ((f64[3] << 3) & 0xF8) | ((f64[2] >> 5) & 7) );
228   f80[3] = toUChar( ((f64[2] << 3) & 0xF8) | ((f64[1] >> 5) & 7) );
229   f80[2] = toUChar( ((f64[1] << 3) & 0xF8) | ((f64[0] >> 5) & 7) );
230   f80[1] = toUChar( ((f64[0] << 3) & 0xF8) );
231   f80[0] = toUChar( 0 );
232}
233
234
235/* Convert an x87 extended double (80-bit) into an IEEE 754 double
236   (64-bit), mimicking the hardware fairly closely.  Both numbers are
237   stored little-endian.  Limitations, both of which could be fixed,
238   given some level of hassle:
239
240   * Rounding following truncation could be a bit better.
241
242   * Identity of NaNs is not preserved.
243
244   See comments in the code for more details.
245*/
246void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 )
247{
248   Bool  isInf;
249   Int   bexp, i, j;
250   UChar sign;
251
252   sign = toUChar((f80[9] >> 7) & 1);
253   bexp = (((UInt)f80[9]) << 8) | (UInt)f80[8];
254   bexp &= 0x7FFF;
255
256   /* If the exponent is zero, either we have a zero or a denormal.
257      But an extended precision denormal becomes a double precision
258      zero, so in either case, just produce the appropriately signed
259      zero. */
260   if (bexp == 0) {
261      f64[7] = toUChar(sign << 7);
262      f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
263      return;
264   }
265
266   /* If the exponent is 7FFF, this is either an Infinity, a SNaN or
267      QNaN, as determined by examining bits 62:0, thus:
268          10  ... 0    Inf
269          10X ... X    SNaN
270          11X ... X    QNaN
271      where at least one of the Xs is not zero.
272   */
273   if (bexp == 0x7FFF) {
274      isInf = toBool(
275                 (f80[7] & 0x7F) == 0
276                 && f80[6] == 0 && f80[5] == 0 && f80[4] == 0
277                 && f80[3] == 0 && f80[2] == 0 && f80[1] == 0
278                 && f80[0] == 0
279              );
280      if (isInf) {
281         if (0 == (f80[7] & 0x80))
282            goto wierd_NaN;
283         /* Produce an appropriately signed infinity:
284            S 1--1 (11)  0--0 (52)
285         */
286         f64[7] = toUChar((sign << 7) | 0x7F);
287         f64[6] = 0xF0;
288         f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
289         return;
290      }
291      /* So it's either a QNaN or SNaN.  Distinguish by considering
292         bit 61.  Note, this destroys all the trailing bits
293         (identity?) of the NaN.  IEEE754 doesn't require preserving
294         these (it only requires that there be one QNaN value and one
295         SNaN value), but x87 does seem to have some ability to
296         preserve them.  Anyway, here, the NaN's identity is
297         destroyed.  Could be improved. */
298      if (f80[7] & 0x40) {
299         /* QNaN.  Make a canonical QNaN:
300            S 1--1 (11)  1  0--0 (51)
301         */
302         f64[7] = toUChar((sign << 7) | 0x7F);
303         f64[6] = 0xF8;
304         f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0x00;
305      } else {
306         /* SNaN.  Make a SNaN:
307            S 1--1 (11)  0  1--1 (51)
308         */
309         f64[7] = toUChar((sign << 7) | 0x7F);
310         f64[6] = 0xF7;
311         f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
312      }
313      return;
314   }
315
316   /* If it's not a Zero, NaN or Inf, and the integer part (bit 62) is
317      zero, the x87 FPU appears to consider the number denormalised
318      and converts it to a QNaN. */
319   if (0 == (f80[7] & 0x80)) {
320      wierd_NaN:
321      /* Strange hardware QNaN:
322         S 1--1 (11)  1  0--0 (51)
323      */
324      /* On a PIII, these QNaNs always appear with sign==1.  I have
325         no idea why. */
326      f64[7] = (1 /*sign*/ << 7) | 0x7F;
327      f64[6] = 0xF8;
328      f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
329      return;
330   }
331
332   /* It's not a zero, denormal, infinity or nan.  So it must be a
333      normalised number.  Rebias the exponent and consider. */
334   bexp -= (16383 - 1023);
335   if (bexp >= 0x7FF) {
336      /* It's too big for a double.  Construct an infinity. */
337      f64[7] = toUChar((sign << 7) | 0x7F);
338      f64[6] = 0xF0;
339      f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
340      return;
341   }
342
343   if (bexp <= 0) {
344      /* It's too small for a normalised double.  First construct a
345         zero and then see if it can be improved into a denormal.  */
346      f64[7] = toUChar(sign << 7);
347      f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
348
349      if (bexp < -52)
350         /* Too small even for a denormal. */
351         return;
352
353      /* Ok, let's make a denormal.  Note, this is SLOW. */
354      /* Copy bits 63, 62, 61, etc of the src mantissa into the dst,
355         indexes 52+bexp, 51+bexp, etc, until k+bexp < 0. */
356      /* bexp is in range -52 .. 0 inclusive */
357      for (i = 63; i >= 0; i--) {
358         j = i - 12 + bexp;
359         if (j < 0) break;
360         /* We shouldn't really call vassert from generated code. */
361         vassert(j >= 0 && j < 52);
362         write_bit_array ( f64,
363                           j,
364                           read_bit_array ( f80, i ) );
365      }
366      /* and now we might have to round ... */
367      if (read_bit_array(f80, 10+1 - bexp) == 1)
368         goto do_rounding;
369
370      return;
371   }
372
373   /* Ok, it's a normalised number which is representable as a double.
374      Copy the exponent and mantissa into place. */
375   /*
376   for (i = 0; i < 52; i++)
377      write_bit_array ( f64,
378                        i,
379                        read_bit_array ( f80, i+11 ) );
380   */
381   f64[0] = toUChar( (f80[1] >> 3) | (f80[2] << 5) );
382   f64[1] = toUChar( (f80[2] >> 3) | (f80[3] << 5) );
383   f64[2] = toUChar( (f80[3] >> 3) | (f80[4] << 5) );
384   f64[3] = toUChar( (f80[4] >> 3) | (f80[5] << 5) );
385   f64[4] = toUChar( (f80[5] >> 3) | (f80[6] << 5) );
386   f64[5] = toUChar( (f80[6] >> 3) | (f80[7] << 5) );
387
388   f64[6] = toUChar( ((bexp << 4) & 0xF0) | ((f80[7] >> 3) & 0x0F) );
389
390   f64[7] = toUChar( (sign << 7) | ((bexp >> 4) & 0x7F) );
391
392   /* Now consider any rounding that needs to happen as a result of
393      truncating the mantissa. */
394   if (f80[1] & 4) /* read_bit_array(f80, 10) == 1) */ {
395
396      /* If the bottom bits of f80 are "100 0000 0000", then the
397         infinitely precise value is deemed to be mid-way between the
398         two closest representable values.  Since we're doing
399         round-to-nearest (the default mode), in that case it is the
400         bit immediately above which indicates whether we should round
401         upwards or not -- if 0, we don't.  All that is encapsulated
402         in the following simple test. */
403      if ((f80[1] & 0xF) == 4/*0100b*/ && f80[0] == 0)
404         return;
405
406      do_rounding:
407      /* Round upwards.  This is a kludge.  Once in every 2^24
408         roundings (statistically) the bottom three bytes are all 0xFF
409         and so we don't round at all.  Could be improved. */
410      if (f64[0] != 0xFF) {
411         f64[0]++;
412      }
413      else
414      if (f64[0] == 0xFF && f64[1] != 0xFF) {
415         f64[0] = 0;
416         f64[1]++;
417      }
418      else
419      if (f64[0] == 0xFF && f64[1] == 0xFF && f64[2] != 0xFF) {
420         f64[0] = 0;
421         f64[1] = 0;
422         f64[2]++;
423      }
424      /* else we don't round, but we should. */
425   }
426}
427
428
429/* CALLED FROM GENERATED CODE: CLEAN HELPER */
430/* Extract the signed significand or exponent component as per
431   fxtract.  Arg and result are doubles travelling under the guise of
432   ULongs.  Returns significand when getExp is zero and exponent
433   otherwise. */
434ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp )
435{
436   ULong  uSig, uExp;
437   /* Long   sSig; */
438   Int    sExp, i;
439   UInt   sign, expExp;
440
441   /*
442    S  7FF    0------0   infinity
443    S  7FF    0X-----X   snan
444    S  7FF    1X-----X   qnan
445   */
446   const ULong posInf  = 0x7FF0000000000000ULL;
447   const ULong negInf  = 0xFFF0000000000000ULL;
448   const ULong nanMask = 0x7FF0000000000000ULL;
449   const ULong qNan    = 0x7FF8000000000000ULL;
450   const ULong posZero = 0x0000000000000000ULL;
451   const ULong negZero = 0x8000000000000000ULL;
452   const ULong bit51   = 1ULL << 51;
453   const ULong bit52   = 1ULL << 52;
454   const ULong sigMask = bit52 - 1;
455
456   /* Mimic Core i5 behaviour for special cases. */
457   if (arg == posInf)
458      return getExp ? posInf : posInf;
459   if (arg == negInf)
460      return getExp ? posInf : negInf;
461   if ((arg & nanMask) == nanMask)
462      return qNan | (arg & (1ULL << 63));
463   if (arg == posZero)
464      return getExp ? negInf : posZero;
465   if (arg == negZero)
466      return getExp ? negInf : negZero;
467
468   /* Split into sign, exponent and significand. */
469   sign = ((UInt)(arg >> 63)) & 1;
470
471   /* Mask off exponent & sign. uSig is in range 0 .. 2^52-1. */
472   uSig = arg & sigMask;
473
474   /* Get the exponent. */
475   sExp = ((Int)(arg >> 52)) & 0x7FF;
476
477   /* Deal with denormals: if the exponent is zero, then the
478      significand cannot possibly be zero (negZero/posZero are handled
479      above).  Shift the significand left until bit 51 of it becomes
480      1, and decrease the exponent accordingly.
481   */
482   if (sExp == 0) {
483      for (i = 0; i < 52; i++) {
484         if (uSig & bit51)
485            break;
486         uSig <<= 1;
487         sExp--;
488      }
489      uSig <<= 1;
490   } else {
491      /* Add the implied leading-1 in the significand. */
492      uSig |= bit52;
493   }
494
495   /* Roll in the sign. */
496   /* sSig = uSig; */
497   /* if (sign) sSig =- sSig; */
498
499   /* Convert sig into a double.  This should be an exact conversion.
500      Then divide by 2^52, which should give a value in the range 1.0
501      to 2.0-epsilon, at least for normalised args. */
502   /* dSig = (Double)sSig; */
503   /* dSig /= 67108864.0;  */ /* 2^26 */
504   /* dSig /= 67108864.0;  */ /* 2^26 */
505   uSig &= sigMask;
506   uSig |= 0x3FF0000000000000ULL;
507   if (sign)
508      uSig ^= negZero;
509
510   /* Convert exp into a double.  Also an exact conversion. */
511   /* dExp = (Double)(sExp - 1023); */
512   sExp -= 1023;
513   if (sExp == 0) {
514      uExp = 0;
515   } else {
516      uExp   = sExp < 0 ? -sExp : sExp;
517      expExp = 0x3FF +52;
518      /* 1 <= uExp <= 1074 */
519      /* Skip first 42 iterations of normalisation loop as we know they
520         will always happen */
521      uExp <<= 42;
522      expExp -= 42;
523      for (i = 0; i < 52-42; i++) {
524         if (uExp & bit52)
525            break;
526         uExp <<= 1;
527         expExp--;
528      }
529      uExp &= sigMask;
530      uExp |= ((ULong)expExp) << 52;
531      if (sExp < 0) uExp ^= negZero;
532   }
533
534   return getExp ? uExp : uSig;
535}
536
537
538
539/*---------------------------------------------------------*/
540/*--- SSE4.2 PCMP{E,I}STR{I,M} helpers                  ---*/
541/*---------------------------------------------------------*/
542
/* Bit positions, and the corresponding single-bit masks, of the
   O, S, Z, A, C and P flags within eflags/rflags.  These duplicate
   values that also live in guest_{amd64,x86}_defs.h; #including those
   headers here causes chaos, so the numbers are simply copied.  They
   are not going to change in the foreseeable future :-)

   Listed in order of increasing bit position.
*/

#define SHIFT_C   0
#define SHIFT_P   2
#define SHIFT_A   4
#define SHIFT_Z   6
#define SHIFT_S   7
#define SHIFT_O   11

#define MASK_C    (1 << SHIFT_C)
#define MASK_P    (1 << SHIFT_P)
#define MASK_A    (1 << SHIFT_A)
#define MASK_Z    (1 << SHIFT_Z)
#define MASK_S    (1 << SHIFT_S)
#define MASK_O    (1 << SHIFT_O)
562
563
564/* Count leading zeroes, w/ 0-produces-32 semantics, a la Hacker's
565   Delight. */
566static UInt clz32 ( UInt x )
567{
568   Int y, m, n;
569   y = -(x >> 16);
570   m = (y >> 16) & 16;
571   n = 16 - m;
572   x = x >> m;
573   y = x - 0x100;
574   m = (y >> 16) & 8;
575   n = n + m;
576   x = x << m;
577   y = x - 0x1000;
578   m = (y >> 16) & 4;
579   n = n + m;
580   x = x << m;
581   y = x - 0x4000;
582   m = (y >> 16) & 2;
583   n = n + m;
584   x = x << m;
585   y = x >> 14;
586   m = y & ~(y >> 1);
587   return n + 2 - m;
588}
589
590static UInt ctz32 ( UInt x )
591{
592   return 32 - clz32((~x) & (x-1));
593}
594
595/* Convert a 4-bit value to a 32-bit value by cloning each bit 8
596   times.  There's surely a better way to do this, but I don't know
597   what it is. */
598static UInt bits4_to_bytes4 ( UInt bits4 )
599{
600   UInt r = 0;
601   r |= (bits4 & 1) ? 0x000000FF : 0;
602   r |= (bits4 & 2) ? 0x0000FF00 : 0;
603   r |= (bits4 & 4) ? 0x00FF0000 : 0;
604   r |= (bits4 & 8) ? 0xFF000000 : 0;
605   return r;
606}
607
608
609/* Convert a 2-bit value to a 32-bit value by cloning each bit 16
610   times.  There's surely a better way to do this, but I don't know
611   what it is. */
612static UInt bits2_to_bytes4 ( UInt bits2 )
613{
614   UInt r = 0;
615   r |= (bits2 & 1) ? 0x0000FFFF : 0;
616   r |= (bits2 & 2) ? 0xFFFF0000 : 0;
617   return r;
618}
619
620
621/* Given partial results from a pcmpXstrX operation (intRes1,
622   basically), generate an I- or M-format output value, also the new
623   OSZACP flags.  */
624static
625void compute_PCMPxSTRx_gen_output (/*OUT*/V128* resV,
626                                   /*OUT*/UInt* resOSZACP,
627                                   UInt intRes1,
628                                   UInt zmaskL, UInt zmaskR,
629                                   UInt validL,
630                                   UInt pol, UInt idx,
631                                   Bool isxSTRM )
632{
633   vassert((pol >> 2) == 0);
634   vassert((idx >> 1) == 0);
635
636   UInt intRes2 = 0;
637   switch (pol) {
638      case 0: intRes2 = intRes1;          break; // pol +
639      case 1: intRes2 = ~intRes1;         break; // pol -
640      case 2: intRes2 = intRes1;          break; // pol m+
641      case 3: intRes2 = intRes1 ^ validL; break; // pol m-
642   }
643   intRes2 &= 0xFFFF;
644
645   if (isxSTRM) {
646
647      // generate M-format output (a bit or byte mask in XMM0)
648      if (idx) {
649         resV->w32[0] = bits4_to_bytes4( (intRes2 >>  0) & 0xF );
650         resV->w32[1] = bits4_to_bytes4( (intRes2 >>  4) & 0xF );
651         resV->w32[2] = bits4_to_bytes4( (intRes2 >>  8) & 0xF );
652         resV->w32[3] = bits4_to_bytes4( (intRes2 >> 12) & 0xF );
653      } else {
654         resV->w32[0] = intRes2 & 0xFFFF;
655         resV->w32[1] = 0;
656         resV->w32[2] = 0;
657         resV->w32[3] = 0;
658      }
659
660   } else {
661
662      // generate I-format output (an index in ECX)
663      // generate ecx value
664      UInt newECX = 0;
665      if (idx) {
666         // index of ms-1-bit
667         newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
668      } else {
669         // index of ls-1-bit
670         newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
671      }
672
673      resV->w32[0] = newECX;
674      resV->w32[1] = 0;
675      resV->w32[2] = 0;
676      resV->w32[3] = 0;
677
678   }
679
680   // generate new flags, common to all ISTRI and ISTRM cases
681   *resOSZACP    // A, P are zero
682     = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
683     | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
684     | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
685     | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
686}
687
688
689/* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
690   basically), generate an I- or M-format output value, also the new
691   OSZACP flags.  */
692static
693void compute_PCMPxSTRx_gen_output_wide (/*OUT*/V128* resV,
694                                        /*OUT*/UInt* resOSZACP,
695                                        UInt intRes1,
696                                        UInt zmaskL, UInt zmaskR,
697                                        UInt validL,
698                                        UInt pol, UInt idx,
699                                        Bool isxSTRM )
700{
701   vassert((pol >> 2) == 0);
702   vassert((idx >> 1) == 0);
703
704   UInt intRes2 = 0;
705   switch (pol) {
706      case 0: intRes2 = intRes1;          break; // pol +
707      case 1: intRes2 = ~intRes1;         break; // pol -
708      case 2: intRes2 = intRes1;          break; // pol m+
709      case 3: intRes2 = intRes1 ^ validL; break; // pol m-
710   }
711   intRes2 &= 0xFF;
712
713   if (isxSTRM) {
714
715      // generate M-format output (a bit or byte mask in XMM0)
716      if (idx) {
717         resV->w32[0] = bits2_to_bytes4( (intRes2 >> 0) & 0x3 );
718         resV->w32[1] = bits2_to_bytes4( (intRes2 >> 2) & 0x3 );
719         resV->w32[2] = bits2_to_bytes4( (intRes2 >> 4) & 0x3 );
720         resV->w32[3] = bits2_to_bytes4( (intRes2 >> 6) & 0x3 );
721      } else {
722         resV->w32[0] = intRes2 & 0xFF;
723         resV->w32[1] = 0;
724         resV->w32[2] = 0;
725         resV->w32[3] = 0;
726      }
727
728   } else {
729
730      // generate I-format output (an index in ECX)
731      // generate ecx value
732      UInt newECX = 0;
733      if (idx) {
734         // index of ms-1-bit
735         newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2));
736      } else {
737         // index of ls-1-bit
738         newECX = intRes2 == 0 ? 8 : ctz32(intRes2);
739      }
740
741      resV->w32[0] = newECX;
742      resV->w32[1] = 0;
743      resV->w32[2] = 0;
744      resV->w32[3] = 0;
745
746   }
747
748   // generate new flags, common to all ISTRI and ISTRM cases
749   *resOSZACP    // A, P are zero
750     = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
751     | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
752     | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
753     | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
754}
755
756
757/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
758   variants on 8-bit data.
759
760   For xSTRI variants, the new ECX value is placed in the 32 bits
761   pointed to by *resV, and the top 96 bits are zeroed.  For xSTRM
762   variants, the result is a 128 bit value and is placed at *resV in
763   the obvious way.
764
765   For all variants, the new OSZACP value is placed at *resOSZACP.
766
767   argLV and argRV are the vector args.  The caller must prepare a
768   16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
   must be 1 for each zero byte of the respective arg.  For ESTRx
770   variants this is derived from the explicit length indication, and
771   must be 0 in all places except at the bit index corresponding to
772   the valid length (0 .. 16).  If the valid length is 16 then the
773   mask must be all zeroes.  In all cases, bits 31:16 must be zero.
774
   imm8 is the original immediate from the instruction.  isxSTRM
   indicates whether this is a xSTRM or xSTRI variant, which controls
   how much of *resV is written.

   If the given imm8 case can be handled, the return value is True.
   If not, False is returned, and neither *resV nor *resOSZACP are
   altered.
782*/
783
784Bool compute_PCMPxSTRx ( /*OUT*/V128* resV,
785                         /*OUT*/UInt* resOSZACP,
786                         V128* argLV,  V128* argRV,
787                         UInt zmaskL, UInt zmaskR,
788                         UInt imm8,   Bool isxSTRM )
789{
790   vassert(imm8 < 0x80);
791   vassert((zmaskL >> 16) == 0);
792   vassert((zmaskR >> 16) == 0);
793
794   /* Explicitly reject any imm8 values that haven't been validated,
795      even if they would probably work.  Life is too short to have
796      unvalidated cases in the code base. */
797   switch (imm8) {
798      case 0x00: case 0x02:
799      case 0x08: case 0x0A: case 0x0C: case 0x0E:
800      case 0x10: case 0x12: case 0x14:
801      case 0x18: case 0x1A:
802      case 0x30:            case 0x34:
803      case 0x38: case 0x3A:
804      case 0x40: case 0x42: case 0x44: case 0x46:
805                 case 0x4A:
806                 case 0x62:
807      case 0x70: case 0x72:
808         break;
809      default:
810         return False;
811   }
812
813   UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
814   UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
815   UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
816   UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
817
818   /*----------------------------------------*/
819   /*-- strcmp on byte data                --*/
820   /*----------------------------------------*/
821
822   if (agg == 2/*equal each, aka strcmp*/
823       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
824      Int    i;
825      UChar* argL = (UChar*)argLV;
826      UChar* argR = (UChar*)argRV;
827      UInt boolResII = 0;
828      for (i = 15; i >= 0; i--) {
829         UChar cL  = argL[i];
830         UChar cR  = argR[i];
831         boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
832      }
833      UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
834      UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
835
836      // do invalidation, common to all equal-each cases
837      UInt intRes1
838         = (boolResII & validL & validR)  // if both valid, use cmpres
839           | (~ (validL | validR));       // if both invalid, force 1
840                                          // else force 0
841      intRes1 &= 0xFFFF;
842
843      // generate I-format output
844      compute_PCMPxSTRx_gen_output(
845         resV, resOSZACP,
846         intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
847      );
848
849      return True;
850   }
851
852   /*----------------------------------------*/
853   /*-- set membership on byte data        --*/
854   /*----------------------------------------*/
855
856   if (agg == 0/*equal any, aka find chars in a set*/
857       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
858      /* argL: the string,  argR: charset */
859      UInt   si, ci;
860      UChar* argL    = (UChar*)argLV;
861      UChar* argR    = (UChar*)argRV;
862      UInt   boolRes = 0;
863      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
864      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
865
866      for (si = 0; si < 16; si++) {
867         if ((validL & (1 << si)) == 0)
868            // run off the end of the string.
869            break;
870         UInt m = 0;
871         for (ci = 0; ci < 16; ci++) {
872            if ((validR & (1 << ci)) == 0) break;
873            if (argR[ci] == argL[si]) { m = 1; break; }
874         }
875         boolRes |= (m << si);
876      }
877
878      // boolRes is "pre-invalidated"
879      UInt intRes1 = boolRes & 0xFFFF;
880
881      // generate I-format output
882      compute_PCMPxSTRx_gen_output(
883         resV, resOSZACP,
884         intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
885      );
886
887      return True;
888   }
889
890   /*----------------------------------------*/
891   /*-- substring search on byte data      --*/
892   /*----------------------------------------*/
893
894   if (agg == 3/*equal ordered, aka substring search*/
895       && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
896
897      /* argL: haystack,  argR: needle */
898      UInt   ni, hi;
899      UChar* argL    = (UChar*)argLV;
900      UChar* argR    = (UChar*)argRV;
901      UInt   boolRes = 0;
902      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
903      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
904      for (hi = 0; hi < 16; hi++) {
905         UInt m = 1;
906         for (ni = 0; ni < 16; ni++) {
907            if ((validR & (1 << ni)) == 0) break;
908            UInt i = ni + hi;
909            if (i >= 16) break;
910            if (argL[i] != argR[ni]) { m = 0; break; }
911         }
912         boolRes |= (m << hi);
913         if ((validL & (1 << hi)) == 0)
914            // run off the end of the haystack
915            break;
916      }
917
918      // boolRes is "pre-invalidated"
919      UInt intRes1 = boolRes & 0xFFFF;
920
921      // generate I-format output
922      compute_PCMPxSTRx_gen_output(
923         resV, resOSZACP,
924         intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
925      );
926
927      return True;
928   }
929
930   /*----------------------------------------*/
931   /*-- ranges, unsigned byte data         --*/
932   /*----------------------------------------*/
933
934   if (agg == 1/*ranges*/
935       && fmt == 0/*ub*/) {
936
937      /* argL: string,  argR: range-pairs */
938      UInt   ri, si;
939      UChar* argL    = (UChar*)argLV;
940      UChar* argR    = (UChar*)argRV;
941      UInt   boolRes = 0;
942      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
943      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
944      for (si = 0; si < 16; si++) {
945         if ((validL & (1 << si)) == 0)
946            // run off the end of the string
947            break;
948         UInt m = 0;
949         for (ri = 0; ri < 16; ri += 2) {
950            if ((validR & (3 << ri)) != (3 << ri)) break;
951            if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
952               m = 1; break;
953            }
954         }
955         boolRes |= (m << si);
956      }
957
958      // boolRes is "pre-invalidated"
959      UInt intRes1 = boolRes & 0xFFFF;
960
961      // generate I-format output
962      compute_PCMPxSTRx_gen_output(
963         resV, resOSZACP,
964         intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
965      );
966
967      return True;
968   }
969
970   /*----------------------------------------*/
971   /*-- ranges, signed byte data           --*/
972   /*----------------------------------------*/
973
974   if (agg == 1/*ranges*/
975       && fmt == 2/*sb*/) {
976
977      /* argL: string,  argR: range-pairs */
978      UInt   ri, si;
979      Char*  argL    = (Char*)argLV;
980      Char*  argR    = (Char*)argRV;
981      UInt   boolRes = 0;
982      UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
983      UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
984      for (si = 0; si < 16; si++) {
985         if ((validL & (1 << si)) == 0)
986            // run off the end of the string
987            break;
988         UInt m = 0;
989         for (ri = 0; ri < 16; ri += 2) {
990            if ((validR & (3 << ri)) != (3 << ri)) break;
991            if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
992               m = 1; break;
993            }
994         }
995         boolRes |= (m << si);
996      }
997
998      // boolRes is "pre-invalidated"
999      UInt intRes1 = boolRes & 0xFFFF;
1000
1001      // generate I-format output
1002      compute_PCMPxSTRx_gen_output(
1003         resV, resOSZACP,
1004         intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1005      );
1006
1007      return True;
1008   }
1009
1010   return False;
1011}
1012
1013
1014/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
1015   variants on 16-bit characters.
1016
1017   For xSTRI variants, the new ECX value is placed in the 32 bits
1018   pointed to by *resV, and the top 96 bits are zeroed.  For xSTRM
1019   variants, the result is a 128 bit value and is placed at *resV in
1020   the obvious way.
1021
1022   For all variants, the new OSZACP value is placed at *resOSZACP.
1023
   argLV and argRV are the vector args.  The caller must prepare an
   8-bit mask for each, zmaskL and zmaskR, with one bit per 16-bit
   element.  For ISTRx variants this must be 1 for each zero 16-bit
   element of the respective arg.  For ESTRx variants this is derived
   from the explicit length indication, and must be 0 in all places
   except at the bit index corresponding to the valid length
   (0 .. 8).  If the valid length is 8 then the mask must be all
   zeroes.  In all cases, bits 31:8 must be zero.
1031
   imm8 is the original immediate from the instruction.  isxSTRM
   indicates whether this is an xSTRM or xSTRI variant, which controls
   how much of *resV is written.
1035
   If the given imm8 case can be handled, the return value is True.
   If not, False is returned, and neither *resV nor *resOSZACP is
   altered.
1039*/
1040
1041Bool compute_PCMPxSTRx_wide ( /*OUT*/V128* resV,
1042                              /*OUT*/UInt* resOSZACP,
1043                              V128* argLV,  V128* argRV,
1044                              UInt zmaskL, UInt zmaskR,
1045                              UInt imm8,   Bool isxSTRM )
1046{
1047   vassert(imm8 < 0x80);
1048   vassert((zmaskL >> 8) == 0);
1049   vassert((zmaskR >> 8) == 0);
1050
1051   /* Explicitly reject any imm8 values that haven't been validated,
1052      even if they would probably work.  Life is too short to have
1053      unvalidated cases in the code base. */
1054   switch (imm8) {
1055      case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
1056                 case 0x13: case 0x19: case 0x1B:
1057                            case 0x39: case 0x3B:
1058                 case 0x45:            case 0x4B:
1059         break;
1060      default:
1061         return False;
1062   }
1063
1064   UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
1065   UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
1066   UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
1067   UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
1068
1069   /*----------------------------------------*/
1070   /*-- strcmp on wide data                --*/
1071   /*----------------------------------------*/
1072
1073   if (agg == 2/*equal each, aka strcmp*/
1074       && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
1075      Int     i;
1076      UShort* argL = (UShort*)argLV;
1077      UShort* argR = (UShort*)argRV;
1078      UInt boolResII = 0;
1079      for (i = 7; i >= 0; i--) {
1080         UShort cL  = argL[i];
1081         UShort cR  = argR[i];
1082         boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
1083      }
1084      UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
1085      UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
1086
1087      // do invalidation, common to all equal-each cases
1088      UInt intRes1
1089         = (boolResII & validL & validR)  // if both valid, use cmpres
1090           | (~ (validL | validR));       // if both invalid, force 1
1091                                          // else force 0
1092      intRes1 &= 0xFF;
1093
1094      // generate I-format output
1095      compute_PCMPxSTRx_gen_output_wide(
1096         resV, resOSZACP,
1097         intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1098      );
1099
1100      return True;
1101   }
1102
1103   /*----------------------------------------*/
1104   /*-- set membership on wide data        --*/
1105   /*----------------------------------------*/
1106
1107   if (agg == 0/*equal any, aka find chars in a set*/
1108       && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
1109      /* argL: the string,  argR: charset */
1110      UInt    si, ci;
1111      UShort* argL    = (UShort*)argLV;
1112      UShort* argR    = (UShort*)argRV;
1113      UInt    boolRes = 0;
1114      UInt    validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
1115      UInt    validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
1116
1117      for (si = 0; si < 8; si++) {
1118         if ((validL & (1 << si)) == 0)
1119            // run off the end of the string.
1120            break;
1121         UInt m = 0;
1122         for (ci = 0; ci < 8; ci++) {
1123            if ((validR & (1 << ci)) == 0) break;
1124            if (argR[ci] == argL[si]) { m = 1; break; }
1125         }
1126         boolRes |= (m << si);
1127      }
1128
1129      // boolRes is "pre-invalidated"
1130      UInt intRes1 = boolRes & 0xFF;
1131
1132      // generate I-format output
1133      compute_PCMPxSTRx_gen_output_wide(
1134         resV, resOSZACP,
1135         intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1136      );
1137
1138      return True;
1139   }
1140
1141   /*----------------------------------------*/
1142   /*-- substring search on wide data      --*/
1143   /*----------------------------------------*/
1144
1145   if (agg == 3/*equal ordered, aka substring search*/
1146       && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
1147
1148      /* argL: haystack,  argR: needle */
1149      UInt    ni, hi;
1150      UShort* argL    = (UShort*)argLV;
1151      UShort* argR    = (UShort*)argRV;
1152      UInt    boolRes = 0;
1153      UInt    validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
1154      UInt    validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
1155      for (hi = 0; hi < 8; hi++) {
1156         UInt m = 1;
1157         for (ni = 0; ni < 8; ni++) {
1158            if ((validR & (1 << ni)) == 0) break;
1159            UInt i = ni + hi;
1160            if (i >= 8) break;
1161            if (argL[i] != argR[ni]) { m = 0; break; }
1162         }
1163         boolRes |= (m << hi);
1164         if ((validL & (1 << hi)) == 0)
1165            // run off the end of the haystack
1166            break;
1167      }
1168
1169      // boolRes is "pre-invalidated"
1170      UInt intRes1 = boolRes & 0xFF;
1171
1172      // generate I-format output
1173      compute_PCMPxSTRx_gen_output_wide(
1174         resV, resOSZACP,
1175         intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1176      );
1177
1178      return True;
1179   }
1180
1181   /*----------------------------------------*/
1182   /*-- ranges, unsigned wide data         --*/
1183   /*----------------------------------------*/
1184
1185   if (agg == 1/*ranges*/
1186       && fmt == 1/*uw*/) {
1187
1188      /* argL: string,  argR: range-pairs */
1189      UInt    ri, si;
1190      UShort* argL    = (UShort*)argLV;
1191      UShort* argR    = (UShort*)argRV;
1192      UInt    boolRes = 0;
1193      UInt    validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
1194      UInt    validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
1195      for (si = 0; si < 8; si++) {
1196         if ((validL & (1 << si)) == 0)
1197            // run off the end of the string
1198            break;
1199         UInt m = 0;
1200         for (ri = 0; ri < 8; ri += 2) {
1201            if ((validR & (3 << ri)) != (3 << ri)) break;
1202            if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
1203               m = 1; break;
1204            }
1205         }
1206         boolRes |= (m << si);
1207      }
1208
1209      // boolRes is "pre-invalidated"
1210      UInt intRes1 = boolRes & 0xFF;
1211
1212      // generate I-format output
1213      compute_PCMPxSTRx_gen_output_wide(
1214         resV, resOSZACP,
1215         intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
1216      );
1217
1218      return True;
1219   }
1220
1221   return False;
1222}
1223
1224
1225/*---------------------------------------------------------------*/
1226/*--- end                                 guest_generic_x87.c ---*/
1227/*---------------------------------------------------------------*/
1228