1/*  Copyright (C) 2011 IBM
2
3 Author: Maynard Johnson <maynardj@us.ibm.com>
4
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307, USA.
19
20 The GNU General Public License is contained in the file COPYING.
21 */
22
23#ifdef HAS_VSX
24
25#include <stdio.h>
26#include <stdint.h>
27#include <stdlib.h>
28#include <string.h>
29#include <malloc.h>
30#include <altivec.h>
31#include <math.h>
32#include <unistd.h>    // getopt
33
/* HWord_t: 32-bit unsigned unless __powerpc64__ is defined, in which case
 * it is 64-bit unsigned. */
#ifndef __powerpc64__
typedef uint32_t HWord_t;
#else
typedef uint64_t HWord_t;
#endif /* __powerpc64__ */

/* isLE is 1 when VGP_ppc64le_linux is defined at build time, 0 otherwise. */
#ifdef VGP_ppc64le_linux
#define isLE 1
#else
#define isLE 0
#endif

/* Minimal boolean type for this file. */
typedef unsigned char Bool;
#define True 1
#define False 0
/* Global register variables bound to GPRs r14-r17. */
register HWord_t r14 __asm__ ("r14");
register HWord_t r15 __asm__ ("r15");
register HWord_t r16 __asm__ ("r16");
register HWord_t r17 __asm__ ("r17");
/* Global register variables bound to FPRs fr14-fr17. */
register double f14 __asm__ ("fr14");
register double f15 __asm__ ("fr15");
register double f16 __asm__ ("fr16");
register double f17 __asm__ ("fr17");

/* NOTE(review): presumably receive the CR and XER values captured after the
 * divide tests (e.g. via GET_CR_XER) -- confirm against the users further
 * down the file. */
static volatile unsigned int div_flags, div_xer;
59
/* Clobber list naming all eight condition-register fields. */
#define ALLCR "cr0","cr1","cr2","cr3","cr4","cr5","cr6","cr7"

/* Write _arg to the condition register (mtcr); all CR fields are declared
 * clobbered.
 * NOTE(review): the expansion already ends in ';', so "SET_CR(x);" produces
 * an extra empty statement.  That is harmless in the uses visible here but
 * would break an unbraced if/else around the macro. */
#define SET_CR(_arg) \
      __asm__ __volatile__ ("mtcr  %0" : : "b"(_arg) : ALLCR );

/* Write _arg to XER (mtxer).  Same trailing-';' caveat as SET_CR. */
#define SET_XER(_arg) \
      __asm__ __volatile__ ("mtxer %0" : : "b"(_arg) : "xer" );

/* Read the condition register into _lval (mfcr). */
#define GET_CR(_lval) \
      __asm__ __volatile__ ("mfcr %0"  : "=b"(_lval) )

/* Read XER into _lval (mfxer). */
#define GET_XER(_lval) \
      __asm__ __volatile__ ("mfxer %0" : "=b"(_lval) )

/* Read CR into _lval_cr, then XER into _lval_xer. */
#define GET_CR_XER(_lval_cr,_lval_xer) \
   do { GET_CR(_lval_cr); GET_XER(_lval_xer); } while (0)

/* Clear the condition register. */
#define SET_CR_ZERO \
      SET_CR(0)

/* Clear XER. */
#define SET_XER_ZERO \
      SET_XER(0)

/* Clear both CR and XER. */
#define SET_CR_XER_ZERO \
   do { SET_CR_ZERO; SET_XER_ZERO; } while (0)

/* Issue "mtfsf 0xFF" with a floating point register holding 0.0 as the
 * source, i.e. the FPSCR fields selected by mask 0xFF are written from
 * zero. */
#define SET_FPSCR_ZERO \
   do { double _d = 0.0; \
        __asm__ __volatile__ ("mtfsf 0xFF, %0" : : "f"(_d) ); \
   } while (0)
90
91
/* Signature shared by every test routine: no arguments, no return value. */
typedef void (*test_func_t)(void);
/* Forward typedef; struct test_table is defined further down. */
typedef struct test_table test_table_t;

/* Defines for the instruction groups; each group is a distinct bit so they
 * can be combined in a bit field. */
#define SCALAR_DIV_INST    0x0001
#define OTHER_INST  0x0002
98
/* The functions below, which construct a table of floating point
 * values, were lifted from none/tests/ppc32/jm-insns.c.
 */
102
/* AB_DPRINTF: printf-style tracing to stderr, compiled in only when
 * DEBUG_ARGS_BUILD is defined; otherwise it expands to an empty statement
 * and its arguments are not evaluated.
 */
#if defined (DEBUG_ARGS_BUILD)
#define AB_DPRINTF(fmt, args...) do { fprintf(stderr, fmt , ##args); } while (0)
#else
#define AB_DPRINTF(fmt, args...) do { } while (0)
#endif

/* Assemble a 64-bit double-precision bit pattern from its fields and store
 * it at farg.
 *
 *   farg : destination; must have room for 8 bytes
 *   s    : sign, placed in bit 63
 *   _exp : biased exponent, placed starting at bit 52
 *   mant : fraction, OR-ed into the low bits (the caller keeps it within
 *          52 bits)
 *
 * The pattern is copied with memcpy rather than stored through a casted
 * pointer, so the destination may legitimately be a double object without
 * violating the aliasing rules.
 */
static inline void register_farg (void *farg,
                                  int s, uint16_t _exp, uint64_t mant)
{
   uint64_t bits;
   double as_double;

   bits = ((uint64_t)s << 63) | ((uint64_t)_exp << 52) | mant;
   memcpy(farg, &bits, sizeof bits);

   /* Only needed for the debug trace below. */
   memcpy(&as_double, &bits, sizeof as_double);

   /* uint64_t is not necessarily "unsigned long long"; cast so the
    * arguments match the %llx conversions. */
   AB_DPRINTF("%d %03x %013llx => %016llx %0e\n",
              s, (unsigned int)_exp, (unsigned long long)mant,
              (unsigned long long)bits, as_double);
}
119
/* Assemble a 32-bit single-precision bit pattern from its fields and store
 * it at farg.
 *
 *   farg : destination; must have room for 4 bytes
 *   s    : sign, placed in bit 31
 *   _exp : biased exponent, placed starting at bit 23
 *   mant : fraction, OR-ed into the low bits (the caller keeps it within
 *          23 bits)
 *
 * memcpy is used for the store so the destination may be a float object
 * without type punning through a casted pointer.
 */
static inline void register_sp_farg (void *farg,
                                     int s, uint16_t _exp, uint32_t mant)
{
   uint32_t bits = ((uint32_t)s << 31) | ((uint32_t)_exp << 23) | mant;

   memcpy(farg, &bits, sizeof bits);
}
127
128
/* One pair of operand indices for a two-operand floating point test.
 * NOTE(review): the names suggest the indices select the FRA and FRB
 * operands from the special-value table -- confirm against the test driver.
 */
typedef struct fp_test_args {
   int fra_idx;   /* index of the first operand */
   int frb_idx;   /* index of the second operand */
} fp_test_args_t;
133
134
/* Operand-index pairs {fra_idx, frb_idx} for the two-operand tests.
 * The table has 68 entries, matching the num_tests value given for the
 * xvtdivdp/xvtdivsp entries of vx_tdivORtsqrt_tests.  All indices are in the
 * range 0..19, i.e. within the 20-entry special-value table.
 */
fp_test_args_t two_arg_fp_tests[] = {
                                     {8, 8},
                                     {8, 14},
                                     {15, 16},
                                     {8, 5},
                                     {8, 4},
                                     {8, 7},
                                     {8, 9},
                                     {8, 11},
                                     {14, 8},
                                     {14, 14},
                                     {14, 6},
                                     {14, 5},
                                     {14, 4},
                                     {14, 7},
                                     {14, 9},
                                     {14, 11},
                                     {6, 8},
                                     {6, 14},
                                     {6, 6},
                                     {6, 5},
                                     {6, 4},
                                     {6, 7},
                                     {6, 9},
                                     {6, 11},
                                     {5, 8},
                                     {5, 14},
                                     {5, 6},
                                     {5, 5},
                                     {5, 4},
                                     {5, 7},
                                     {5, 9},
                                     {5, 11},
                                     {4, 8},
                                     {4, 14},
                                     {4, 6},
                                     {4, 5},
                                     {4, 1},
                                     {4, 7},
                                     {4, 9},
                                     {4, 11},
                                     {7, 8},
                                     {7, 14},
                                     {7, 6},
                                     {7, 5},
                                     {7, 4},
                                     {7, 7},
                                     {7, 9},
                                     {7, 11},
                                     {10, 8},
                                     {10, 14},
                                     {12, 6},
                                     {12, 5},
                                     {10, 4},
                                     {10, 7},
                                     {10, 9},
                                     {10, 11},
                                     {12, 8 },
                                     {12, 14},
                                     {12, 6},
                                     {15, 16},
                                     {15, 16},
                                     {9, 11},
                                     {11, 11},
                                     {11, 12},
                                     {16, 18},
                                     {17, 16},
                                     {19, 19},
                                     {19, 18}
};
205
206
/* Number of entries in spec_fargs / spec_sp_fargs (set once the tables are
 * built). */
static int nb_special_fargs;
/* Double-precision special values; NULL until build_special_fargs_table()
 * has run. */
static double * spec_fargs;
/* Single-precision counterparts of spec_fargs, entry for entry. */
static float * spec_sp_fargs;

/* Build the double- and single-precision special-value tables.
 *
 * The first call allocates both tables and fills them from the field table
 * below; later calls return immediately.  If either allocation fails, a
 * message is printed to stderr and the program exits with status 1 (the
 * tables are only published once both allocations have succeeded).
 */
static void build_special_fargs_table(void)
{
   /* Raw double-precision fields of every table entry. */
   static const struct {
      int s;           /* sign bit */
      uint16_t expo;   /* biased 11-bit exponent */
      uint64_t mant;   /* 52-bit fraction */
   } dp_fields[] = {
      /* Entry    Sign Exp    Fraction                Special value */
      /*  0 */ { 0, 0x3fd, 0x8000000000000ULL },   /* Positive finite number */
      /*  1 */ { 0, 0x404, 0xf000000000000ULL },   /* ... */
      /*  2 */ { 0, 0x001, 0x8000000b77501ULL },   /* ... */
      /*  3 */ { 0, 0x7fe, 0x800000000051bULL },   /* ... */
      /*  4 */ { 0, 0x012, 0x3214569900000ULL },   /* ... */
      /*  5 */ { 0, 0x000, 0x0000000000000ULL },   /* +0.0 (+zero) */
      /*  6 */ { 1, 0x000, 0x0000000000000ULL },   /* -0.0 (-zero) */
      /*  7 */ { 0, 0x7FF, 0x0000000000000ULL },   /* +infinity */
      /*  8 */ { 1, 0x7FF, 0x0000000000000ULL },   /* -infinity */
      /*  9 */ { 0, 0x7FF, 0x7FFFFFFFFFFFFULL },   /* +SNaN */
      /* 10 */ { 1, 0x7FF, 0x7FFFFFFFFFFFFULL },   /* -SNaN */
      /* 11 */ { 0, 0x7FF, 0x8000000000000ULL },   /* +QNaN */
      /* 12 */ { 1, 0x7FF, 0x8000000000000ULL },   /* -QNaN */
      /* 13 */ { 1, 0x000, 0x8340000078000ULL },   /* Denormalized val (zero exp and non-zero fraction) */
      /* 14 */ { 1, 0x40d, 0x0650f5a07b353ULL },   /* Negative finite number */
      /* 15 */ { 0, 0x412, 0x32585a9900000ULL },   /* A few more positive finite numbers */
      /* 16 */ { 0, 0x413, 0x82511a2000000ULL },   /* ... */
      /* 17 */ { 0, 0x403, 0x12ef5a9300000ULL },   /* ... */
      /* 18 */ { 0, 0x405, 0x14bf5d2300000ULL },   /* ... */
      /* 19 */ { 0, 0x409, 0x76bf982440000ULL }    /* ... */
   };
   const int count = (int)(sizeof dp_fields / sizeof dp_fields[0]);
   double * dp;
   float * sp;
   int j;

   if (spec_fargs)
      return;

   dp = malloc( count * sizeof *dp );
   sp = malloc( count * sizeof *sp );
   if (dp == NULL || sp == NULL) {
      fprintf(stderr, "build_special_fargs_table: out of memory\n");
      exit(1);
   }

   for (j = 0; j < count; j++) {
      register_farg(&dp[j], dp_fields[j].s, dp_fields[j].expo, dp_fields[j].mant);

      if (j == 9 || j == 10) {
         /*
          * Entries #9 and #10 are SNaNs.  When src is a SNaN, it's converted
          * to a QNaN first before rounding to single-precision, so we can't
          * just copy the double-precision value to the corresponding slot in
          * the single-precision array.  Instead, the single-precision bits
          * are set manually using register_sp_farg().
          */
         register_sp_farg(&sp[j], dp_fields[j].s, 0xff, 0x3FFFFF);
      } else {
         /* Ordinary double -> float conversion. */
         sp[j] = dp[j];
      }
   }

   spec_fargs = dp;
   spec_sp_fargs = sp;
   nb_special_fargs = count;
}
402
403
/* One entry of a top-level test table: a test routine, a name for it, and
 * a group value.
 */
struct test_table
{
   test_func_t test_category;   /* routine to call */
   char * name;                 /* name string for the entry */
   /* NOTE(review): presumably a mask built from SCALAR_DIV_INST /
    * OTHER_INST -- confirm against the table that uses this struct. */
   unsigned int test_group;
};

/*  Type of input for floating point operations.*/
typedef enum {
   SINGLE_TEST,
   DOUBLE_TEST
} precision_type_t;

/* Test categories recorded in the "type" field of the test descriptors
 * (vx_fp_test_t / vx_intToFp_test_t). */
typedef enum {
   VX_SCALAR_CONV_TO_WORD,
   VX_CONV_TO_SINGLE,
   VX_CONV_TO_DOUBLE,
   VX_ESTIMATE,
   VX_DEFAULT
} vx_fp_test_type;

/* Vector registers shared between the driver and the asm test routines:
 * the routines read vec_inA / vec_inB and write vec_out. */
static vector unsigned int vec_out, vec_inA, vec_inB;
426
427/* This function is for checking the reciprocal and reciprocal square root
428 * estimate instructions.
429 */
430Bool check_estimate(precision_type_t type, Bool is_rsqrte, int idx, int output_vec_idx)
431{
432   /* Technically, the number of bits of precision for xvredp and xvrsqrtedp is
433    * 14 bits (14 = log2 16384).  However, the VEX emulation of these instructions
434    * does an actual reciprocal calculation versus estimation, so the answer we get back from
435    * valgrind can easily differ from the estimate in the lower bits (within the 14 bits of
436    * precision) and the estimate may still be within expected tolerances.  On top of that,
437    * we can't count on these estimates always being the same across implementations.
438    * For example, with the fre[s] instruction (which should be correct to within one part
439    * in 256 -- i.e., 8 bits of precision) . . . When approximating the value 1.0111_1111_1111,
440    * one implementation could return 1.0111_1111_0000 and another implementation could return
441    * 1.1000_0000_0000.  Both estimates meet the 1/256 accuracy requirement, but share only a
442    * single bit in common.
443    *
444    * The upshot is we can't validate the VEX output for these instructions by comparing against
445    * stored bit patterns.  We must check that the result is within expected tolerances.
446    */
447
448
449   /* A mask to be used for validation as a last resort.
450    * Only use 12 bits of precision for reasons discussed above.
451    */
452#define VSX_RECIP_ESTIMATE_MASK_DP 0xFFFFFF0000000000ULL
453#define VSX_RECIP_ESTIMATE_MASK_SP 0xFFFFFF00
454
455   Bool result = False;
456   Bool dp_test = type == DOUBLE_TEST;
457   double src_dp, res_dp;
458   float src_sp, res_sp;
459   src_dp = res_dp = 0;
460   src_sp = res_sp = 0;
461#define SRC (dp_test ? src_dp : src_sp)
462#define RES (dp_test ? res_dp : res_sp)
463   Bool src_is_negative = False;
464   Bool res_is_negative = False;
465   unsigned long long * dst_dp = NULL;
466   unsigned int * dst_sp = NULL;
467   if (dp_test) {
468      unsigned long long * src_dp_ull;
469      dst_dp = (unsigned long long *) &vec_out;
470      src_dp = spec_fargs[idx];
471      src_dp_ull = (unsigned long long *) &src_dp;
472      src_is_negative = (*src_dp_ull & 0x8000000000000000ULL) ? True : False;
473      res_is_negative = (dst_dp[output_vec_idx] & 0x8000000000000000ULL) ? True : False;
474      memcpy(&res_dp, &dst_dp[output_vec_idx], 8);
475   } else {
476      unsigned int * src_sp_uint;
477      dst_sp = (unsigned int *) &vec_out;
478      src_sp = spec_sp_fargs[idx];
479      src_sp_uint = (unsigned int *) &src_sp;
480      src_is_negative = (*src_sp_uint & 0x80000000) ? True : False;
481      res_is_negative = (dst_sp[output_vec_idx] & 0x80000000) ? True : False;
482      memcpy(&res_sp, &dst_sp[output_vec_idx], 4);
483   }
484
485   // Below are common rules for xvre{d|s}p and xvrsqrte{d|s}p
486   if (isnan(SRC))
487      return isnan(RES);
488   if (fpclassify(SRC) == FP_ZERO)
489      return isinf(RES);
490   if (!src_is_negative && isinf(SRC))
491      return !res_is_negative && (fpclassify(RES) == FP_ZERO);
492   if (is_rsqrte) {
493      if (src_is_negative)
494         return isnan(RES);
495   } else {
496      if (src_is_negative && isinf(SRC))
497         return res_is_negative && (fpclassify(RES) == FP_ZERO);
498   }
499   if (dp_test) {
500      double calc_diff;
501      double real_diff;
502      double recip_divisor;
503      double div_result;
504      double calc_diff_tmp;
505
506      if (is_rsqrte)
507         recip_divisor = sqrt(src_dp);
508      else
509         recip_divisor = src_dp;
510
511      div_result = 1.0/recip_divisor;
512      calc_diff_tmp = recip_divisor * 16384.0;
513      if (isnormal(calc_diff_tmp)) {
514         calc_diff = fabs(1.0/calc_diff_tmp);
515         real_diff = fabs(res_dp - div_result);
516         result = ( ( res_dp == div_result )
517                  || ( real_diff <= calc_diff ) );
518      } else {
519         /* Unable to compute theoretical difference, so we fall back to masking out
520          * un-precise bits.
521          */
522         unsigned long long * div_result_dp = (unsigned long long *) &div_result;
523         result = (dst_dp[output_vec_idx] & VSX_RECIP_ESTIMATE_MASK_DP) == (*div_result_dp & VSX_RECIP_ESTIMATE_MASK_DP);
524      }
525      /* For debug use . . .
526         if (!result) {
527             unsigned long long * dv = &div_result;
528             unsigned long long * rd = &real_diff;
529             unsigned long long * cd = &calc_diff;
530             printf("\n\t {actual div_result: %016llx; real_diff:  %016llx; calc_diff:  %016llx}\n",
531       *dv, *rd, *cd);
532          }
533       */
534   } else {  // single precision test (only have xvrsqrtesp, since xvresp was implemented in stage 2)
535      float calc_diff;
536      float real_diff;
537      float div_result;
538      float calc_diff_tmp;
539      float recip_divisor = sqrt(src_sp);
540
541      div_result = 1.0/recip_divisor;
542      calc_diff_tmp = recip_divisor * 16384.0;
543      if (isnormal(calc_diff_tmp)) {
544         calc_diff = fabsf(1.0/calc_diff_tmp);
545         real_diff = fabsf(res_sp - div_result);
546         result = ( ( res_sp == div_result )
547                  || ( real_diff <= calc_diff ) );
548      } else {
549         /* Unable to compute theoretical difference, so we fall back to masking out
550          * un-precise bits.
551          */
552         unsigned int * div_result_sp = (unsigned int *) &div_result;
553         result = (dst_sp[output_vec_idx] & VSX_RECIP_ESTIMATE_MASK_SP) == (*div_result_sp & VSX_RECIP_ESTIMATE_MASK_SP);
554      }
555      /* For debug use . . .
556         if (!result) {
557             unsigned long long * dv = &div_result;
558             unsigned long long * rd = &real_diff;
559             unsigned long long * cd = &calc_diff;
560             printf("\n\t {actual div_result: %016llx; real_diff:  %016llx; calc_diff:  %016llx}\n",
561       *dv, *rd, *cd);
562          }
563       */
564   }
565   return result;
566}
567
/* Descriptor for one floating point instruction test. */
typedef struct vx_fp_test
{
   test_func_t test_func;        /* routine that executes the instruction */
   const char * name;            /* instruction mnemonic */
   fp_test_args_t * targs;       /* operand-index pairs, or NULL (one-operand tests) */
   int num_tests;                /* number of test cases for this instruction */
   precision_type_t precision;   /* SINGLE_TEST or DOUBLE_TEST input */
   vx_fp_test_type type;         /* test category */
   const char * op;              /* short operation label, e.g. "1/x", "sqrt", "conv" */
} vx_fp_test_t;


/* When set, test_divdeu() adds DIV_DOT to the divide type it builds. */
static Bool do_dot;
581
/* Execute xvredp with vec_inB as source; the result is left in vec_out. */
static void test_xvredp(void)
{
   __asm__ __volatile__ ("xvredp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* Execute xsredp with vec_inB as source; the result is left in vec_out. */
static void test_xsredp(void)
{
   __asm__ __volatile__ ("xsredp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* Execute xvrsqrtedp with vec_inB as source; the result is left in vec_out. */
static void test_xvrsqrtedp(void)
{
   __asm__ __volatile__ ("xvrsqrtedp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* Execute xsrsqrtedp with vec_inB as source; the result is left in vec_out. */
static void test_xsrsqrtedp(void)
{
   __asm__ __volatile__ ("xsrsqrtedp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* Execute xvrsqrtesp with vec_inB as source; the result is left in vec_out. */
static void test_xvrsqrtesp(void)
{
   __asm__ __volatile__ ("xvrsqrtesp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
606
607static void test_xstsqrtdp(void)
608{
609   __asm__ __volatile__ ("xstsqrtdp   cr1, %x0" : : "wa" (vec_inB));
610}
611
612static void test_xvtsqrtdp(void)
613{
614   __asm__ __volatile__ ("xvtsqrtdp   cr1, %x0" : : "wa" (vec_inB));
615}
616
617static void test_xvtsqrtsp(void)
618{
619   __asm__ __volatile__ ("xvtsqrtsp   cr1, %x0" : : "wa" (vec_inB));
620}
621
/* Execute xvsqrtdp with vec_inB as source; the result is left in vec_out. */
static void test_xvsqrtdp(void)
{
   __asm__ __volatile__ ("xvsqrtdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* Execute xvsqrtsp with vec_inB as source; the result is left in vec_out. */
static void test_xvsqrtsp(void)
{
   __asm__ __volatile__ ("xvsqrtsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
631
632static void test_xvtdivdp(void)
633{
634   __asm__ __volatile__ ("xvtdivdp   cr1, %x0, %x1" : : "wa" (vec_inA), "wa" (vec_inB));
635}
636
637static void test_xvtdivsp(void)
638{
639   __asm__ __volatile__ ("xvtdivsp   cr1, %x0, %x1" : : "wa" (vec_inA), "wa" (vec_inB));
640}
641
/* Conversion instruction wrappers.  Each routine executes the instruction
 * named in its asm string with vec_inB as the source operand and leaves the
 * result in vec_out.
 */

/* xscvdpsp: vec_inB -> vec_out. */
static void test_xscvdpsp(void)
{
   __asm__ __volatile__ ("xscvdpsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xscvdpuxws: vec_inB -> vec_out. */
static void test_xscvdpuxws(void)
{
   __asm__ __volatile__ ("xscvdpuxws   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xscvspdp: vec_inB -> vec_out. */
static void test_xscvspdp(void)
{
   __asm__ __volatile__ ("xscvspdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvdpsp: vec_inB -> vec_out. */
static void test_xvcvdpsp(void)
{
   __asm__ __volatile__ ("xvcvdpsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvdpuxds: vec_inB -> vec_out. */
static void test_xvcvdpuxds(void)
{
   __asm__ __volatile__ ("xvcvdpuxds   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvdpuxws: vec_inB -> vec_out. */
static void test_xvcvdpuxws(void)
{
   __asm__ __volatile__ ("xvcvdpuxws   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvspdp: vec_inB -> vec_out. */
static void test_xvcvspdp(void)
{
   __asm__ __volatile__ ("xvcvspdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvspsxds: vec_inB -> vec_out. */
static void test_xvcvspsxds(void)
{
   __asm__ __volatile__ ("xvcvspsxds   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvspuxds: vec_inB -> vec_out. */
static void test_xvcvspuxds(void)
{
   __asm__ __volatile__ ("xvcvspuxds   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvdpsxds: vec_inB -> vec_out. */
static void test_xvcvdpsxds(void)
{
   __asm__ __volatile__ ("xvcvdpsxds   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvspuxws: vec_inB -> vec_out. */
static void test_xvcvspuxws(void)
{
   __asm__ __volatile__ ("xvcvspuxws   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvsxddp: vec_inB -> vec_out. */
static void test_xvcvsxddp(void)
{
   __asm__ __volatile__ ("xvcvsxddp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvuxddp: vec_inB -> vec_out. */
static void test_xvcvuxddp(void)
{
   __asm__ __volatile__ ("xvcvuxddp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvsxdsp: vec_inB -> vec_out. */
static void test_xvcvsxdsp(void)
{
   __asm__ __volatile__ ("xvcvsxdsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvuxdsp: vec_inB -> vec_out. */
static void test_xvcvuxdsp(void)
{
   __asm__ __volatile__ ("xvcvuxdsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvsxwdp: vec_inB -> vec_out. */
static void test_xvcvsxwdp(void)
{
   __asm__ __volatile__ ("xvcvsxwdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvuxwdp: vec_inB -> vec_out. */
static void test_xvcvuxwdp(void)
{
   __asm__ __volatile__ ("xvcvuxwdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvsxwsp: vec_inB -> vec_out. */
static void test_xvcvsxwsp(void)
{
   __asm__ __volatile__ ("xvcvsxwsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvcvuxwsp: vec_inB -> vec_out. */
static void test_xvcvuxwsp(void)
{
   __asm__ __volatile__ ("xvcvuxwsp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
736
/* Rounding / abs / nabs / neg instruction wrappers.  Each routine executes
 * the instruction named in its asm string with vec_inB as the source operand
 * and leaves the result in vec_out.
 */

/* xsrdpic: vec_inB -> vec_out. */
static void test_xsrdpic(void)
{
   __asm__ __volatile__ ("xsrdpic   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xsrdpiz: vec_inB -> vec_out. */
static void test_xsrdpiz(void)
{
   __asm__ __volatile__ ("xsrdpiz   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xsrdpi: vec_inB -> vec_out. */
static void test_xsrdpi(void)
{
   __asm__ __volatile__ ("xsrdpi   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvabsdp: vec_inB -> vec_out. */
static void test_xvabsdp(void)
{
   __asm__ __volatile__ ("xvabsdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvnabsdp: vec_inB -> vec_out. */
static void test_xvnabsdp(void)
{
   __asm__ __volatile__ ("xvnabsdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvnegdp: vec_inB -> vec_out. */
static void test_xvnegdp(void)
{
   __asm__ __volatile__ ("xvnegdp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvabssp: vec_inB -> vec_out. */
static void test_xvabssp(void)
{
   __asm__ __volatile__ ("xvabssp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvnabssp: vec_inB -> vec_out. */
static void test_xvnabssp(void)
{
   __asm__ __volatile__ ("xvnabssp   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrdpi: vec_inB -> vec_out. */
static void test_xvrdpi(void)
{
   __asm__ __volatile__ ("xvrdpi   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrdpic: vec_inB -> vec_out. */
static void test_xvrdpic(void)
{
   __asm__ __volatile__ ("xvrdpic   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrdpim: vec_inB -> vec_out. */
static void test_xvrdpim(void)
{
   __asm__ __volatile__ ("xvrdpim   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrdpip: vec_inB -> vec_out. */
static void test_xvrdpip(void)
{
   __asm__ __volatile__ ("xvrdpip   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrdpiz: vec_inB -> vec_out. */
static void test_xvrdpiz(void)
{
   __asm__ __volatile__ ("xvrdpiz   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrspi: vec_inB -> vec_out. */
static void test_xvrspi(void)
{
   __asm__ __volatile__ ("xvrspi   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrspic: vec_inB -> vec_out. */
static void test_xvrspic(void)
{
   __asm__ __volatile__ ("xvrspic   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrspim: vec_inB -> vec_out. */
static void test_xvrspim(void)
{
   __asm__ __volatile__ ("xvrspim   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrspip: vec_inB -> vec_out. */
static void test_xvrspip(void)
{
   __asm__ __volatile__ ("xvrspip   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}

/* xvrspiz: vec_inB -> vec_out. */
static void test_xvrspiz(void)
{
   __asm__ __volatile__ ("xvrspiz   %x0, %x1" : "=wa" (vec_out): "wa" (vec_inB));
}
826
/* Descriptors for the one-operand floating point tests.  Each row gives the
 * test routine, the mnemonic, a NULL operand-pair table, the number of test
 * cases, the input precision, the test category and an operation label.
 * The table ends with an all-NULL/zero sentinel row.
 * NOTE(review): rows with num_tests 18 presumably use only the first 18
 * special values, rows with 20 all of them -- confirm against the driver.
 */
static vx_fp_test_t
vsx_one_fp_arg_tests[] = {
                                { &test_xvredp, "xvredp", NULL, 18, DOUBLE_TEST, VX_ESTIMATE, "1/x"},
                                { &test_xsredp, "xsredp", NULL, 18, DOUBLE_TEST, VX_ESTIMATE, "1/x"},
                                { &test_xvrsqrtedp, "xvrsqrtedp", NULL, 18, DOUBLE_TEST, VX_ESTIMATE, "1/x-sqrt"},
                                { &test_xsrsqrtedp, "xsrsqrtedp", NULL, 18, DOUBLE_TEST, VX_ESTIMATE, "1/x-sqrt"},
                                { &test_xvrsqrtesp, "xvrsqrtesp", NULL, 18, SINGLE_TEST, VX_ESTIMATE, "1/x-sqrt"},
                                { &test_xvsqrtdp, "xvsqrtdp", NULL, 18, DOUBLE_TEST, VX_DEFAULT, "sqrt"},
                                { &test_xvsqrtsp, "xvsqrtsp", NULL, 18, SINGLE_TEST, VX_DEFAULT, "sqrt"},
                                { &test_xscvdpsp, "xscvdpsp", NULL, 20, DOUBLE_TEST, VX_CONV_TO_SINGLE, "conv"},
                                { &test_xscvdpuxws, "xscvdpuxws", NULL, 20, DOUBLE_TEST, VX_SCALAR_CONV_TO_WORD, "conv"},
                                { &test_xscvspdp, "xscvspdp", NULL, 20, SINGLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
                                { &test_xvcvdpsp, "xvcvdpsp", NULL, 20, DOUBLE_TEST, VX_CONV_TO_SINGLE, "conv"},
                                { &test_xvcvdpuxds, "xvcvdpuxds", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
                                { &test_xvcvdpuxws, "xvcvdpuxws", NULL, 20, DOUBLE_TEST, VX_CONV_TO_SINGLE, "conv"},
                                { &test_xvcvspdp, "xvcvspdp", NULL, 20, SINGLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
                                { &test_xvcvspsxds, "xvcvspsxds", NULL, 20, SINGLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
                                { &test_xvcvdpsxds, "xvcvdpsxds", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
                                { &test_xvcvspuxds, "xvcvspuxds", NULL, 20, SINGLE_TEST, VX_CONV_TO_DOUBLE, "conv"},
                                { &test_xvcvspuxws, "xvcvspuxws", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "conv"},
                                { &test_xsrdpic, "xsrdpic", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
                                { &test_xsrdpiz, "xsrdpiz", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
                                { &test_xsrdpi, "xsrdpi", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
                                { &test_xvabsdp, "xvabsdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "abs"},
                                { &test_xvnabsdp, "xvnabsdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "nabs"},
                                { &test_xvnegdp, "xvnegdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "neg"},
                                { &test_xvabssp, "xvabssp", NULL, 20, SINGLE_TEST, VX_DEFAULT, "abs"},
                                { &test_xvnabssp, "xvnabssp", NULL, 20, SINGLE_TEST, VX_DEFAULT, "nabs"},
                                { &test_xvrdpi,  "xvrdpi",  NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
                                { &test_xvrdpic, "xvrdpic", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
                                { &test_xvrdpim, "xvrdpim", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
                                { &test_xvrdpip, "xvrdpip", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
                                { &test_xvrdpiz, "xvrdpiz", NULL, 20, DOUBLE_TEST, VX_CONV_TO_DOUBLE, "round"},
                                { &test_xvrspi,  "xvrspi",  NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
                                { &test_xvrspic, "xvrspic", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
                                { &test_xvrspim, "xvrspim", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
                                { &test_xvrspip, "xvrspip", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
                                { &test_xvrspiz, "xvrspiz", NULL, 20, SINGLE_TEST, VX_CONV_TO_SINGLE, "round"},
                                { NULL, NULL, NULL, 0, 0, 0, NULL}
};
867
/* Descriptors for the "test for software divide / square root" instructions.
 * The test-sqrt rows take no operand-pair table (NULL) and list 20 cases;
 * the test-div rows use two_arg_fp_tests with its 68 operand pairs.  The
 * table ends with an all-NULL/zero sentinel row.
 */
static vx_fp_test_t
vx_tdivORtsqrt_tests[] = {
                          { &test_xstsqrtdp, "xstsqrtdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "test-sqrt"},
                          { &test_xvtsqrtdp, "xvtsqrtdp", NULL, 20, DOUBLE_TEST, VX_DEFAULT, "test-sqrt"},
                          { &test_xvtsqrtsp, "xvtsqrtsp", NULL, 20, SINGLE_TEST, VX_DEFAULT, "test-sqrt"},
                          { &test_xvtdivdp, "xvtdivdp", two_arg_fp_tests, 68, DOUBLE_TEST, VX_DEFAULT, "test-div"},
                          { &test_xvtdivsp, "xvtdivsp", two_arg_fp_tests, 68, SINGLE_TEST, VX_DEFAULT, "test-div"},
                          { NULL, NULL, NULL, 0 , 0, 0, NULL}
};
877
/* Eight 64-bit integer inputs; referenced as targs by the doubleword rows of
 * intToFp_tests. */
static unsigned long long doubleWord[] = { 0,
                                  0xffffffff00000000LL,
                                  0x00000000ffffffffLL,
                                  0xffffffffffffffffLL,
                                  0x89abcde123456789LL,
                                  0x0102030405060708LL,
                                  0x00000000a0b1c2d3LL,
                                  0x1111222233334444LL
};

/* Eight 32-bit integer inputs; referenced as targs by the word rows of
 * intToFp_tests. */
static unsigned int singleWord[] = {0,
                                  0xffff0000,
                                  0x0000ffff,
                                  0xffffffff,
                                  0x89a73522,
                                  0x01020304,
                                  0x0000abcd,
                                  0x11223344
};
897
/* Descriptor for one integer-to-floating-point conversion instruction test
 * (see intToFp_tests and test_int_to_fp_convert).
 */
typedef struct vx_intToFp_test
{
   test_func_t test_func;        /* routine that executes the instruction under test */
   const char * name;            /* instruction mnemonic, printed with every result line */
   void * targs;                 /* input table; read as unsigned long long[] for
                                  * DOUBLE_TEST and as unsigned int[] otherwise */
   int num_tests;                /* number of input values available in targs */
   precision_type_t precision;   /* DOUBLE_TEST: doubleword inputs; SINGLE_TEST: word inputs */
   vx_fp_test_type type;         /* VX_CONV_TO_DOUBLE or VX_CONV_TO_SINGLE: selects how
                                  * the result in vec_out is printed */
} vx_intToFp_test_t;
907
908static vx_intToFp_test_t
909intToFp_tests[] = {
910                   { test_xvcvsxddp, "xvcvsxddp", (void *)doubleWord, 8, DOUBLE_TEST, VX_CONV_TO_DOUBLE },
911                   { test_xvcvuxddp, "xvcvuxddp", (void *)doubleWord, 8, DOUBLE_TEST, VX_CONV_TO_DOUBLE },
912                   { test_xvcvsxdsp, "xvcvsxdsp", (void *)doubleWord, 8, DOUBLE_TEST, VX_CONV_TO_SINGLE },
913                   { test_xvcvuxdsp, "xvcvuxdsp", (void *)doubleWord, 8, DOUBLE_TEST, VX_CONV_TO_SINGLE },
914                   { test_xvcvsxwdp, "xvcvsxwdp", (void *)singleWord, 8, SINGLE_TEST, VX_CONV_TO_DOUBLE },
915                   { test_xvcvuxwdp, "xvcvuxwdp", (void *)singleWord, 8, SINGLE_TEST, VX_CONV_TO_DOUBLE },
916                   { test_xvcvsxwsp, "xvcvsxwsp", (void *)singleWord, 8, SINGLE_TEST, VX_CONV_TO_SINGLE },
917                   { test_xvcvuxwsp, "xvcvuxwsp", (void *)singleWord, 8, SINGLE_TEST, VX_CONV_TO_SINGLE },
918                   { NULL, NULL, NULL, 0, 0 }
919};
920
/* Selects the OE ("o") form of the divide-extended instruction; written by
 * test_div_extensions and read by test_divdeu / test_divwe.
 */
static Bool do_OE;
/* Bit flags that are OR-ed together to pick the instruction variant. */
typedef enum {
   DIV_BASE = 1,   /* always set: the plain instruction */
   DIV_OE = 2,     /* "o" suffix variant */
   DIV_DOT = 4,    /* "." suffix variant */
} div_type_t;
/* Possible divide-extended (divdeu / divwe) type combinations are:
 *   - base            (value 1)
 *   - base+dot        (value 5)
 *   - base+OE         (value 3)
 *   - base+OE+dot     (value 7)
 */
933#ifdef __powerpc64__
934static void test_divdeu(void)
935{
936   int divdeu_type = DIV_BASE;
937   if (do_OE)
938      divdeu_type |= DIV_OE;
939   if (do_dot)
940      divdeu_type |= DIV_DOT;
941
942   switch (divdeu_type) {
943      case 1:
944        SET_CR_XER_ZERO;
945         __asm__ __volatile__ ("divdeu %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
946         GET_CR_XER(div_flags, div_xer);
947         break;
948      case 3:
949        SET_CR_XER_ZERO;
950         __asm__ __volatile__ ("divdeuo %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
951         GET_CR_XER(div_flags, div_xer);
952         break;
953      case 5:
954        SET_CR_XER_ZERO;
955         __asm__ __volatile__ ("divdeu. %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
956         GET_CR_XER(div_flags, div_xer);
957         break;
958      case 7:
959        SET_CR_XER_ZERO;
960         __asm__ __volatile__ ("divdeuo. %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
961         GET_CR_XER(div_flags, div_xer);
962         break;
963      default:
964         fprintf(stderr, "Invalid divdeu type. Exiting\n");
965         exit(1);
966   }
967}
968#endif
969
970static void test_divwe(void)
971{
972   int divwe_type = DIV_BASE;
973   if (do_OE)
974      divwe_type |= DIV_OE;
975   if (do_dot)
976      divwe_type |= DIV_DOT;
977
978   switch (divwe_type) {
979      case 1:
980        SET_CR_XER_ZERO;
981         __asm__ __volatile__ ("divwe %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
982         GET_CR_XER(div_flags, div_xer);
983         break;
984      case 3:
985        SET_CR_XER_ZERO;
986         __asm__ __volatile__ ("divweo %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
987         GET_CR_XER(div_flags, div_xer);
988         break;
989      case 5:
990        SET_CR_XER_ZERO;
991         __asm__ __volatile__ ("divwe. %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
992         GET_CR_XER(div_flags, div_xer);
993         break;
994      case 7:
995        SET_CR_XER_ZERO;
996         __asm__ __volatile__ ("divweo. %0, %1, %2" : "=r" (r17) : "r" (r14),"r" (r15));
997         GET_CR_XER(div_flags, div_xer);
998         break;
999      default:
1000         fprintf(stderr, "Invalid divweu type. Exiting\n");
1001         exit(1);
1002   }
1003}
1004
1005
/* Minimal test descriptor: a test routine, its name and its precision.
 * NOTE(review): no use of this type is visible in this part of the file;
 * confirm it is still needed.
 */
typedef struct simple_test {
   test_func_t test_func;        /* routine that executes the test */
   char * name;                  /* presumably the instruction mnemonic -- TODO confirm */
   precision_type_t precision;   /* presumably SINGLE_TEST or DOUBLE_TEST -- TODO confirm */
} simple_test_t;
1011
1012
1013static void setup_sp_fp_args(fp_test_args_t * targs, Bool swap_inputs)
1014{
1015   int a_idx, b_idx, i;
1016   void * inA, * inB;
1017   void * vec_src = swap_inputs ? &vec_out : &vec_inB;
1018
1019   for (i = 0; i < 4; i++) {
1020      a_idx = targs->fra_idx;
1021      b_idx = targs->frb_idx;
1022      inA = (void *)&spec_sp_fargs[a_idx];
1023      inB = (void *)&spec_sp_fargs[b_idx];
1024      // copy single precision FP  into vector element i
1025      memcpy(((void *)&vec_inA) + (i * 4), inA, 4);
1026      memcpy(vec_src + (i * 4), inB, 4);
1027      targs++;
1028   }
1029}
1030
1031static void setup_dp_fp_args(fp_test_args_t * targs, Bool swap_inputs)
1032{
1033   int a_idx, b_idx, i;
1034   void * inA, * inB;
1035   void * vec_src = swap_inputs ? (void *)&vec_out : (void *)&vec_inB;
1036
1037   for (i = 0; i < 2; i++) {
1038      a_idx = targs->fra_idx;
1039      b_idx = targs->frb_idx;
1040      inA = (void *)&spec_fargs[a_idx];
1041      inB = (void *)&spec_fargs[b_idx];
1042      // copy double precision FP  into vector element i
1043      memcpy(((void *)&vec_inA) + (i * 8), inA, 8);
1044      memcpy(vec_src + (i * 8), inB, 8);
1045      targs++;
1046   }
1047}
1048
1049#define VX_NOT_CMP_OP 0xffffffff
1050static void print_vector_fp_result(unsigned int cc, vx_fp_test_t * test_group, int i, Bool print_vec_out)
1051{
1052   int a_idx, b_idx, k;
1053   char * name = malloc(20);
1054   int dp = test_group->precision == DOUBLE_TEST ? 1 : 0;
1055   int loops = dp ? 2 : 4;
1056   fp_test_args_t * targs = &test_group->targs[i];
1057   unsigned long long * frA_dp, * frB_dp, * dst_dp;
1058   unsigned int * frA_sp, *frB_sp, * dst_sp;
1059   strcpy(name, test_group->name);
1060   printf("#%d: %s%s ", dp? i/2 : i/4, name, (do_dot ? "." : ""));
1061   for (k = 0; k < loops; k++) {
1062      a_idx = targs->fra_idx;
1063      b_idx = targs->frb_idx;
1064      if (k)
1065         printf(" AND ");
1066      if (dp) {
1067         frA_dp = (unsigned long long *)&spec_fargs[a_idx];
1068         frB_dp = (unsigned long long *)&spec_fargs[b_idx];
1069         printf("%016llx %s %016llx", *frA_dp, test_group->op, *frB_dp);
1070      } else {
1071         frA_sp = (unsigned int *)&spec_sp_fargs[a_idx];
1072         frB_sp = (unsigned int *)&spec_sp_fargs[b_idx];
1073         printf("%08x %s %08x", *frA_sp, test_group->op, *frB_sp);
1074      }
1075      targs++;
1076   }
1077   if (cc != VX_NOT_CMP_OP)
1078      printf(" ? cc=%x", cc);
1079
1080   if (print_vec_out) {
1081      if (dp) {
1082         dst_dp = (unsigned long long *) &vec_out;
1083         printf(" => %016llx %016llx\n", dst_dp[0], dst_dp[1]);
1084      } else {
1085         dst_sp = (unsigned int *) &vec_out;
1086         printf(" => %08x %08x %08x %08x\n", dst_sp[0], dst_sp[1], dst_sp[2], dst_sp[3]);
1087      }
1088   } else {
1089      printf("\n");
1090   }
1091   free(name);
1092}
1093
1094
1095
/* Run every single-operand VSX test listed in vsx_one_fp_arg_tests.
 *
 * For each table entry (the table ends at a NULL test_func) the inputs are
 * taken in order from spec_fargs (double precision) or spec_sp_fargs (single
 * precision), copied into vec_inB, and the test routine is called after
 * vec_out has been cleared.  One line is printed per invocation showing each
 * input's bit pattern and either the corresponding result from vec_out or,
 * for VX_ESTIMATE tests, the PASS/FAIL verdict of check_estimate().
 *
 * Scalar instructions (name contains "xs") consume one input per invocation;
 * vector instructions consume two (double) or four (single), except for
 * single-to-double conversions which consume two sparsely placed singles.
 */
static void test_vsx_one_fp_arg(void)
{
   test_func_t func;
   int k;
   k = 0;
   build_special_fargs_table();

   while ((func = vsx_one_fp_arg_tests[k].test_func)) {
      int idx, i;
      vx_fp_test_t test_group = vsx_one_fp_arg_tests[k];
      Bool estimate = (test_group.type == VX_ESTIMATE);
      Bool dp = (test_group.precision == DOUBLE_TEST) ? True : False;
      Bool is_sqrt = (strstr(test_group.name, "sqrt")) ? True : False;
      Bool is_scalar = (strstr(test_group.name, "xs")) ? True : False;
      Bool sparse_sp = False;
      // stride: how far 'i' advances per invocation; loops: inputs loaded per invocation
      int stride = dp ? 2 : 4;
      int loops = is_scalar ? 1 : stride;
      stride = is_scalar ? 1: stride;

      /* For conversions of single to double, the 128-bit input register is sparsely populated:
       *    |___ SP___|_Unused_|___SP___|__Unused__|   // for vector op
       *                     or
       *    |___ SP___|_Unused_|_Unused_|__Unused__|   // for scalar op
       *
       * For the vector op case, we need to adjust stride from '4' to '2', since
       * we'll only be loading two values per loop into the input register.
       */
      if (!dp && !is_scalar && test_group.type == VX_CONV_TO_DOUBLE) {
         sparse_sp = True;
         stride = 2;
      }

      for (i = 0; i < test_group.num_tests; i+=stride) {
         unsigned int * pv;
         // vecB_void_ptr is reset to the start of vec_inB on every invocation
         void * inB, * vecB_void_ptr = (void *)&vec_inB;

         pv = (unsigned int *)&vec_out;
         // clear vec_out
         for (idx = 0; idx < 4; idx++, pv++)
            *pv = 0;

         if (dp) {
            int j;
            unsigned long long * frB_dp, *dst_dp;
            for (j = 0; j < loops; j++) {
               inB = (void *)&spec_fargs[i + j];
               // copy double precision FP into vector element i
               // (on LE the scalar operand is stored 8 bytes into vec_inB)
               if (isLE && is_scalar)
                  vecB_void_ptr += 8;
               memcpy(vecB_void_ptr + (j * 8), inB, 8);
            }
            // execute test insn
            (*func)();
            dst_dp = (unsigned long long *) &vec_out;
            // on LE the scalar result is read from the second doubleword of vec_out
            if (isLE && is_scalar)
               dst_dp++;
            printf("#%d: %s ", i/stride, test_group.name);
            for (j = 0; j < loops; j++) {
               if (j)
                  printf("; ");
               frB_dp = (unsigned long long *)&spec_fargs[i + j];
               printf("%s(%016llx)", test_group.op, *frB_dp);
               if (estimate) {
                  Bool res = check_estimate(DOUBLE_TEST, is_sqrt, i + j, (isLE && is_scalar) ? 1: j);
                  // NOTE(review): the ')' in this format has no matching '(';
                  // left as is because it is part of the program's output.
                  printf(" ==> %s)", res ? "PASS" : "FAIL");
                  /* For debugging . . .
                   printf(" ==> %s (res=%016llx)", res ? "PASS" : "FAIL", dst_dp[j]);
                   */
               } else {
                  vx_fp_test_type type = test_group.type;
                  switch (type) {
                     case VX_SCALAR_CONV_TO_WORD:
                        // only the low 32 bits of the doubleword are printed
                        printf(" = %016llx", dst_dp[j] & 0x00000000ffffffffULL);
                        break;
                     case VX_CONV_TO_SINGLE:
                        // only the high 32 bits of the doubleword are printed
                        printf(" = %016llx", dst_dp[j] & 0xffffffff00000000ULL);
                        break;
                     default:  // For VX_CONV_TO_DOUBLE and non-convert instructions . . .
                        printf(" = %016llx", dst_dp[j]);
                  }
               }
            }
            printf("\n");
         } else {
            int j;
            unsigned int * frB_sp, * dst_sp = NULL;
            unsigned long long * dst_dp = NULL;
            if (sparse_sp)
               loops = 2;
            for (j = 0; j < loops; j++) {
               inB = (void *)&spec_sp_fargs[i + j];
               // copy single precision FP into vector element i
               if (sparse_sp) {
                  // one single per doubleword; on LE it is stored 4 bytes in
                  if (isLE)
                     memcpy(vecB_void_ptr + ((2 * j * 4) + 4), inB, 4);
                  else
                     memcpy(vecB_void_ptr + ((2 * j * 4) ), inB, 4);
               } else {
                  // on LE the scalar operand is stored in the last word of vec_inB
                  if (isLE && is_scalar)
                     vecB_void_ptr += 12;
                  memcpy(vecB_void_ptr + (j * 4), inB, 4);
               }
            }
            // execute test insn
            (*func)();
            // results are read as doublewords for conversions to double,
            // otherwise as words; LE scalar results are read from the last element
            if (test_group.type == VX_CONV_TO_DOUBLE) {
               dst_dp = (unsigned long long *) &vec_out;
               if (isLE && is_scalar)
                  dst_dp++;
            } else {
               dst_sp = (unsigned int *) &vec_out;
               if (isLE && is_scalar)
                  dst_sp += 3;
            }
            // print result
            printf("#%d: %s ", i/stride, test_group.name);
            for (j = 0; j < loops; j++) {
               if (j)
                  printf("; ");
               frB_sp = (unsigned int *)&spec_sp_fargs[i + j];
               printf("%s(%08x)", test_group.op, *frB_sp);
               if (estimate) {
                  Bool res = check_estimate(SINGLE_TEST, is_sqrt, i + j, (isLE && is_scalar) ? 3 : j);
                  printf(" ==> %s)", res ? "PASS" : "FAIL");
               } else {
                  if (test_group.type == VX_CONV_TO_DOUBLE)
                        printf(" = %016llx", dst_dp[j]);
                  else
                  /* Special case: Current VEX implementation for fsqrts (single precision)
                   * uses the same implementation as that used for double precision fsqrt.
                   * However, I've found that for xvsqrtsp, the result from that implementation
                   * may be off by the two LSBs.  Generally, even this small inaccuracy can cause the
                   * output to appear very different if you end up with a carry.  But for the given
                   * inputs in this testcase, we can simply mask out these bits.
                   */
                     printf(" = %08x", is_sqrt ? (dst_sp[j] & 0xfffffffc) : dst_sp[j]);
               }
            }
            printf("\n");
         }
      }
      k++;
      printf( "\n" );
   }
}
1241
1242static void test_int_to_fp_convert(void)
1243{
1244   test_func_t func;
1245   int k;
1246   k = 0;
1247
1248   while ((func = intToFp_tests[k].test_func)) {
1249      int idx, i;
1250      vx_intToFp_test_t test_group = intToFp_tests[k];
1251      Bool dp = (test_group.precision == DOUBLE_TEST) ? True : False;
1252      Bool sparse_sp = False;
1253      int stride = dp ? 2 : 4;
1254      int loops = stride;
1255
1256      /* For conversions of single to double, the 128-bit input register is sparsely populated:
1257       *    |___ int___|_Unused_|___int___|__Unused__|   // for vector op
1258       *                     or
1259       * We need to adjust stride from '4' to '2', since we'll only be loading
1260       * two values per loop into the input register.
1261       */
1262      if (!dp && test_group.type == VX_CONV_TO_DOUBLE) {
1263         sparse_sp = True;
1264         stride = 2;
1265      }
1266
1267      for (i = 0; i < test_group.num_tests; i+=stride) {
1268         unsigned int * pv;
1269         void * inB;
1270
1271         pv = (unsigned int *)&vec_out;
1272         // clear vec_out
1273         for (idx = 0; idx < 4; idx++, pv++)
1274            *pv = 0;
1275
1276         if (dp) {
1277            int j;
1278            unsigned long long  *dst_dw, * targs = test_group.targs;
1279            for (j = 0; j < loops; j++) {
1280               inB = (void *)&targs[i + j];
1281               // copy doubleword into vector element i
1282               memcpy(((void *)&vec_inB) + (j * 8), inB, 8);
1283            }
1284            // execute test insn
1285            (*func)();
1286            dst_dw = (unsigned long long *) &vec_out;
1287            printf("#%d: %s ", i/stride, test_group.name);
1288            for (j = 0; j < loops; j++) {
1289               if (j)
1290                  printf("; ");
1291               printf("conv(%016llx)", targs[i + j]);
1292
1293               if (test_group.type == VX_CONV_TO_SINGLE)
1294                  printf(" = %016llx", dst_dw[j] & 0xffffffff00000000ULL);
1295               else
1296                  printf(" = %016llx", dst_dw[j]);
1297            }
1298            printf("\n");
1299         } else {
1300            int j;
1301            unsigned int * dst_sp = NULL;
1302            unsigned int * targs = test_group.targs;
1303            unsigned long long * dst_dp = NULL;
1304            void * vecB_void_ptr = (void *)&vec_inB;
1305            if (sparse_sp)
1306               loops = 2;
1307            for (j = 0; j < loops; j++) {
1308               inB = (void *)&targs[i + j];
1309               // copy single word into vector element i
1310               if (sparse_sp) {
1311                  if (isLE)
1312                     memcpy(vecB_void_ptr + ((2 * j * 4) + 4), inB, 4);
1313                  else
1314                     memcpy(vecB_void_ptr + ((2 * j * 4) ), inB, 4);
1315               } else {
1316                  memcpy(vecB_void_ptr + (j * 4), inB, 4);
1317               }
1318            }
1319            // execute test insn
1320            (*func)();
1321            if (test_group.type == VX_CONV_TO_DOUBLE)
1322               dst_dp = (unsigned long long *) &vec_out;
1323            else
1324               dst_sp = (unsigned int *) &vec_out;
1325            // print result
1326            printf("#%d: %s ", i/stride, test_group.name);
1327            for (j = 0; j < loops; j++) {
1328               if (j)
1329                  printf("; ");
1330               printf("conv(%08x)", targs[i + j]);
1331               if (test_group.type == VX_CONV_TO_DOUBLE)
1332                  printf(" = %016llx", dst_dp[j]);
1333               else
1334                  printf(" = %08x", dst_sp[j]);
1335            }
1336            printf("\n");
1337         }
1338      }
1339      k++;
1340      printf( "\n" );
1341   }
1342}
1343
1344
1345
// The div doubleword test data.
// Each row holds the two operands loaded into r14 and r15 respectively.
signed long long div_dw_tdata[13][2] = {
   { 4,                     -4 },
   { 4,                     -3 },
   { 4,                     4 },
   { 4,                     -5 },
   { 3,                     8 },
   { 0x8000000000000000ULL, 0xa },
   { 0x50c,                 -1 },
   { 0x50c,                 -4096 },
   { 0x1234fedc,            0x8000a873 },
   { 0xabcd87651234fedcULL, 0xa123b893 },
   { 0x123456789abdcULL,    0 },
   { 0,                     2 },
   { 0x77,                  0xa3499 }
};
// Number of rows in div_dw_tdata
#define dw_tdata_len (sizeof(div_dw_tdata) / sizeof(div_dw_tdata[0]))
1363
// The div word test data.
// Each row holds the two operands loaded into r14 and r15 respectively.
unsigned int div_w_tdata[6][2] = {
   { 0,          2 },
   { 2,          0 },
   { 0x7abc1234, 0xf0000000 },
   { 0xfabc1234, 5 },
   { 77,         66 },
   { 5,          0xfabc1234 },
};
// Number of rows in div_w_tdata
#define w_tdata_len (sizeof(div_w_tdata) / sizeof(div_w_tdata[0]))
1374
/* Descriptor for one divide-extended instruction test (see div_tests and
 * test_div_extensions).
 */
typedef struct div_ext_test
{
   test_func_t test_func;        /* routine that executes the instruction */
   const char *name;             /* mnemonic printed with each result; a "." is
                                  * appended by the driver for the dot form */
   int num_tests;                /* number of operand rows to run */
   div_type_t div_type;          /* DIV_OE selects the "o" form; anything else the base form */
   precision_type_t precision;   /* DOUBLE_TEST: operands from div_dw_tdata;
                                  * otherwise from div_w_tdata */
} div_ext_test_t;
1383
1384static div_ext_test_t div_tests[] = {
1385#ifdef __powerpc64__
1386                                   { &test_divdeu, "divdeu", dw_tdata_len, DIV_BASE, DOUBLE_TEST },
1387                                   { &test_divdeu, "divdeuo", dw_tdata_len, DIV_OE, DOUBLE_TEST },
1388#endif
1389                                   { &test_divwe, "divwe", w_tdata_len, DIV_BASE, SINGLE_TEST },
1390                                   { &test_divwe, "divweo", w_tdata_len, DIV_OE, SINGLE_TEST },
1391                                   { NULL, NULL, 0, 0, 0 }
1392};
1393
1394static void test_div_extensions(void)
1395{
1396   test_func_t func;
1397   int k;
1398   k = 0;
1399
1400   while ((func = div_tests[k].test_func)) {
1401      int i, repeat = 1;
1402      div_ext_test_t test_group = div_tests[k];
1403      do_dot = False;
1404
1405again:
1406      for (i = 0; i < test_group.num_tests; i++) {
1407         unsigned int condreg;
1408
1409         if (test_group.div_type == DIV_OE)
1410            do_OE = True;
1411         else
1412            do_OE = False;
1413
1414         if (test_group.precision == DOUBLE_TEST) {
1415            r14 = div_dw_tdata[i][0];
1416            r15 = div_dw_tdata[i][1];
1417         } else {
1418            r14 = div_w_tdata[i][0];
1419            r15 = div_w_tdata[i][1];
1420         }
1421         // execute test insn
1422         (*func)();
1423         condreg = (div_flags & 0xf0000000) >> 28;
1424         printf("#%d: %s%s: ", i, test_group.name, do_dot ? "." : "");
1425         if (test_group.precision == DOUBLE_TEST) {
1426            printf("0x%016llx0000000000000000 / 0x%016llx = 0x%016llx;",
1427                   div_dw_tdata[i][0], div_dw_tdata[i][1], (signed long long) r17);
1428         } else {
1429            printf("0x%08x00000000 / 0x%08x = 0x%08x;",
1430                   div_w_tdata[i][0], div_w_tdata[i][1], (unsigned int) r17);
1431         }
1432         printf(" CR=%x; XER=%x\n", condreg, div_xer);
1433      }
1434      printf("\n");
1435      if (repeat) {
1436         repeat = 0;
1437         do_dot = True;
1438         goto again;
1439      }
1440      k++;
1441      printf( "\n" );
1442   }
1443}
1444
1445
/* Run every test listed in vx_tdivORtsqrt_tests.
 *
 * Entries with a non-NULL targs are two-operand tests whose inputs are set up
 * by setup_dp_fp_args / setup_sp_fp_args; the others take their single
 * operand(s) in order from spec_fargs / spec_sp_fargs and store them in
 * vec_inB.  For each invocation FPSCR, CR and XER are zeroed, the test
 * routine is called, CR is read back, and the CR field selected by mask
 * 0x0f000000 is printed next to the inputs.  vec_out is cleared but never
 * printed here.
 */
static void test_vx_tdivORtsqrt(void)
{
   test_func_t func;
   int k, crx;
   unsigned int flags;
   k = 0;
   do_dot = False;
   build_special_fargs_table();

   while ((func = vx_tdivORtsqrt_tests[k].test_func)) {
      int idx, i;
      vx_fp_test_t test_group = vx_tdivORtsqrt_tests[k];
      Bool dp = (test_group.precision == DOUBLE_TEST) ? True : False;
      Bool is_scalar = (strstr(test_group.name, "xs")) ? True : False;
      Bool two_args = test_group.targs ?  True : False;
      // stride: how far 'i' advances per invocation; loops: inputs loaded per invocation
      int stride = dp ? 2 : 4;
      int loops = is_scalar ? 1 : stride;
      stride = is_scalar ? 1: stride;

      for (i = 0; i < test_group.num_tests; i+=stride) {
         unsigned int * pv;
         void * inB, * vecB_void_ptr = (void *)&vec_inB;

         pv = (unsigned int *)&vec_out;
         // clear vec_out
         for (idx = 0; idx < 4; idx++, pv++)
            *pv = 0;

         if (dp) {
            int j;
            unsigned long long * frB_dp;
            if (two_args) {
               setup_dp_fp_args(&test_group.targs[i], False);
            } else {
               for (j = 0; j < loops; j++) {
                  inB = (void *)&spec_fargs[i + j];
                  // copy double precision FP into vector element i
                  // (on LE the scalar operand is stored 8 bytes into vec_inB)
                  if (isLE && is_scalar)
                     vecB_void_ptr += 8;
                  memcpy(vecB_void_ptr + (j * 8), inB, 8);
               }
            }
            // execute test insn
            // Must do set/get of CRs immediately before/after calling the asm func
            // to avoid CRs being modified by other instructions.
            SET_FPSCR_ZERO;
            SET_CR_XER_ZERO;
            (*func)();
            GET_CR(flags);
            // assumes using CR1
            crx = (flags & 0x0f000000) >> 24;
            if (two_args) {
               print_vector_fp_result(crx, &test_group, i, False/*do not print vec_out*/);
            } else {
               printf("#%d: %s ", i/stride, test_group.name);
               for (j = 0; j < loops; j++) {
                  if (j)
                     printf("; ");
                  frB_dp = (unsigned long long *)&spec_fargs[i + j];
                  printf("%s(%016llx)", test_group.op, *frB_dp);
               }
               printf( " ? %x (CRx)\n", crx);
            }
         } else {
            int j;
            unsigned int * frB_sp;
            if (two_args) {
               setup_sp_fp_args(&test_group.targs[i], False);
            } else {
               for (j = 0; j < loops; j++) {
                  inB = (void *)&spec_sp_fargs[i + j];
                  // copy single precision FP into vector element i
                  memcpy(((void *)&vec_inB) + (j * 4), inB, 4);
               }
            }
            // execute test insn
            // (same ordering constraint as above: CR set/get must bracket the call)
            SET_FPSCR_ZERO;
            SET_CR_XER_ZERO;
            (*func)();
            GET_CR(flags);
            crx = (flags & 0x0f000000) >> 24;
            // print result
            if (two_args) {
               print_vector_fp_result(crx, &test_group, i, False/*do not print vec_out*/);
            } else {
               printf("#%d: %s ", i/stride, test_group.name);
               for (j = 0; j < loops; j++) {
                  if (j)
                     printf("; ");
                  frB_sp = (unsigned int *)&spec_sp_fargs[i + j];
                  printf("%s(%08x)", test_group.op, *frB_sp);
               }
               printf( " ? %x (CRx)\n", crx);
            }
         }
      }
      k++;
      printf( "\n" );
   }
}
1546
1547
1548static void test_ftsqrt(void)
1549{
1550   int i, crx;
1551   unsigned int flags;
1552   unsigned long long * frbp;
1553   build_special_fargs_table();
1554
1555
1556   for (i = 0; i < nb_special_fargs; i++) {
1557      f14 = spec_fargs[i];
1558      frbp = (unsigned long long *)&spec_fargs[i];
1559      SET_FPSCR_ZERO;
1560      SET_CR_XER_ZERO;
1561      __asm__ __volatile__ ("ftsqrt           cr1, %0" : : "d" (f14));
1562      GET_CR(flags);
1563      crx = (flags & 0x0f000000) >> 24;
1564      printf( "ftsqrt: %016llx ? %x (CRx)\n", *frbp, crx);
1565   }
1566   printf( "\n" );
1567}
1568
1569static void
1570test_popcntw(void)
1571{
1572#ifdef __powerpc64__
1573   uint64_t res;
1574   unsigned long long src = 0x9182736405504536ULL;
1575   r14 = src;
1576   __asm__ __volatile__ ("popcntw          %0, %1" : "=r" (res): "r" (r14));
1577   printf("popcntw: 0x%llx => 0x%016llx\n", (unsigned long long)src, (unsigned long long)res);
1578#else
1579   uint32_t res;
1580   unsigned int src = 0x9182730E;
1581   r14 = src;
1582   __asm__ __volatile__ ("popcntw          %0, %1" : "=r" (res): "r" (r14));
1583   printf("popcntw: 0x%x => 0x%08x\n", src, (int)res);
1584#endif
1585   printf( "\n" );
1586}
1587
1588
1589static test_table_t
1590         all_tests[] =
1591{
1592
1593                    { &test_vsx_one_fp_arg,
1594                      "Test VSX vector and scalar single argument instructions", OTHER_INST } ,
1595                    { &test_int_to_fp_convert,
1596                      "Test VSX vector integer to float conversion instructions", OTHER_INST },
1597                    { &test_div_extensions,
1598		      "Test div extensions", SCALAR_DIV_INST },
1599                    { &test_ftsqrt,
1600		      "Test ftsqrt instruction", OTHER_INST },
1601                    { &test_vx_tdivORtsqrt,
1602		      "Test vector and scalar tdiv and tsqrt instructions", OTHER_INST },
1603                    { &test_popcntw,
1604		      "Test popcntw instruction", OTHER_INST },
1605                    { NULL, NULL }
1606};
1607#endif // HAS_VSX
1608
/* Print the command-line help text to stderr.
 *
 * None of the selection options is a default: main() starts with an empty
 * selection mask and only runs the groups that were explicitly requested,
 * so the text no longer labels any option "(default)".
 */
static void usage (void)
{
  fprintf(stderr,
	  "Usage: test_isa_3_0 [OPTIONS]\n"
	  "\t-d: test scalar division instructions\n"
	  "\t-o: test non scalar division instructions\n"
	  "\t-A: test all instructions\n"
	  "\t-h: display this help and exit\n"
	  "If none of -d, -o or -A is given, no tests are run.\n"
	  );
}
1619
/* Program entry point.
 *
 * Parses the options -d, -o, -A and -h, then runs every entry of all_tests
 * whose test_group bit was selected, printing the entry's name first.
 * Returns 0 on success (including -h) and 1 on an unrecognised option.
 * When HAS_VSX is not defined the program does nothing and returns 0.
 */
int main(int argc, char **argv)
{
#ifdef HAS_VSX

   test_func_t func;
   int c;
   int i;
   unsigned int test_run_mask = 0;

   /* NOTE, ISA 3.0 introduces the OV32 and CA32 bits in the XER. These
    * bits are set by various arithmetic instructions.  This means this
    * test generates different XER output for pre ISA 3.0 versus ISA 3.0
    * hardware.  The tests have been grouped so that the tests that generate
    * different results are in one test and the rest are in a different test.
    * This minimizes the size of the result expect files for the two cases.
    */

   while ((c = getopt(argc, argv, "doAh")) != -1) {
      switch (c) {
      case 'd':
         test_run_mask |= SCALAR_DIV_INST;
         break;
      case 'o':
         test_run_mask |= OTHER_INST;
         break;
      case 'A':
         test_run_mask = 0xFFFF;
         break;
      case 'h':
         usage();
         return 0;

      default:
         usage();
         /* getopt() returns '?' for an unrecognised option; the offending
          * character itself is reported in optopt. */
         fprintf(stderr, "Unknown argument: '%c'\n", optopt);
         return 1;
      }
   }

   for (i = 0; (func = all_tests[i].test_category); i++) {
      if (test_run_mask & all_tests[i].test_group) {
         /* Test group specified on command line */
         printf( "%s\n", all_tests[i].name );
         (*func)();
      }
   }

   /* free(NULL) is a no-op, so no guard is needed if the tables were never built */
   free(spec_fargs);
   free(spec_sp_fargs);

#endif // HAS_VSX

   return 0;
}
1680