
/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2013 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */

#include "libvex_basictypes.h"
#include "main_util.h"              // LIKELY, UNLIKELY
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
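
/* For illustration: mk32x2 packs two 32-bit lanes into a ULong and the
   sel32x2_* helpers extract them again, so, for example,
   sel32x2_1(mk32x2(0xDEADBEEF, 0xCAFEBABE)) == 0xDEADBEEF and
   sel32x2_0 of the same value == 0xCAFEBABE.  All of the narrower
   helpers below follow the same pack/select pattern. */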


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}


/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
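
/* For illustration, some worked values: qadd16S(0x7FFF, 1) clamps to
   0x7FFF instead of wrapping to -0x8000, qadd16S(-0x8000, -1) clamps
   to -0x8000, and qadd8U(200, 100) clamps to 255.  The sums are formed
   at a wider width first, so the overflow is visible and can be
   clipped to the lane limits. */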

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}
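
/* For illustration: qnarrow32Sto16S(100000) gives 32767 and
   qnarrow32Sto16S(-100000) gives -32768, while qnarrow16Sto8U maps
   anything negative to 0 and anything above 255 to 255; for example
   qnarrow16Sto8U(0xFF80) == 0, since 0xFF80 viewed as a Short is
   -128. */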

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}
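
/* For illustration: the averages round upwards, e.g. avg8U(1, 2) == 2,
   and avg8U(0xFF, 0xFF) == 0xFF; the sum is formed in a UInt, so the
   +1 rounding bias cannot overflow the lane. */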

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}
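
/* Worked example for the lane decomposition used by these functions:
      h_generic_calc_Add16x4(0xFFFF000100020003ULL, 0x0001000100010001ULL)
   == 0x0000000200030004ULL.
   Each 16-bit lane is added independently, so the 0xFFFF lane wraps to
   zero without carrying into its neighbour. */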

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}
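
/* Worked example: on the same operands as the Add16x4 example above,
      h_generic_calc_QAdd16Ux4(0xFFFF000100020003ULL, 0x0001000100010001ULL)
   == 0xFFFF000200030004ULL.
   The overflowing unsigned lane pegs at 0xFFFF instead of wrapping. */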

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}
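
/* For illustration: the two narrowed lanes from 'aa' form the high half
   of the result and the two from 'bb' the low half, e.g.
      h_generic_calc_QNarrowBin32Sto16Sx4(0x0000000100000002ULL,
                                          0x0000000300000004ULL)
   == 0x0001000200030004ULL. */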

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}
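
/* Worked example:
      h_generic_calc_InterleaveLO32x2(0xAAAAAAAA11111111ULL,
                                      0xBBBBBBBB22222222ULL)
   == 0x1111111122222222ULL.
   The low lanes of the two operands are zipped together, with the 'aa'
   lane taking the more significant position. */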

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}
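
/* Worked example: h_generic_calc_ShlN16x4(0x8001000100010001ULL, 1)
   == 0x0002000200020002ULL.  Each lane is shifted on its own, so the
   bit shifted out of the top lane is simply lost rather than carried
   into the next lane. */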

ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}
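
/* Worked example: h_generic_calc_GetMSBs8x8(0xFF00FF00FF00FF00ULL)
   == 0xAA, since the sign bit of byte lane N becomes bit N of the
   result. */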

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
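
/* Worked example: h_generic_calc_Sad8Ux4(0x01020304, 0x04030201)
   == 3 + 1 + 1 + 3 == 8, the sum of the absolute per-byte
   differences. */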

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}


/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x )<< ( y ) )

static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert 10 bit densely packed BCD to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations.*/
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
            | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
            | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
            | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
            | PUT(k, 1) | PUT(m, 0);
   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal (DPD)
    value.  The boolean equations to calculate the value of each of the
    DPD bits are given in Appendix B of Book 1: Power ISA User Instruction
    Set.  The bits of the DPD number are [abcdefghijkm].  The bits of the
    BCD value are [pqrstuvwxy].  The boolean logic equations in pseudo C
    code are:
    */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
            | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
            | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
            | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}
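
/* For illustration: the BCD declet 0x123 (digits 1, 2, 3, all in the
   range 0..7) maps to the DPD declet 0x0A3, and dpb_to_bcd(0x0A3)
   recovers 0x123.  Digits 8 and 9 are the cases that exercise the more
   involved terms of the equations above. */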

ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
#undef NOT
#undef GET
#undef PUT


/* ----------------------------------------------------- */
/* Signed and unsigned integer division, that behave like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}

Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL )
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}
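
/* A minimal self-check sketch for the division helpers (illustrative
   only; the helper name is hypothetical and the block is kept under
   "#if 0" so it has no effect on the build).  It exercises the
   zero-divisor and INT_MIN / -1 cases that make these helpers differ
   from plain C division. */
#if 0
static void h_check_arm_div_semantics ( void )
{
   vassert(h_calc_udiv32_w_arm_semantics(5, 0) == 0);
   vassert(h_calc_sdiv32_w_arm_semantics(5, 0) == 0);
   vassert(h_calc_sdiv32_w_arm_semantics((Int)0x80000000, -1)
           == (Int)0x80000000);
   vassert(h_calc_sdiv64_w_arm_semantics(-7, 2) == -3);
}
#endif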


/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/
