
/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2011 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
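
/* Worked example (arbitrary values, for illustration only): w1
   supplies the high 32 bits and w0 the low 32 bits, and the
   selectors recover them:

      mk32x2(0xAABBCCDD, 0x11223344) == 0xAABBCCDD11223344ULL
      sel32x2_1(0xAABBCCDD11223344ULL) == 0xAABBCCDD
      sel32x2_0(0xAABBCCDD11223344ULL) == 0x11223344
*/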


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}
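
/* Illustration of index8x8 (arbitrary values): ix is masked to 3
   bits, so it always selects one of the 8 byte lanes, lane 0 being
   the least significant byte:

      index8x8(0x8877665544332211ULL, 0) == 0x11
      index8x8(0x8877665544332211ULL, 7) == 0x88
      index8x8(0x8877665544332211ULL, 9) == 0x22   // 9 & 7 == 1
*/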


/* Scalar helpers. */

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
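
/* Illustrative values for the saturating helpers above: results are
   clamped to the representable range of the lane type instead of
   wrapping around:

      qadd16S(30000, 10000) == 32767   // plain 16-bit add would wrap to -25536
      qadd8U (200, 100)     == 255     // would wrap to 44
      qsub8U (10, 20)       == 0       // would wrap to 246
*/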

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}
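
/* Example contrasting the logical and arithmetic right shifts above
   (values are illustrative; the sarNN helpers assume the usual
   sign-extending behaviour of >> on negative signed values, which is
   what this file relies on):

      shr16(0xF000, 4) == 0x0F00   // zeroes shifted in at the top
      sar16(0xF000, 4) == 0xFF00   // sign bit replicated
      sar8 (0xF0, 4)   == 0xFF     // (Char)0xF0 is -16; -16 >> 4 is -1
*/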

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}
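
/* Worked example for the lane-wise pattern used throughout this file
   (input values chosen only for illustration): each 16-bit lane is
   added independently, overflow wraps within the lane and never
   carries into the neighbouring lane:

      h_generic_calc_Add16x4(0x0001FFFF80001234ULL,
                             0x0001000280000002ULL)
         == 0x0002000100001236ULL
      // lanes: 0x0001+0x0001=0x0002, 0xFFFF+0x0002=0x0001 (wraps),
      //        0x8000+0x8000=0x0000 (wraps), 0x1234+0x0002=0x1236
*/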

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}
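
/* Illustration (arbitrary inputs): the two 32-bit lanes of aa are
   narrowed into the high half of the result and the two lanes of bb
   into the low half, each saturated to the signed 16-bit range:

      h_generic_calc_QNarrowBin32Sto16Sx4(0x0001234580000000ULL,
                                          0x00000123FFFFFFFFULL)
         == 0x7FFF80000123FFFFULL
      // 0x00012345 -> 0x7FFF (clamped), 0x80000000 -> 0x8000 (clamped),
      // 0x00000123 -> 0x0123, 0xFFFFFFFF (-1) -> 0xFFFF
*/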

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}
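
/* Illustration of the 8x8 interleaves (arbitrary inputs): HI takes
   the upper four byte lanes of each operand and LO the lower four,
   alternating aa-lane then bb-lane from most to least significant:

      aa == 0xAABBCCDDEEFF0011ULL, bb == 0x1122334455667788ULL
      h_generic_calc_InterleaveHI8x8(aa, bb) == 0xAA11BB22CC33DD44ULL
      h_generic_calc_InterleaveLO8x8(aa, bb) == 0xEE55FF6600771188ULL
*/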

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}
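
/* Illustration (arbitrary inputs): the odd-numbered 16-bit lanes
   (3 and 1) of aa and bb are packed together, and likewise the
   even-numbered lanes (2 and 0), with aa supplying the high half:

      aa == 0xAAAA0123BBBB4567ULL, bb == 0xCCCC89ABDDDDCDEFULL
      h_generic_calc_CatOddLanes16x4 (aa, bb) == 0xAAAABBBBCCCCDDDDULL
      h_generic_calc_CatEvenLanes16x4(aa, bb) == 0x0123456789ABCDEFULL
*/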

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
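
/* Illustration of Perm8x8 (arbitrary inputs): each byte lane of bb
   selects, via its low 3 bits, a byte lane of aa for the
   corresponding result lane.  Here bb asks lane 7 for aa's byte 0,
   lane 6 for byte 1, and so on, which reverses aa's bytes:

      h_generic_calc_Perm8x8(0x8877665544332211ULL,
                             0x0001020304050607ULL)
         == 0x1122334455667788ULL
*/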

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc), it is an error if we
   are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}
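
/* Illustration (arbitrary values): nn is masked to the lane width,
   and each lane is shifted independently, so bits shifted out of one
   lane never spill into its neighbour:

      h_generic_calc_ShlN32x2(0x00000000FFFFFFFFULL, 4)
         == 0x00000000FFFFFFF0ULL
      // a plain 64-bit shift would instead give 0x0000000FFFFFFFF0ULL
*/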

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}
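
/* Rounding behaviour of the averaging helpers (illustrative values):
   avg8U/avg16U compute (x + y + 1) >> 1, i.e. halves are rounded up,
   and the intermediate sum is formed in 32 bits so it cannot
   overflow:

      avg8U(1, 2)       == 2     // (1 + 2 + 1) >> 1
      avg8U(0xFF, 0xFF) == 0xFF
*/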

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}
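
/* Illustration of the 32-bit packing helpers (arbitrary values),
   mirroring the 64-bit ones above:

      mk16x2(0xCAFE, 0xBABE)        == 0xCAFEBABE
      sel16x2_1(0xCAFEBABE)         == 0xCAFE
      mk8x4(0xDE, 0xAD, 0xBE, 0xEF) == 0xDEADBEEF
      sel8x4_0(0xDEADBEEF)          == 0xEF
*/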


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
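
/* Worked example for Sad8Ux4 (arbitrary inputs): the absolute
   differences of the four byte lanes are summed into a single
   scalar:

      h_generic_calc_Sad8Ux4(0x01050A00, 0x03040020)
         == 2 + 1 + 10 + 32 == 45
*/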


/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/
