
/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2010 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */
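
/* Worked example: each operand is a single 64-bit word holding packed
   lanes.  Every helper below unpacks the lanes, applies a scalar
   operation to each lane, and repacks the results.  For instance,
   viewing the operands as four 16-bit lanes,

      h_generic_calc_Add16x4(0x0001000200030004ULL,
                             0x0010002000300040ULL)

   returns 0x0011002200330044ULL: each 16-bit lane is added
   independently, with no carry propagating between lanes. */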

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}
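
/* For example, index8x8(0x0807060504030201ULL, 3) selects byte lane 3
   and returns 0x04; the index is masked with 7 first, so an index of
   11 would select the same lane. */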


/* Scalar helpers. */

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
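
/* Examples of the saturating behaviour: qadd16S(30000, 10000) returns
   32767 rather than wrapping, qsub8U(10, 20) clamps to 0, and
   qadd8U(200, 100) clamps to 255 (0xFF). */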

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}
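
/* mulhi16S/mulhi16U return the upper 16 bits of the full 32-bit
   product: e.g. mulhi16U(0x8000, 0x0004) is 2 (0x20000 >> 16), and
   mulhi16S(-2, 3) is -1, since the signed product -6 sign-extends
   into the high half. */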

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16 ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8 ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Uto8 ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}
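
/* Narrowing examples: qnarrow32Sto16(100000) saturates to 32767 and
   qnarrow32Sto16((UInt)-100000) to -32768; qnarrow16Uto8 treats its
   argument as signed before clamping to [0,255], so 0x0123 becomes
   0xFF and 0xFF80 (i.e. -128 as a Short) becomes 0. */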

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}
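
/* avg8U/avg16U round upwards because of the "+ 1": avg8U(1, 2) is 2,
   while avg8U(2, 2) is 2.  The intermediate sum is computed in a UInt,
   so avg8U(255, 255) correctly yields 255 with no overflow. */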

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}
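
/* The halving forms add (or subtract) in a wider type and then shift
   right by one, so they never overflow: hadd16U(0xFFFF, 0xFFFF) is
   0xFFFF.  For the signed variants the shift is arithmetic, rounding
   towards minus infinity: hadd8S(-1, -2) is -2 and hsub8S(0, 1) is -1. */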

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrow32Sx2 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16(d),
             qnarrow32Sto16(c),
             qnarrow32Sto16(b),
             qnarrow32Sto16(a)
          );
}
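
/* h_generic_calc_QNarrow32Sx2 packs aa's narrowed lanes into the high
   half of the result and bb's into the low half.  For example,
   h_generic_calc_QNarrow32Sx2(0x0001000000000002ULL,
   0xFFFF000000000005ULL) gives 0x7FFF000280000005ULL: 65536 and
   -65536 saturate to 32767 (0x7FFF) and -32768 (0x8000), while 2 and
   5 pass through unchanged. */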

ULong h_generic_calc_QNarrow16Sx4 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8(h),
             qnarrow16Sto8(g),
             qnarrow16Sto8(f),
             qnarrow16Sto8(e),
             qnarrow16Sto8(d),
             qnarrow16Sto8(c),
             qnarrow16Sto8(b),
             qnarrow16Sto8(a)
          );
}

ULong h_generic_calc_QNarrow16Ux4 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Uto8(h),
             qnarrow16Uto8(g),
             qnarrow16Uto8(f),
             qnarrow16Uto8(e),
             qnarrow16Uto8(d),
             qnarrow16Uto8(c),
             qnarrow16Uto8(b),
             qnarrow16Uto8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}
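
/* Interleaving example: with aa = 0x0706050403020100ULL and
   bb = 0x8786858483828180ULL, h_generic_calc_InterleaveLO8x8 returns
   0x0383028201810080ULL, i.e. the low four bytes of aa and bb
   alternated, with aa's bytes in the odd byte positions. */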

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
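
/* Perm8x8 treats each byte of bb as a lane index into aa (only the
   low three bits of the index matter, via index8x8).  For example,
   h_generic_calc_Perm8x8(0x8877665544332211ULL,
   0x0001020304050607ULL) reverses aa's bytes, giving
   0x1122334455667788ULL. */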

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if we
   are ever given an out-of-range shift amount.
*/
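/* Example of the logical/arithmetic distinction: with nn = 4,
   h_generic_calc_ShrN32x2(0x8000000000000001ULL, 4) gives
   0x0800000000000000ULL, whereas h_generic_calc_SarN32x2 on the same
   inputs gives 0xF800000000000000ULL, because sar32 shifts copies of
   the sign bit into the top of the negative lane. */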
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w0 ) {
   return (((UInt)w1) << 16) | ((UInt)w0);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}
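
/* As with the 64-bit versions, each lane wraps independently:
   h_generic_calc_Add8x4(0xFF010203, 0x01010101) is 0x00020304, the
   0xFF lane wrapping to 0x00 without disturbing its neighbours. */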

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
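
/* Sum-of-absolute-differences example:
   h_generic_calc_Sad8Ux4(0x01020304, 0x04030201) is
   (3 + 1 + 1 + 3) = 8. */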


/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/
