169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj/*---------------------------------------------------------------*/
369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj/*--- begin                            host_generic_simd128.c ---*/
469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj/*---------------------------------------------------------------*/
569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj/*
769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   This file is part of Valgrind, a dynamic binary instrumentation
869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   framework.
969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
10785952d4bf502fa756b2ac58595fd31fe0f88559sewardj   Copyright (C) 2010-2015 OpenWorks GbR
1169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj      info@open-works.net
1269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
1369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   This program is free software; you can redistribute it and/or
1469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   modify it under the terms of the GNU General Public License as
1569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   published by the Free Software Foundation; either version 2 of the
1669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   License, or (at your option) any later version.
1769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
1869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   This program is distributed in the hope that it will be useful, but
1969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   WITHOUT ANY WARRANTY; without even the implied warranty of
2069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   General Public License for more details.
2269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
2369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   You should have received a copy of the GNU General Public License
2469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   along with this program; if not, write to the Free Software
2569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
2669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   02110-1301, USA.
2769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
2869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   The GNU General Public License is contained in the file COPYING.
2969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj*/
3069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
3169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
3269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   where the instruction selectors cannot generate code in-line.
3369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   These are purely back-end entities and cannot be seen/referenced
3469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   from IR. */
3569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
3669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj#include "libvex_basictypes.h"
3769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj#include "host_generic_simd128.h"
3869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
3969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
4069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj/* Primitive helpers always take args of the real type (signed vs
4169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   unsigned) but return an unsigned result, so there's no conversion
4269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   weirdness when stuffing results back in the V128 union fields,
4369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   which are all unsigned. */
4469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
4569d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline UInt mul32 ( Int xx, Int yy )
4669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
47108e03fcb0a4ef42164235b1988aa540aa1e5298florian   Long t = ((Long)xx) * ((Long)yy);
4869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return toUInt(t);
4969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
5069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
5169d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline UInt max32S ( Int xx, Int yy )
5269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
5369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return toUInt((xx > yy) ? xx : yy);
5469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
5569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
5669d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline UInt min32S ( Int xx, Int yy )
5769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
5869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return toUInt((xx < yy) ? xx : yy);
5969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
6069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
6169d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline UInt max32U ( UInt xx, UInt yy )
6269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
6369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return toUInt((xx > yy) ? xx : yy);
6469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
6569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
6669d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline UInt min32U ( UInt xx, UInt yy )
6769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
6869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return toUInt((xx < yy) ? xx : yy);
6969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
7069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
7169d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline UShort max16U ( UShort xx, UShort yy )
7269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
7369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return toUShort((xx > yy) ? xx : yy);
7469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
7569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
7669d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline UShort min16U ( UShort xx, UShort yy )
7769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
7869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return toUShort((xx < yy) ? xx : yy);
7969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
8069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
8169d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline UChar max8S ( Char xx, Char yy )
8269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
8369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return toUChar((xx > yy) ? xx : yy);
8469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
8569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
8669d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline UChar min8S ( Char xx, Char yy )
8769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
8869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return toUChar((xx < yy) ? xx : yy);
8969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
9069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
91d881562f1d18ab33038dab4a1b24823dba3422c0sewardjstatic inline ULong cmpEQ64 ( Long xx, Long yy )
92d881562f1d18ab33038dab4a1b24823dba3422c0sewardj{
93d881562f1d18ab33038dab4a1b24823dba3422c0sewardj   return (((Long)xx) == ((Long)yy))
94d881562f1d18ab33038dab4a1b24823dba3422c0sewardj             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
95d881562f1d18ab33038dab4a1b24823dba3422c0sewardj}
96d881562f1d18ab33038dab4a1b24823dba3422c0sewardj
9769d98e3853a63e578e039894e2ef00ca6f9878c8sewardjstatic inline ULong cmpGT64S ( Long xx, Long yy )
9869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
9969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   return (((Long)xx) > ((Long)yy))
10069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
10169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
10269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
1030874beee91898d987c327212c7fb35fdb5db0735sewardjstatic inline ULong sar64 ( ULong v, UInt n )
1040874beee91898d987c327212c7fb35fdb5db0735sewardj{
1050874beee91898d987c327212c7fb35fdb5db0735sewardj   return ((Long)v) >> n;
1060874beee91898d987c327212c7fb35fdb5db0735sewardj}
1070874beee91898d987c327212c7fb35fdb5db0735sewardj
1080874beee91898d987c327212c7fb35fdb5db0735sewardjstatic inline UChar sar8 ( UChar v, UInt n )
1090874beee91898d987c327212c7fb35fdb5db0735sewardj{
1100874beee91898d987c327212c7fb35fdb5db0735sewardj   return toUChar(((Char)v) >> n);
1110874beee91898d987c327212c7fb35fdb5db0735sewardj}
1120874beee91898d987c327212c7fb35fdb5db0735sewardj
1132260b993536a3d41d5833504371981718b43154esewardjstatic inline UShort qnarrow32Sto16U ( UInt xx0 )
1142260b993536a3d41d5833504371981718b43154esewardj{
1152260b993536a3d41d5833504371981718b43154esewardj   Int xx = (Int)xx0;
1162260b993536a3d41d5833504371981718b43154esewardj   if (xx < 0)     xx = 0;
1172260b993536a3d41d5833504371981718b43154esewardj   if (xx > 65535) xx = 65535;
1182260b993536a3d41d5833504371981718b43154esewardj   return (UShort)xx;
1192260b993536a3d41d5833504371981718b43154esewardj}
1202260b993536a3d41d5833504371981718b43154esewardj
121ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjstatic inline UShort narrow32to16 ( UInt xx )
122ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj{
123ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   return (UShort)xx;
124ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj}
125ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj
126ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjstatic inline UChar narrow16to8 ( UShort xx )
127ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj{
128ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   return (UChar)xx;
129ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj}
130ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj
131ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj
132ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
133ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
13469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                              V128* argL, V128* argR )
13569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
13669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
13769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
13869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
13969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
14069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
14169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
142ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
143ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
14469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                               V128* argL, V128* argR )
14569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
14669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
14769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
14869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
14969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
15069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
15169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
152ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
153ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
15469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                               V128* argL, V128* argR )
15569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
15669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
15769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
15869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
15969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
16069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
16169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
162ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
163ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
16469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                               V128* argL, V128* argR )
16569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
16669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
16769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
16869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
16969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
17069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
17169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
172ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
173ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
17469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                               V128* argL, V128* argR )
17569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
17669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
17769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
17869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
17969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
18069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
18169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
182ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
183ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
18469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                               V128* argL, V128* argR )
18569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
18669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
18769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
18869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
18969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
19069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
19169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
19269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
19369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
19469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
19569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
196ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
197ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
19869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                               V128* argL, V128* argR )
19969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
20069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
20169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
20269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
20369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
20469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
20569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
20669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
20769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
20869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
20969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
210ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
211ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
21269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                               V128* argL, V128* argR )
21369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
21469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
21569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
21669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
21769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
21869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
21969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
22069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
22169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
22269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
22369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
22469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
22569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
22669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
22769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
22869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
22969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
23069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
23169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
232ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
233ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
23469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                               V128* argL, V128* argR )
23569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
23669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
23769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
23869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
23969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
24069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
24169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
24269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
24369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
24469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
24569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
24669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
24769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
24869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
24969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
25069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
25169d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
25269d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
25369d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
254ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
255ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
256d881562f1d18ab33038dab4a1b24823dba3422c0sewardj                                V128* argL, V128* argR )
257d881562f1d18ab33038dab4a1b24823dba3422c0sewardj{
258d881562f1d18ab33038dab4a1b24823dba3422c0sewardj   res->w64[0] = cmpEQ64(argL->w64[0], argR->w64[0]);
259d881562f1d18ab33038dab4a1b24823dba3422c0sewardj   res->w64[1] = cmpEQ64(argL->w64[1], argR->w64[1]);
260d881562f1d18ab33038dab4a1b24823dba3422c0sewardj}
261d881562f1d18ab33038dab4a1b24823dba3422c0sewardj
262ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
263ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
26469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj                                 V128* argL, V128* argR )
26569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj{
26669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
26769d98e3853a63e578e039894e2ef00ca6f9878c8sewardj   res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
26869d98e3853a63e578e039894e2ef00ca6f9878c8sewardj}
26969d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
/* ------------ Shifting ------------ */
/* Note that these primops (Sar64x2, etc) are undefined if the shift
   amount equals or exceeds the lane width, so being given an
   out-of-range shift amount is really an error.  Nevertheless, the
   shift amount is masked here so that the scalar shifts are always
   in range.
*/
277ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid /*not-regparm*/
278ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
2790874beee91898d987c327212c7fb35fdb5db0735sewardj                               V128* argL, UInt nn)
2800874beee91898d987c327212c7fb35fdb5db0735sewardj{
2810874beee91898d987c327212c7fb35fdb5db0735sewardj   /* vassert(nn < 64); */
2820874beee91898d987c327212c7fb35fdb5db0735sewardj   nn &= 63;
2830874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w64[0] = sar64(argL->w64[0], nn);
2840874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w64[1] = sar64(argL->w64[1], nn);
2850874beee91898d987c327212c7fb35fdb5db0735sewardj}
2860874beee91898d987c327212c7fb35fdb5db0735sewardj
287ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid /*not-regparm*/
288ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
2890874beee91898d987c327212c7fb35fdb5db0735sewardj                              V128* argL, UInt nn)
2900874beee91898d987c327212c7fb35fdb5db0735sewardj{
2910874beee91898d987c327212c7fb35fdb5db0735sewardj   /* vassert(nn < 8); */
2920874beee91898d987c327212c7fb35fdb5db0735sewardj   nn &= 7;
2930874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 0] = sar8(argL->w8[ 0], nn);
2940874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 1] = sar8(argL->w8[ 1], nn);
2950874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 2] = sar8(argL->w8[ 2], nn);
2960874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 3] = sar8(argL->w8[ 3], nn);
2970874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 4] = sar8(argL->w8[ 4], nn);
2980874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 5] = sar8(argL->w8[ 5], nn);
2990874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 6] = sar8(argL->w8[ 6], nn);
3000874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 7] = sar8(argL->w8[ 7], nn);
3010874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 8] = sar8(argL->w8[ 8], nn);
3020874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[ 9] = sar8(argL->w8[ 9], nn);
3030874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[10] = sar8(argL->w8[10], nn);
3040874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[11] = sar8(argL->w8[11], nn);
3050874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[12] = sar8(argL->w8[12], nn);
3060874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[13] = sar8(argL->w8[13], nn);
3070874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[14] = sar8(argL->w8[14], nn);
3080874beee91898d987c327212c7fb35fdb5db0735sewardj   res->w8[15] = sar8(argL->w8[15], nn);
3090874beee91898d987c327212c7fb35fdb5db0735sewardj}
31069d98e3853a63e578e039894e2ef00ca6f9878c8sewardj
311ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
312ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
3135f438dd73072211989c6d496845bdc9b777ecbecsewardj                                           V128* argL, V128* argR )
3142260b993536a3d41d5833504371981718b43154esewardj{
3152260b993536a3d41d5833504371981718b43154esewardj   res->w16[0] = qnarrow32Sto16U(argR->w32[0]);
3162260b993536a3d41d5833504371981718b43154esewardj   res->w16[1] = qnarrow32Sto16U(argR->w32[1]);
3172260b993536a3d41d5833504371981718b43154esewardj   res->w16[2] = qnarrow32Sto16U(argR->w32[2]);
3182260b993536a3d41d5833504371981718b43154esewardj   res->w16[3] = qnarrow32Sto16U(argR->w32[3]);
3192260b993536a3d41d5833504371981718b43154esewardj   res->w16[4] = qnarrow32Sto16U(argL->w32[0]);
3202260b993536a3d41d5833504371981718b43154esewardj   res->w16[5] = qnarrow32Sto16U(argL->w32[1]);
3212260b993536a3d41d5833504371981718b43154esewardj   res->w16[6] = qnarrow32Sto16U(argL->w32[2]);
3222260b993536a3d41d5833504371981718b43154esewardj   res->w16[7] = qnarrow32Sto16U(argL->w32[3]);
3232260b993536a3d41d5833504371981718b43154esewardj}
3242260b993536a3d41d5833504371981718b43154esewardj
325ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
326ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
327ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj                                        V128* argL, V128* argR )
328ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj{
329ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 0] = narrow16to8(argR->w16[0]);
330ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 1] = narrow16to8(argR->w16[1]);
331ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 2] = narrow16to8(argR->w16[2]);
332ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 3] = narrow16to8(argR->w16[3]);
333ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 4] = narrow16to8(argR->w16[4]);
334ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 5] = narrow16to8(argR->w16[5]);
335ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 6] = narrow16to8(argR->w16[6]);
336ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 7] = narrow16to8(argR->w16[7]);
337ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 8] = narrow16to8(argL->w16[0]);
338ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[ 9] = narrow16to8(argL->w16[1]);
339ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[10] = narrow16to8(argL->w16[2]);
340ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[11] = narrow16to8(argL->w16[3]);
341ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[12] = narrow16to8(argL->w16[4]);
342ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[13] = narrow16to8(argL->w16[5]);
343ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[14] = narrow16to8(argL->w16[6]);
344ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w8[15] = narrow16to8(argL->w16[7]);
345ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj}
346ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj
347ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardjvoid VEX_REGPARM(3)
348ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj     h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
349ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj                                        V128* argL, V128* argR )
350ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj{
351ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w16[0] = narrow32to16(argR->w32[0]);
352ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w16[1] = narrow32to16(argR->w32[1]);
353ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w16[2] = narrow32to16(argR->w32[2]);
354ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w16[3] = narrow32to16(argR->w32[3]);
355ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w16[4] = narrow32to16(argL->w32[0]);
356ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w16[5] = narrow32to16(argL->w32[1]);
357ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w16[6] = narrow32to16(argL->w32[2]);
358ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj   res->w16[7] = narrow32to16(argL->w32[3]);
359ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj}
360ad2c9ea0c360fced134b2dd0d4b28c0be3639cfbsewardj
361d8bca7e48c8b3ad028878ab501197ba701df43e5sewardjvoid VEX_REGPARM(3)
362d8bca7e48c8b3ad028878ab501197ba701df43e5sewardj     h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
363d8bca7e48c8b3ad028878ab501197ba701df43e5sewardj                               V128* argL, V128* argR )
364d8bca7e48c8b3ad028878ab501197ba701df43e5sewardj{
365d8bca7e48c8b3ad028878ab501197ba701df43e5sewardj   res->w32[0] = argL->w32[ argR->w32[0] & 3 ];
366d8bca7e48c8b3ad028878ab501197ba701df43e5sewardj   res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
367d8bca7e48c8b3ad028878ab501197ba701df43e5sewardj   res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
368d8bca7e48c8b3ad028878ab501197ba701df43e5sewardj   res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
369d8bca7e48c8b3ad028878ab501197ba701df43e5sewardj}
370d8bca7e48c8b3ad028878ab501197ba701df43e5sewardj
37178a20592024c096430f630006412fec063568bfdsewardjUInt /*not-regparm*/
37278a20592024c096430f630006412fec063568bfdsewardj     h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
37378a20592024c096430f630006412fec063568bfdsewardj{
37478a20592024c096430f630006412fec063568bfdsewardj   UInt r = 0;
37578a20592024c096430f630006412fec063568bfdsewardj   if (w64hi & (1ULL << (64-1))) r |= (1<<15);
37678a20592024c096430f630006412fec063568bfdsewardj   if (w64hi & (1ULL << (56-1))) r |= (1<<14);
37778a20592024c096430f630006412fec063568bfdsewardj   if (w64hi & (1ULL << (48-1))) r |= (1<<13);
37878a20592024c096430f630006412fec063568bfdsewardj   if (w64hi & (1ULL << (40-1))) r |= (1<<12);
37978a20592024c096430f630006412fec063568bfdsewardj   if (w64hi & (1ULL << (32-1))) r |= (1<<11);
38078a20592024c096430f630006412fec063568bfdsewardj   if (w64hi & (1ULL << (24-1))) r |= (1<<10);
38178a20592024c096430f630006412fec063568bfdsewardj   if (w64hi & (1ULL << (16-1))) r |= (1<<9);
38278a20592024c096430f630006412fec063568bfdsewardj   if (w64hi & (1ULL << ( 8-1))) r |= (1<<8);
38378a20592024c096430f630006412fec063568bfdsewardj   if (w64lo & (1ULL << (64-1))) r |= (1<<7);
38478a20592024c096430f630006412fec063568bfdsewardj   if (w64lo & (1ULL << (56-1))) r |= (1<<6);
38578a20592024c096430f630006412fec063568bfdsewardj   if (w64lo & (1ULL << (48-1))) r |= (1<<5);
38678a20592024c096430f630006412fec063568bfdsewardj   if (w64lo & (1ULL << (40-1))) r |= (1<<4);
38778a20592024c096430f630006412fec063568bfdsewardj   if (w64lo & (1ULL << (32-1))) r |= (1<<3);
38878a20592024c096430f630006412fec063568bfdsewardj   if (w64lo & (1ULL << (24-1))) r |= (1<<2);
38978a20592024c096430f630006412fec063568bfdsewardj   if (w64lo & (1ULL << (16-1))) r |= (1<<1);
39078a20592024c096430f630006412fec063568bfdsewardj   if (w64lo & (1ULL << ( 8-1))) r |= (1<<0);
39178a20592024c096430f630006412fec063568bfdsewardj   return r;
39278a20592024c096430f630006412fec063568bfdsewardj}
3932260b993536a3d41d5833504371981718b43154esewardj
39469d98e3853a63e578e039894e2ef00ca6f9878c8sewardj/*---------------------------------------------------------------*/
39569d98e3853a63e578e039894e2ef00ca6f9878c8sewardj/*--- end                              host_generic_simd128.c ---*/
39669d98e3853a63e578e039894e2ef00ca6f9878c8sewardj/*---------------------------------------------------------------*/
397