
/*---------------------------------------------------------------*/
/*--- begin                            host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2010-2013 OpenWorks GbR
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.
*/

/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd128.h"


/* Primitive helpers always take args of the real type (signed vs
   unsigned) but return an unsigned result, so there's no conversion
   weirdness when stuffing results back in the V128 union fields,
   which are all unsigned. */
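
/* For example, max8S(-1, 0) below compares its arguments as signed
   values (so 0 wins), but hands the winner back as a UChar (0x00),
   ready to be stored directly into a w8 lane. */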

static inline UInt mul32 ( Int xx, Int yy )
{
   Long t = ((Long)xx) * ((Long)yy);
   return toUInt(t);
}
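
/* mul32 keeps only the low 32 bits of the product, so for example
   mul32(0x10000, 0x10000) yields 0 -- the same truncating behaviour
   as the Mul32x4 lanes it serves. */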

static inline UInt max32S ( Int xx, Int yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32S ( Int xx, Int yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UInt max32U ( UInt xx, UInt yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32U ( UInt xx, UInt yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UShort max16U ( UShort xx, UShort yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UShort min16U ( UShort xx, UShort yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar max8S ( Char xx, Char yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline UChar min8S ( Char xx, Char yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline ULong cmpEQ64 ( Long xx, Long yy )
{
   return (xx == yy) ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}

static inline ULong cmpGT64S ( Long xx, Long yy )
{
   return (xx > yy) ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}

static inline ULong sar64 ( ULong v, UInt n )
{
   return ((Long)v) >> n;
}
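
/* Note: sar64 (and sar8 below) relies on ">>" applied to a negative
   signed value being an arithmetic (sign-propagating) shift.  ISO C
   leaves that implementation-defined, but gcc and clang both define
   it as an arithmetic shift. */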

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}
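
/* Worked examples: sar8(0x80, 7) sign-extends to 0xFF, whereas
   sar8(0x7F, 7) gives 0x00. */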

static inline UShort qnarrow32Sto16U ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < 0)     xx = 0;
   if (xx > 65535) xx = 65535;
   return (UShort)xx;
}
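
/* Worked examples: qnarrow32Sto16U(0xFFFFFFFF) is 0 (the input is -1
   when viewed as signed, so it clamps at the bottom), 70000 clamps
   to 65535, and 1234 passes through unchanged. */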

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}


void VEX_REGPARM(3)
     h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                              V128* argL, V128* argR )
{
   res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
   res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
   res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
   res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
}

void VEX_REGPARM(3)
     h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
}

void VEX_REGPARM(3)
     h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
{
   res->w64[0] = cmpEQ64(argL->w64[0], argR->w64[0]);
   res->w64[1] = cmpEQ64(argL->w64[1], argR->w64[1]);
}

void VEX_REGPARM(3)
     h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
                                 V128* argL, V128* argR )
{
   res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
   res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  Indeed, given the
   semantics of these primops (SarN64x2, etc), it is an error ever to
   be handed an out-of-range shift amount in the first place.
*/
void /*not-regparm*/
     h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
{
   /* vassert(nn < 64); */
   nn &= 63;
   res->w64[0] = sar64(argL->w64[0], nn);
   res->w64[1] = sar64(argL->w64[1], nn);
}
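
/* For instance, with argL->w64[0] == 0x8000000000000000ULL and
   nn == 4, the lane becomes 0xF800000000000000ULL: the sign bit is
   replicated into the four vacated positions. */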

void /*not-regparm*/
     h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
{
   /* vassert(nn < 8); */
   nn &= 7;
   res->w8[ 0] = sar8(argL->w8[ 0], nn);
   res->w8[ 1] = sar8(argL->w8[ 1], nn);
   res->w8[ 2] = sar8(argL->w8[ 2], nn);
   res->w8[ 3] = sar8(argL->w8[ 3], nn);
   res->w8[ 4] = sar8(argL->w8[ 4], nn);
   res->w8[ 5] = sar8(argL->w8[ 5], nn);
   res->w8[ 6] = sar8(argL->w8[ 6], nn);
   res->w8[ 7] = sar8(argL->w8[ 7], nn);
   res->w8[ 8] = sar8(argL->w8[ 8], nn);
   res->w8[ 9] = sar8(argL->w8[ 9], nn);
   res->w8[10] = sar8(argL->w8[10], nn);
   res->w8[11] = sar8(argL->w8[11], nn);
   res->w8[12] = sar8(argL->w8[12], nn);
   res->w8[13] = sar8(argL->w8[13], nn);
   res->w8[14] = sar8(argL->w8[14], nn);
   res->w8[15] = sar8(argL->w8[15], nn);
}

void VEX_REGPARM(3)
     h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
                                           V128* argL, V128* argR )
{
   res->w16[0] = qnarrow32Sto16U(argR->w32[0]);
   res->w16[1] = qnarrow32Sto16U(argR->w32[1]);
   res->w16[2] = qnarrow32Sto16U(argR->w32[2]);
   res->w16[3] = qnarrow32Sto16U(argR->w32[3]);
   res->w16[4] = qnarrow32Sto16U(argL->w32[0]);
   res->w16[5] = qnarrow32Sto16U(argL->w32[1]);
   res->w16[6] = qnarrow32Sto16U(argL->w32[2]);
   res->w16[7] = qnarrow32Sto16U(argL->w32[3]);
}
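
/* Lane placement: the low four 16-bit lanes of the result are
   narrowed from argR and the high four from argL, i.e. the high half
   of the result comes from the left operand, matching the intended
   semantics of Iop_QNarrowBin32Sto16Ux8.  The same ordering is used
   by the two non-saturating narrowing helpers below. */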

void VEX_REGPARM(3)
     h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
                                        V128* argL, V128* argR )
{
   res->w8[ 0] = narrow16to8(argR->w16[0]);
   res->w8[ 1] = narrow16to8(argR->w16[1]);
   res->w8[ 2] = narrow16to8(argR->w16[2]);
   res->w8[ 3] = narrow16to8(argR->w16[3]);
   res->w8[ 4] = narrow16to8(argR->w16[4]);
   res->w8[ 5] = narrow16to8(argR->w16[5]);
   res->w8[ 6] = narrow16to8(argR->w16[6]);
   res->w8[ 7] = narrow16to8(argR->w16[7]);
   res->w8[ 8] = narrow16to8(argL->w16[0]);
   res->w8[ 9] = narrow16to8(argL->w16[1]);
   res->w8[10] = narrow16to8(argL->w16[2]);
   res->w8[11] = narrow16to8(argL->w16[3]);
   res->w8[12] = narrow16to8(argL->w16[4]);
   res->w8[13] = narrow16to8(argL->w16[5]);
   res->w8[14] = narrow16to8(argL->w16[6]);
   res->w8[15] = narrow16to8(argL->w16[7]);
}

void VEX_REGPARM(3)
     h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
                                        V128* argL, V128* argR )
{
   res->w16[0] = narrow32to16(argR->w32[0]);
   res->w16[1] = narrow32to16(argR->w32[1]);
   res->w16[2] = narrow32to16(argR->w32[2]);
   res->w16[3] = narrow32to16(argR->w32[3]);
   res->w16[4] = narrow32to16(argL->w32[0]);
   res->w16[5] = narrow32to16(argL->w32[1]);
   res->w16[6] = narrow32to16(argL->w32[2]);
   res->w16[7] = narrow32to16(argL->w32[3]);
}

void VEX_REGPARM(3)
     h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = argL->w32[ argR->w32[0] & 3 ];
   res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
   res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
   res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
}
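
/* Each result lane selects one 32-bit lane of argL, indexed by the
   low two bits of the corresponding argR lane; any higher bits in
   the control words are ignored.  E.g. if argR->w32[0] == 7 then
   res->w32[0] = argL->w32[3]. */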

UInt /*not-regparm*/
     h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
{
   UInt r = 0;
   if (w64hi & (1ULL << (64-1))) r |= (1<<15);
   if (w64hi & (1ULL << (56-1))) r |= (1<<14);
   if (w64hi & (1ULL << (48-1))) r |= (1<<13);
   if (w64hi & (1ULL << (40-1))) r |= (1<<12);
   if (w64hi & (1ULL << (32-1))) r |= (1<<11);
   if (w64hi & (1ULL << (24-1))) r |= (1<<10);
   if (w64hi & (1ULL << (16-1))) r |= (1<<9);
   if (w64hi & (1ULL << ( 8-1))) r |= (1<<8);
   if (w64lo & (1ULL << (64-1))) r |= (1<<7);
   if (w64lo & (1ULL << (56-1))) r |= (1<<6);
   if (w64lo & (1ULL << (48-1))) r |= (1<<5);
   if (w64lo & (1ULL << (40-1))) r |= (1<<4);
   if (w64lo & (1ULL << (32-1))) r |= (1<<3);
   if (w64lo & (1ULL << (24-1))) r |= (1<<2);
   if (w64lo & (1ULL << (16-1))) r |= (1<<1);
   if (w64lo & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}
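
/* This gathers the most significant bit of each of the 16 bytes in
   the vector (w64hi:w64lo) into a 16-bit mask, with byte 15's MSB
   landing in bit 15 -- essentially the computation behind x86's
   PMOVMSKB. */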

/*---------------------------------------------------------------*/
/*--- end                              host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/