
/*---------------------------------------------------------------*/
/*--- begin                            host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2010-2010 OpenWorks GbR
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.
*/

/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */
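
/* Illustrative only: a back end that cannot encode, say, a 64x2
   signed compare as inline host code can instead spill the operands
   and emit a call to the matching helper, roughly

      h_generic_calc_CmpGT64Sx2(&dst, &srcL, &srcR);

   where dst, srcL and srcR are V128 values in memory.  The exact
   call mechanics are back-end specific. */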

#include "libvex_basictypes.h"
#include "host_generic_simd128.h"
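
/* For reference, V128 (defined in host_generic_simd128.h) is a union
   giving 8-, 16-, 32- and 64-bit lane views of the same 128 bits,
   approximately:

      typedef union {
         UChar  w8[16];
         UShort w16[8];
         UInt   w32[4];
         ULong  w64[2];
      } V128;

   Each helper below simply picks the lane view it needs. */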


/* Primitive helpers always take args of the real type (signed vs
   unsigned) but return an unsigned result, so there's no conversion
   weirdness when stuffing results back in the V128 union fields,
   which are all unsigned. */
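
/* Illustrative only: max8S compares its arguments as signed Chars but
   hands the winner back as a UChar, so storing it into the unsigned
   w8[] lanes involves no further conversion:

      UChar r = max8S(-1, 3);   // signed compare: -1 < 3, so r == 3

   toUChar/toUShort/toUInt are the narrowing helpers from
   libvex_basictypes.h. */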

static inline UInt mul32 ( Int xx, Int yy )
{
   /* Signed multiply; only the low 32 bits of the product are kept. */
   Int t = xx * yy;
   return toUInt(t);
}

static inline UInt max32S ( Int xx, Int yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32S ( Int xx, Int yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UInt max32U ( UInt xx, UInt yy )
{
   return toUInt((xx > yy) ? xx : yy);
}

static inline UInt min32U ( UInt xx, UInt yy )
{
   return toUInt((xx < yy) ? xx : yy);
}

static inline UShort max16U ( UShort xx, UShort yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UShort min16U ( UShort xx, UShort yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar max8S ( Char xx, Char yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline UChar min8S ( Char xx, Char yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline ULong cmpGT64S ( Long xx, Long yy )
{
   /* All-ones lane mask if xx > yy (signed), else all zeroes. */
   return (xx > yy) ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}

static inline ULong sar64 ( ULong v, UInt n )
{
   /* Arithmetic (sign-propagating) right shift. */
   return ((Long)v) >> n;
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

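/* Each h_generic_calc_* routine below applies its scalar helper to
   every lane of the V128 operand(s), writing the results lane-wise
   into *res. */
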
void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                              V128* argL, V128* argR )
{
   res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
   res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
   res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
   res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
}

void h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
}

void h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
}

void h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
}

void h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
   res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
   res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
   res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
}

void h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
}

void h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
   res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
   res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
   res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
   res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
   res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
   res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
   res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
}

void h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
}

void h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
{
   res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
   res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
   res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
   res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
   res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
   res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
   res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
   res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
   res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
   res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
   res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
   res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
   res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
   res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
   res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
   res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
}

void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
                                 V128* argL, V128* argR )
{
   res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
   res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (SarN64x2, etc) it is an error if we
   are ever given an out-of-range shift amount.
*/
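/* Illustrative only: these are arithmetic shifts, so the sign bit is
   replicated into the vacated positions, e.g.

      sar8(0x80, 3) == 0xF0   // (Char)0x80 is -128; -128 >> 3 is -16

   Right-shifting a negative signed value is strictly
   implementation-defined in C; this code assumes the usual
   sign-propagating behaviour. */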
void h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
{
   /* vassert(nn < 64); */
   nn &= 63;
   res->w64[0] = sar64(argL->w64[0], nn);
   res->w64[1] = sar64(argL->w64[1], nn);
}

void h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
{
   /* vassert(nn < 8); */
   nn &= 7;
   res->w8[ 0] = sar8(argL->w8[ 0], nn);
   res->w8[ 1] = sar8(argL->w8[ 1], nn);
   res->w8[ 2] = sar8(argL->w8[ 2], nn);
   res->w8[ 3] = sar8(argL->w8[ 3], nn);
   res->w8[ 4] = sar8(argL->w8[ 4], nn);
   res->w8[ 5] = sar8(argL->w8[ 5], nn);
   res->w8[ 6] = sar8(argL->w8[ 6], nn);
   res->w8[ 7] = sar8(argL->w8[ 7], nn);
   res->w8[ 8] = sar8(argL->w8[ 8], nn);
   res->w8[ 9] = sar8(argL->w8[ 9], nn);
   res->w8[10] = sar8(argL->w8[10], nn);
   res->w8[11] = sar8(argL->w8[11], nn);
   res->w8[12] = sar8(argL->w8[12], nn);
   res->w8[13] = sar8(argL->w8[13], nn);
   res->w8[14] = sar8(argL->w8[14], nn);
   res->w8[15] = sar8(argL->w8[15], nn);
}

/*---------------------------------------------------------------*/
/*--- end                              host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/