// fwd_txfm_sse2.h revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#ifndef VPX_DSP_X86_FWD_TXFM_SSE2_H_
12#define VPX_DSP_X86_FWD_TXFM_SSE2_H_
13
14#ifdef __cplusplus
15extern "C" {
16#endif
17
// Builds a register of four 32-bit lanes holding, from the lowest lane to the
// highest, a, b, a, b. Each argument is cast to int and appears twice in the
// expansion, so do not pass expressions with side effects.
#define pair_set_epi32(a, b) \
  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
20
21static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
22  __m128i buf0, buf1;
23  buf0 = _mm_mul_epu32(a, b);
24  a = _mm_srli_epi64(a, 32);
25  b = _mm_srli_epi64(b, 32);
26  buf1 = _mm_mul_epu32(a, b);
27  return _mm_add_epi64(buf0, buf1);
28}
29
30static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
31  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
32  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
33  return _mm_unpacklo_epi64(buf0, buf1);
34}
35
36static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
37                                          const __m128i *preg1) {
38  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
39  const __m128i min_overflow = _mm_set1_epi16(0x8000);
40  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
41                              _mm_cmpeq_epi16(*preg0, min_overflow));
42  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
43                              _mm_cmpeq_epi16(*preg1, min_overflow));
44  cmp0 = _mm_or_si128(cmp0, cmp1);
45  return _mm_movemask_epi8(cmp0);
46}
47
48static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
49                                          const __m128i *preg1,
50                                          const __m128i *preg2,
51                                          const __m128i *preg3) {
52  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
53  const __m128i min_overflow = _mm_set1_epi16(0x8000);
54  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
55                              _mm_cmpeq_epi16(*preg0, min_overflow));
56  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
57                              _mm_cmpeq_epi16(*preg1, min_overflow));
58  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
59                              _mm_cmpeq_epi16(*preg2, min_overflow));
60  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
61                              _mm_cmpeq_epi16(*preg3, min_overflow));
62  cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
63  return _mm_movemask_epi8(cmp0);
64}
65
66static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
67                                          const __m128i *preg1,
68                                          const __m128i *preg2,
69                                          const __m128i *preg3,
70                                          const __m128i *preg4,
71                                          const __m128i *preg5,
72                                          const __m128i *preg6,
73                                          const __m128i *preg7) {
74  int res0, res1;
75  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
76  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
77  return res0 + res1;
78}
79
80static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
81                                           const __m128i *preg1,
82                                           const __m128i *preg2,
83                                           const __m128i *preg3,
84                                           const __m128i *preg4,
85                                           const __m128i *preg5,
86                                           const __m128i *preg6,
87                                           const __m128i *preg7,
88                                           const __m128i *preg8,
89                                           const __m128i *preg9,
90                                           const __m128i *preg10,
91                                           const __m128i *preg11) {
92  int res0, res1;
93  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
94  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
95  if (!res0)
96    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
97  return res0 + res1;
98}
99
100static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
101                                           const __m128i *preg1,
102                                           const __m128i *preg2,
103                                           const __m128i *preg3,
104                                           const __m128i *preg4,
105                                           const __m128i *preg5,
106                                           const __m128i *preg6,
107                                           const __m128i *preg7,
108                                           const __m128i *preg8,
109                                           const __m128i *preg9,
110                                           const __m128i *preg10,
111                                           const __m128i *preg11,
112                                           const __m128i *preg12,
113                                           const __m128i *preg13,
114                                           const __m128i *preg14,
115                                           const __m128i *preg15) {
116  int res0, res1;
117  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
118  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
119  if (!res0) {
120    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
121    if (!res1)
122      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
123  }
124  return res0 + res1;
125}
126
127static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
128                                           const __m128i *preg1,
129                                           const __m128i *preg2,
130                                           const __m128i *preg3,
131                                           const __m128i *preg4,
132                                           const __m128i *preg5,
133                                           const __m128i *preg6,
134                                           const __m128i *preg7,
135                                           const __m128i *preg8,
136                                           const __m128i *preg9,
137                                           const __m128i *preg10,
138                                           const __m128i *preg11,
139                                           const __m128i *preg12,
140                                           const __m128i *preg13,
141                                           const __m128i *preg14,
142                                           const __m128i *preg15,
143                                           const __m128i *preg16,
144                                           const __m128i *preg17,
145                                           const __m128i *preg18,
146                                           const __m128i *preg19,
147                                           const __m128i *preg20,
148                                           const __m128i *preg21,
149                                           const __m128i *preg22,
150                                           const __m128i *preg23,
151                                           const __m128i *preg24,
152                                           const __m128i *preg25,
153                                           const __m128i *preg26,
154                                           const __m128i *preg27,
155                                           const __m128i *preg28,
156                                           const __m128i *preg29,
157                                           const __m128i *preg30,
158                                           const __m128i *preg31) {
159  int res0, res1;
160  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
161  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
162  if (!res0) {
163    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
164    if (!res1) {
165      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
166      if (!res0) {
167        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
168        if (!res1) {
169          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
170          if (!res0) {
171            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
172            if (!res1)
173              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
174          }
175        }
176      }
177    }
178  }
179  return res0 + res1;
180}
181
182static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
183                                           const __m128i *preg1,
184                                           const __m128i *preg2,
185                                           const __m128i *preg3,
186                                           const __m128i *zero) {
187  __m128i minus_one = _mm_set1_epi32(-1);
188  // Check for overflows
189  __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
190  __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
191  __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
192  __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
193  __m128i reg0_top_dwords = _mm_shuffle_epi32(
194      reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
195  __m128i reg1_top_dwords = _mm_shuffle_epi32(
196      reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
197  __m128i reg2_top_dwords = _mm_shuffle_epi32(
198      reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
199  __m128i reg3_top_dwords = _mm_shuffle_epi32(
200      reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
201  __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
202  __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
203  __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
204  __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
205  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
206  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
207  int overflow_01 = _mm_movemask_epi8(
208      _mm_cmpeq_epi32(valid_positve_01, valid_negative_01));
209  int overflow_23 = _mm_movemask_epi8(
210      _mm_cmpeq_epi32(valid_positve_23, valid_negative_23));
211  return (overflow_01 + overflow_23);
212}
213
214static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
215                                           const __m128i *preg1,
216                                           const __m128i *preg2,
217                                           const __m128i *preg3,
218                                           const __m128i *preg4,
219                                           const __m128i *preg5,
220                                           const __m128i *preg6,
221                                           const __m128i *preg7,
222                                           const __m128i *zero) {
223  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
224  if (!overflow) {
225    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
226  }
227  return overflow;
228}
229
230static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
231                                            const __m128i *preg1,
232                                            const __m128i *preg2,
233                                            const __m128i *preg3,
234                                            const __m128i *preg4,
235                                            const __m128i *preg5,
236                                            const __m128i *preg6,
237                                            const __m128i *preg7,
238                                            const __m128i *preg8,
239                                            const __m128i *preg9,
240                                            const __m128i *preg10,
241                                            const __m128i *preg11,
242                                            const __m128i *preg12,
243                                            const __m128i *preg13,
244                                            const __m128i *preg14,
245                                            const __m128i *preg15,
246                                            const __m128i *zero) {
247  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
248  if (!overflow) {
249    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
250    if (!overflow) {
251      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11,
252                                          zero);
253      if (!overflow) {
254        overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
255                                            zero);
256      }
257    }
258  }
259  return overflow;
260}
261
262static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
263                                            const __m128i *preg1,
264                                            const __m128i *preg2,
265                                            const __m128i *preg3,
266                                            const __m128i *preg4,
267                                            const __m128i *preg5,
268                                            const __m128i *preg6,
269                                            const __m128i *preg7,
270                                            const __m128i *preg8,
271                                            const __m128i *preg9,
272                                            const __m128i *preg10,
273                                            const __m128i *preg11,
274                                            const __m128i *preg12,
275                                            const __m128i *preg13,
276                                            const __m128i *preg14,
277                                            const __m128i *preg15,
278                                            const __m128i *preg16,
279                                            const __m128i *preg17,
280                                            const __m128i *preg18,
281                                            const __m128i *preg19,
282                                            const __m128i *preg20,
283                                            const __m128i *preg21,
284                                            const __m128i *preg22,
285                                            const __m128i *preg23,
286                                            const __m128i *preg24,
287                                            const __m128i *preg25,
288                                            const __m128i *preg26,
289                                            const __m128i *preg27,
290                                            const __m128i *preg28,
291                                            const __m128i *preg29,
292                                            const __m128i *preg30,
293                                            const __m128i *preg31,
294                                            const __m128i *zero) {
295  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
296  if (!overflow) {
297    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
298    if (!overflow) {
299      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
300      if (!overflow) {
301        overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
302                                            zero);
303        if (!overflow) {
304          overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19,
305                                              zero);
306          if (!overflow) {
307            overflow = k_check_epi32_overflow_4(preg20, preg21,
308                                                preg22, preg23, zero);
309            if (!overflow) {
310              overflow = k_check_epi32_overflow_4(preg24, preg25,
311                                                  preg26, preg27, zero);
312              if (!overflow) {
313                overflow = k_check_epi32_overflow_4(preg28, preg29,
314                                                    preg30, preg31, zero);
315              }
316            }
317          }
318        }
319      }
320    }
321  }
322  return overflow;
323}
324
325static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) {
326#if CONFIG_VP9_HIGHBITDEPTH
327  const __m128i zero = _mm_setzero_si128();
328  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
329  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
330  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
331  _mm_store_si128((__m128i *)(dst_ptr), out0);
332  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
333#else
334  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
335#endif  // CONFIG_VP9_HIGHBITDEPTH
336}
337
338static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) {
339#if CONFIG_VP9_HIGHBITDEPTH
340  const __m128i zero = _mm_setzero_si128();
341  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
342  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
343  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
344  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
345  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
346#else
347  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
348#endif  // CONFIG_VP9_HIGHBITDEPTH
349}
350
351
352static INLINE __m128i mult_round_shift(const __m128i *pin0,
353                                       const __m128i *pin1,
354                                       const __m128i *pmultiplier,
355                                       const __m128i *prounding,
356                                       const int shift) {
357  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
358  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
359  const __m128i v0 = _mm_add_epi32(u0, *prounding);
360  const __m128i v1 = _mm_add_epi32(u1, *prounding);
361  const __m128i w0 = _mm_srai_epi32(v0, shift);
362  const __m128i w1 = _mm_srai_epi32(v1, shift);
363  return _mm_packs_epi32(w0, w1);
364}
365
366static INLINE void transpose_and_output8x8(
367    const __m128i *pin00, const __m128i *pin01,
368    const __m128i *pin02, const __m128i *pin03,
369    const __m128i *pin04, const __m128i *pin05,
370    const __m128i *pin06, const __m128i *pin07,
371    const int pass, int16_t* out0_ptr,
372    tran_low_t* out1_ptr) {
373  // 00 01 02 03 04 05 06 07
374  // 10 11 12 13 14 15 16 17
375  // 20 21 22 23 24 25 26 27
376  // 30 31 32 33 34 35 36 37
377  // 40 41 42 43 44 45 46 47
378  // 50 51 52 53 54 55 56 57
379  // 60 61 62 63 64 65 66 67
380  // 70 71 72 73 74 75 76 77
381  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
382  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
383  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
384  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
385  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
386  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
387  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
388  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
389  // 00 10 01 11 02 12 03 13
390  // 20 30 21 31 22 32 23 33
391  // 04 14 05 15 06 16 07 17
392  // 24 34 25 35 26 36 27 37
393  // 40 50 41 51 42 52 43 53
394  // 60 70 61 71 62 72 63 73
395  // 54 54 55 55 56 56 57 57
396  // 64 74 65 75 66 76 67 77
397  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
398  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
399  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
400  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
401  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
402  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
403  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
404  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
405  // 00 10 20 30 01 11 21 31
406  // 40 50 60 70 41 51 61 71
407  // 02 12 22 32 03 13 23 33
408  // 42 52 62 72 43 53 63 73
409  // 04 14 24 34 05 15 21 36
410  // 44 54 64 74 45 55 61 76
411  // 06 16 26 36 07 17 27 37
412  // 46 56 66 76 47 57 67 77
413  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
414  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
415  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
416  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
417  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
418  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
419  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
420  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
421  // 00 10 20 30 40 50 60 70
422  // 01 11 21 31 41 51 61 71
423  // 02 12 22 32 42 52 62 72
424  // 03 13 23 33 43 53 63 73
425  // 04 14 24 34 44 54 64 74
426  // 05 15 25 35 45 55 65 75
427  // 06 16 26 36 46 56 66 76
428  // 07 17 27 37 47 57 67 77
429  if (pass == 0) {
430    _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
431    _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
432    _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
433    _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
434    _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
435    _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
436    _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
437    _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
438  } else {
439    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
440    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
441    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
442    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
443    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
444    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
445    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
446    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
447  }
448}
449
450#ifdef __cplusplus
451}  // extern "C"
452#endif
453
454#endif  // VPX_DSP_X86_FWD_TXFM_SSE2_H_
455