1/* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ 12#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ 13 14#include <smmintrin.h> // SSE4.1 15 16#include "./vpx_config.h" 17#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" 18 19static INLINE __m128i multiplication_round_shift_sse4_1( 20 const __m128i *const in /*in[2]*/, const int c) { 21 const __m128i pair_c = pair_set_epi32(c * 4, 0); 22 __m128i t0, t1; 23 24 t0 = _mm_mul_epi32(in[0], pair_c); 25 t1 = _mm_mul_epi32(in[1], pair_c); 26 t0 = dct_const_round_shift_64bit(t0); 27 t1 = dct_const_round_shift_64bit(t1); 28 29 return pack_4(t0, t1); 30} 31 32static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1, 33 const int c0, const int c1, 34 __m128i *const out0, 35 __m128i *const out1) { 36 const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); 37 const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); 38 __m128i temp1[4], temp2[4]; 39 40 extend_64bit(in0, temp1); 41 extend_64bit(in1, temp2); 42 temp1[2] = _mm_mul_epi32(temp1[0], pair_c1); 43 temp1[3] = _mm_mul_epi32(temp1[1], pair_c1); 44 temp1[0] = _mm_mul_epi32(temp1[0], pair_c0); 45 temp1[1] = _mm_mul_epi32(temp1[1], pair_c0); 46 temp2[2] = _mm_mul_epi32(temp2[0], pair_c0); 47 temp2[3] = _mm_mul_epi32(temp2[1], pair_c0); 48 temp2[0] = _mm_mul_epi32(temp2[0], pair_c1); 49 temp2[1] = _mm_mul_epi32(temp2[1], pair_c1); 50 temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); 51 temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); 52 temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); 53 temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); 54 temp1[0] = dct_const_round_shift_64bit(temp1[0]); 55 temp1[1] = dct_const_round_shift_64bit(temp1[1]); 56 temp2[0] = dct_const_round_shift_64bit(temp2[0]); 57 temp2[1] = dct_const_round_shift_64bit(temp2[1]); 58 *out0 = pack_4(temp1[0], temp1[1]); 59 *out1 = pack_4(temp2[0], temp2[1]); 60} 61 62static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0, 63 const __m128i in1, 64 __m128i *const out0, 65 __m128i *const out1) { 66 __m128i temp1[2], temp2; 67 68 temp2 = _mm_add_epi32(in0, in1); 69 extend_64bit(temp2, temp1); 70 *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); 71 temp2 = _mm_sub_epi32(in0, in1); 72 extend_64bit(temp2, temp1); 73 *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); 74} 75 76static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, 77 const int c0, const int c1, 78 __m128i *const out0, 79 __m128i *const out1) { 80 __m128i temp[2]; 81 82 extend_64bit(in, temp); 83 *out0 = multiplication_round_shift_sse4_1(temp, c0); 84 *out1 = multiplication_round_shift_sse4_1(temp, c1); 85} 86 87#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ 88