ih264_ihadamard_scaling_sse42.c revision 7497191460a9504f8b4f64df169ab633f0b74353
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
 *******************************************************************************
 * @file
 *  ih264_ihadamard_scaling_sse42.c
 *
 * @brief
 *  Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
 *
 * @author
 *  Mohit
 *
 * @par List of Functions:
 *  - ih264_ihadamard_scaling_4x4_sse42()
 *  - ih264_ihadamard_scaling_2x2_uv_sse42()
 *
 * @remarks
 *
 *******************************************************************************
 */
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
Krahn 43c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn/* User include files */ 442ee32a9d61896f544d87ecee24dc25cc33c9ebb3Utkarsh Sanghi#include "ih264_typedefs.h" 45e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi#include "ih264_defs.h" 4639dd58452d7fe3ebe93490a6239a76b385bd676dUtkarsh Sanghi#include "ih264_trans_macros.h" 474dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn#include "ih264_macros.h" 48c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include "ih264_trans_data.h" 49c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include "ih264_size_defs.h" 50c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include "ih264_structs.h" 51c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include "ih264_trans_quant_itrans_iquant.h" 52c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include <immintrin.h> 53c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn 54c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn/* 55c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn ******************************************************************************** 56c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * 57c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients 58c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * of a 16x16 intra prediction macroblock, and then performs scaling. 59887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi * prediction buffer 60887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi * 61e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi * @par Description: 62e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi * The DC coefficients pass through a 2-stage inverse hadamard transform. 63887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi * This inverse transformed content is scaled to based on Qp value. 
64e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi * 65887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi * @param[in] pi2_src 66e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi * input 4x4 block of DC coefficients 67e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi * 68887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi * @param[out] pi2_out 69e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi * output 4x4 block 70e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi * 71c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * @param[in] pu2_iscal_mat 72c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * pointer to scaling list 73c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * 74c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * @param[in] pu2_weigh_mat 75c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * pointer to weight matrix 7687b642d428ca28ff81b97ab0617f2c04d8605683Darren Krahn * 7787b642d428ca28ff81b97ab0617f2c04d8605683Darren Krahn * @param[in] u4_qp_div_6 7887b642d428ca28ff81b97ab0617f2c04d8605683Darren Krahn * Floor (qp/6) 79c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * 804dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn * @param[in] pi4_tmp 81c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * temporary buffer of size 1*16 8203d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn * 8303d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn * @returns none 8403d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn * 85c04decde2a6e9b74395f402818852e752534254bUtkarsh Sanghi * @remarks none 8603d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn * 8703d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn ******************************************************************************* 8803d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn */ 8903d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahnvoid ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, 9003d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn WORD16* 
pi2_out, 91c04decde2a6e9b74395f402818852e752534254bUtkarsh Sanghi const UWORD16 *pu2_iscal_mat, 9203d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn const UWORD16 *pu2_weigh_mat, 9303d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn UWORD32 u4_qp_div_6, 9403d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn WORD32* pi4_tmp) 9503d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn{ 9603d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn __m128i src_r0_r1, src_r2_r3; 9703d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn __m128i src_r0, src_r1, src_r2, src_r3; 9803d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn __m128i temp0, temp1, temp2, temp3; 99d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6))); 100d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); 10130c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez UNUSED (pi4_tmp); 102d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi 103d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row 104d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row 105d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi //sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); 106d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi src_r0 = _mm_cvtepi16_epi32(src_r0_r1); 107d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi src_r0_r1 = _mm_srli_si128(src_r0_r1, 8); 108d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi src_r1 = _mm_cvtepi16_epi32(src_r0_r1); 10930c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez 110d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi src_r2 = _mm_cvtepi16_epi32(src_r2_r3); 
1114dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn src_r2_r3 = _mm_srli_si128(src_r2_r3, 8); 1124dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn src_r3 = _mm_cvtepi16_epi32(src_r2_r3); 113d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi 114d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi /* Perform Inverse transform */ 115b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn /*-------------------------------------------------------------*/ 116b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn /* IDCT [ Horizontal transformation ] */ 117d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi /*-------------------------------------------------------------*/ 118d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi // Matrix transpose 119d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi /* 120d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi * a0 a1 a2 a3 121b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn * b0 b1 b2 b3 122b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn * c0 c1 c2 c3 123d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi * d0 d1 d2 d3 124d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi */ 125d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 126d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 127e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 128c04decde2a6e9b74395f402818852e752534254bUtkarsh Sanghi temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 129e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 130e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 131b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 
132b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 133e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi 134e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi temp0 = _mm_add_epi32(src_r0, src_r3); 135e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi temp1 = _mm_add_epi32(src_r1, src_r2); 136c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn temp2 = _mm_sub_epi32(src_r1, src_r2); 137c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn temp3 = _mm_sub_epi32(src_r0, src_r3); 13830c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez 139c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn src_r0 = _mm_add_epi32(temp0, temp1); 140c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn src_r1 = _mm_add_epi32(temp2, temp3); 14103d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn src_r2 = _mm_sub_epi32(temp0, temp1); 142c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn src_r3 = _mm_sub_epi32(temp3, temp2); 143c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn 144c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn /*-------------------------------------------------------------*/ 145c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn /* IDCT [ Vertical transformation ] */ 146b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn /*-------------------------------------------------------------*/ 147b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn // Matrix transpose 148c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn /* 149c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * a0 b0 c0 d0 150b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn * a1 b1 c1 d1 151c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * a2 b2 c2 d2 152c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * a3 b3 c3 d3 153c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn */ 154c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 
155c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 15630c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 15752e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 1584dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3 15952e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3 16052e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 16152e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 16252e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn 16352e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn temp0 = _mm_add_epi32(src_r0, src_r3); 164e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi temp1 = _mm_add_epi32(src_r1, src_r2); 165e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi temp2 = _mm_sub_epi32(src_r1, src_r2); 166e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi temp3 = _mm_sub_epi32(src_r0, src_r3); 167e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi 168e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi src_r0 = _mm_add_epi32(temp0, temp1); 169e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi src_r1 = _mm_add_epi32(temp2, temp3); 17003d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn src_r2 = _mm_sub_epi32(temp0, temp1); 171c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn src_r3 = _mm_sub_epi32(temp3, temp2); 172c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn 17330c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez src_r0 = _mm_mullo_epi32(src_r0, mult_val); 174c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn src_r1 = _mm_mullo_epi32(src_r1, mult_val); 175c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn 
src_r2 = _mm_mullo_epi32(src_r2, mult_val); 176e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi src_r3 = _mm_mullo_epi32(src_r3, mult_val); 17703d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn 178c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn //Scaling 179c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn if(u4_qp_div_6 >= 6) 180c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn { 181c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6); 182c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6); 183c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6); 184e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6); 185e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi } 186db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn else 187db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn { 188db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn temp0 = _mm_add_epi32(src_r0, add_rshift); 189db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn temp1 = _mm_add_epi32(src_r1, add_rshift); 190db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn temp2 = _mm_add_epi32(src_r2, add_rshift); 191db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn temp3 = _mm_add_epi32(src_r3, add_rshift); 192b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6); 193b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6); 194db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6); 195db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6); 196db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn } 197db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn src_r0_r1 = _mm_packs_epi32(src_r0, src_r1); 
198db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn src_r2_r3 = _mm_packs_epi32(src_r2, src_r3); 199db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn 200db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1); 201db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3); 202db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn} 203db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn 204db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahnvoid ih264_ihadamard_scaling_2x2_uv_sse42(WORD16* pi2_src, 205db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn WORD16* pi2_out, 206db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn const UWORD16 *pu2_iscal_mat, 207db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn const UWORD16 *pu2_weigh_mat, 208db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn UWORD32 u4_qp_div_6, 209e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi WORD32* pi4_tmp) 210db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn{ 211db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn __m128i src, plane_0, plane_1, temp0, temp1, sign_reg; 212db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn __m128i zero_8x16b = _mm_setzero_si128(); 213db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn __m128i scale_val = _mm_set1_epi32((WORD32)(pu2_iscal_mat[0] * pu2_weigh_mat[0])); 214db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn UNUSED(pi4_tmp); 215db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn 216db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn src = _mm_loadu_si128((__m128i *) pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3 217db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn sign_reg = _mm_cmpgt_epi16(zero_8x16b, src); 218db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits 219db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 
b1 b2 b3 -- 32 bits 220db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn 221db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3 222db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3 223db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3 224db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3 225db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3 22630c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3 227e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi 228e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3 229e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3 230e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi 231e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3 2324dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3 2334dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn 2344dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0] 2354dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_iscal_mat[0] * 
pu2_weigh_mat[0] 236e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi 237b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn temp0 = _mm_slli_epi32(temp0, u4_qp_div_6); 238b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn temp1 = _mm_slli_epi32(temp1, u4_qp_div_6); 239e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi 240e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi temp0 = _mm_srai_epi32(temp0, 5); 241e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi temp1 = _mm_srai_epi32(temp1, 5); 242b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn 243e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi temp0 = _mm_packs_epi32(temp0, temp1); //Final values are 16-bits only. 244e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi 245e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0); 246e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi 247e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi} 2482ee32a9d61896f544d87ecee24dc25cc33c9ebb3Utkarsh Sanghi