ih264_ihadamard_scaling_sse42.c revision 7497191460a9504f8b4f64df169ab633f0b74353
1bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi/******************************************************************************
2bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi *
3bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * Copyright (C) 2015 The Android Open Source Project
4bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi *
5bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * Licensed under the Apache License, Version 2.0 (the "License");
6bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * you may not use this file except in compliance with the License.
7bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * You may obtain a copy of the License at:
8bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi *
9bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * http://www.apache.org/licenses/LICENSE-2.0
10bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi *
11bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * Unless required by applicable law or agreed to in writing, software
12bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * distributed under the License is distributed on an "AS IS" BASIS,
13bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * See the License for the specific language governing permissions and
15bbef5dff2b94fef72012e721cd6124cd87621af4Utkarsh Sanghi * limitations under the License.
16c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn *
17c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn *****************************************************************************
18c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
1930c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez*/
/**
 *******************************************************************************
 * @file
 *  ih264_ihadamard_scaling_sse42.c
 *
 * @brief
 *  Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
 *
 * @author
 *  Mohit
 *
 *  @par List of Functions:
 *  - ih264_ihadamard_scaling_4x4_sse42()
 *  - ih264_ihadamard_scaling_2x2_uv_sse42()
 *
 * @remarks
 *
 *******************************************************************************
 */
39c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn/*****************************************************************************/
40c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn/* File Includes                                                             */
41c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn/*****************************************************************************/
42c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn
43c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn/* User include files */
442ee32a9d61896f544d87ecee24dc25cc33c9ebb3Utkarsh Sanghi#include "ih264_typedefs.h"
45e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi#include "ih264_defs.h"
4639dd58452d7fe3ebe93490a6239a76b385bd676dUtkarsh Sanghi#include "ih264_trans_macros.h"
474dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn#include "ih264_macros.h"
48c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include "ih264_trans_data.h"
49c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include "ih264_size_defs.h"
50c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include "ih264_structs.h"
51c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include "ih264_trans_quant_itrans_iquant.h"
52c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn#include <immintrin.h>
53c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn
54c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn/*
55c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn ********************************************************************************
56c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn *
57c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
58c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * of a 16x16 intra prediction macroblock, and then performs scaling.
59887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi * prediction buffer
60887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi *
61e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi * @par Description:
62e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi *  The DC coefficients pass through a 2-stage inverse hadamard transform.
63887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi *  This inverse transformed content is scaled to based on Qp value.
64e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi *
65887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi * @param[in] pi2_src
66e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi *  input 4x4 block of DC coefficients
67e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi *
68887ccbcc4db5fa0b5221018296aa49b86a5ac26fUtkarsh Sanghi * @param[out] pi2_out
69e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi *  output 4x4 block
70e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi *
71c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * @param[in] pu2_iscal_mat
72c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn *  pointer to scaling list
73c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn *
74c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * @param[in] pu2_weigh_mat
75c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn *  pointer to weight matrix
7687b642d428ca28ff81b97ab0617f2c04d8605683Darren Krahn *
7787b642d428ca28ff81b97ab0617f2c04d8605683Darren Krahn * @param[in] u4_qp_div_6
7887b642d428ca28ff81b97ab0617f2c04d8605683Darren Krahn *  Floor (qp/6)
79c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn *
804dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn * @param[in] pi4_tmp
81c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn * temporary buffer of size 1*16
8203d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn *
8303d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn * @returns none
8403d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn *
85c04decde2a6e9b74395f402818852e752534254bUtkarsh Sanghi * @remarks none
8603d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn *
8703d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn *******************************************************************************
8803d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn */
8903d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahnvoid ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src,
9003d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn                                       WORD16* pi2_out,
91c04decde2a6e9b74395f402818852e752534254bUtkarsh Sanghi                                       const UWORD16 *pu2_iscal_mat,
9203d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn                                       const UWORD16 *pu2_weigh_mat,
9303d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn                                       UWORD32 u4_qp_div_6,
9403d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn                                       WORD32* pi4_tmp)
9503d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn{
9603d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn    __m128i src_r0_r1, src_r2_r3;
9703d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn    __m128i src_r0, src_r1, src_r2, src_r3;
9803d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn    __m128i temp0, temp1, temp2, temp3;
99d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6)));
100d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
10130c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez    UNUSED (pi4_tmp);
102d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi
103d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
104d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
105d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    //sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
106d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    src_r0 = _mm_cvtepi16_epi32(src_r0_r1);
107d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    src_r0_r1 = _mm_srli_si128(src_r0_r1, 8);
108d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    src_r1 = _mm_cvtepi16_epi32(src_r0_r1);
10930c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez
110d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    src_r2 = _mm_cvtepi16_epi32(src_r2_r3);
1114dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn    src_r2_r3 = _mm_srli_si128(src_r2_r3, 8);
1124dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn    src_r3 = _mm_cvtepi16_epi32(src_r2_r3);
113d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi
114d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    /* Perform Inverse transform */
115b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn    /*-------------------------------------------------------------*/
116b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn    /* IDCT [ Horizontal transformation ]                          */
117d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    /*-------------------------------------------------------------*/
118d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    // Matrix transpose
119d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    /*
120d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi     *  a0 a1 a2 a3
121b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn     *  b0 b1 b2 b3
122b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn     *  c0 c1 c2 c3
123d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi     *  d0 d1 d2 d3
124d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi     */
125d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 b0 a1 b1
126d75dcae8a010d1ced7554dd25a440bee350a2d06Utkarsh Sanghi    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //c0 d0 c1 d1
127e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //a2 b2 a3 b3
128c04decde2a6e9b74395f402818852e752534254bUtkarsh Sanghi    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 d2 c3 d3
129e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                    //a0 b0 c0 d0
130e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                    //a1 b1 c1 d1
131b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                    //a2 b2 c2 d2
132b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                    //a3 b3 c3 d3
133e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi
134e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi    temp0 = _mm_add_epi32(src_r0, src_r3);
135e8b9a556d4561617747fed4ee5ced70fce9a4392Utkarsh Sanghi    temp1 = _mm_add_epi32(src_r1, src_r2);
136c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    temp2 = _mm_sub_epi32(src_r1, src_r2);
137c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    temp3 = _mm_sub_epi32(src_r0, src_r3);
13830c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez
139c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    src_r0 = _mm_add_epi32(temp0, temp1);
140c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    src_r1 = _mm_add_epi32(temp2, temp3);
14103d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn    src_r2 = _mm_sub_epi32(temp0, temp1);
142c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    src_r3 = _mm_sub_epi32(temp3, temp2);
143c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn
144c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    /*-------------------------------------------------------------*/
145c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    /* IDCT [ Vertical transformation ]                          */
146b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn    /*-------------------------------------------------------------*/
147b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn    // Matrix transpose
148c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    /*
149c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn     *  a0 b0 c0 d0
150b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn     *  a1 b1 c1 d1
151c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn     *  a2 b2 c2 d2
152c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn     *  a3 b3 c3 d3
153c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn     */
154c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 a1 b0 b1
155c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //a2 a3 b2 b3
15630c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //c0 c1 d0 d1
15752e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 c3 d2 d3
1584dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                   //a0 a1 a2 a3
15952e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                   //b0 b1 b2 b3
16052e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                   //c0 c1 c2 c3
16152e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                   //d0 d1 d2 d3
16252e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn
16352e2a45f585fbe34032eae5b094a092afdf217caDarren Krahn    temp0 = _mm_add_epi32(src_r0, src_r3);
164e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    temp1 = _mm_add_epi32(src_r1, src_r2);
165e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    temp2 = _mm_sub_epi32(src_r1, src_r2);
166e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    temp3 = _mm_sub_epi32(src_r0, src_r3);
167e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi
168e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    src_r0 = _mm_add_epi32(temp0, temp1);
169e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    src_r1 = _mm_add_epi32(temp2, temp3);
17003d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn    src_r2 = _mm_sub_epi32(temp0, temp1);
171c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    src_r3 = _mm_sub_epi32(temp3, temp2);
172c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn
17330c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez    src_r0 = _mm_mullo_epi32(src_r0, mult_val);
174c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    src_r1 = _mm_mullo_epi32(src_r1, mult_val);
175c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    src_r2 = _mm_mullo_epi32(src_r2, mult_val);
176e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    src_r3 = _mm_mullo_epi32(src_r3, mult_val);
17703d54dfbcbdd04384e8c0419b7c45282664a2c1aDarren Krahn
178c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    //Scaling
179c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    if(u4_qp_div_6 >= 6)
180c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn    {
181c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn        src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6);
182c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn        src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6);
183c364caa9d091baae8eeb9144a6abf69e1fcabb39Darren Krahn        src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6);
184e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi        src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6);
185e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    }
186db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    else
187db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    {
188db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn        temp0 = _mm_add_epi32(src_r0, add_rshift);
189db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn        temp1 = _mm_add_epi32(src_r1, add_rshift);
190db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn        temp2 = _mm_add_epi32(src_r2, add_rshift);
191db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn        temp3 = _mm_add_epi32(src_r3, add_rshift);
192b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn        src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6);
193b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn        src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6);
194db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn        src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6);
195db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn        src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6);
196db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    }
197db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    src_r0_r1 = _mm_packs_epi32(src_r0, src_r1);
198db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    src_r2_r3 = _mm_packs_epi32(src_r2, src_r3);
199db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn
200db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1);
201db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3);
202db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn}
203db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn
204db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahnvoid ih264_ihadamard_scaling_2x2_uv_sse42(WORD16* pi2_src,
205db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn                                          WORD16* pi2_out,
206db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn                                          const UWORD16 *pu2_iscal_mat,
207db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn                                          const UWORD16 *pu2_weigh_mat,
208db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn                                          UWORD32 u4_qp_div_6,
209e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi                                          WORD32* pi4_tmp)
210db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn{
211db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
212db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    __m128i zero_8x16b = _mm_setzero_si128();
213db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    __m128i scale_val = _mm_set1_epi32((WORD32)(pu2_iscal_mat[0] * pu2_weigh_mat[0]));
214db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    UNUSED(pi4_tmp);
215db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn
216db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    src = _mm_loadu_si128((__m128i *) pi2_src);         //a0 a1 a2 a3 b0 b1 b2 b3
217db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
218db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    plane_0 = _mm_unpacklo_epi16(src, sign_reg);        //a0 a1 a2 a3 -- 32 bits
219db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    plane_1 = _mm_unpackhi_epi16(src, sign_reg);        //b0 b1 b2 b3 -- 32 bits
220db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn
221db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    temp0 = _mm_hadd_epi32(plane_0, plane_1);           //a0+a1 a2+a3 b0+b1 b2+b3
222db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    temp1 = _mm_hsub_epi32(plane_0, plane_1);           //a0-a1 a2-a3 b0-b1 b2-b3
223db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    plane_0 = _mm_hadd_epi32(temp0, temp1);             //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
224db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    plane_1 = _mm_hsub_epi32(temp0, temp1);             //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3
225db30790da11248905f8dca02a933ebbdcf9e3ff9Darren Krahn    temp0 = _mm_unpacklo_epi32(plane_0, plane_1);       //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
22630c921db09d27768acc1ea0d8b6a9c8e814f931aLuis Hector Chavez    temp1 = _mm_unpackhi_epi32(plane_0, plane_1);       //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3
227e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi
228e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    plane_0 = _mm_unpacklo_epi64(temp0, temp1);         //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
229e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    plane_1 = _mm_unpackhi_epi64(temp0, temp1);         //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3
230e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi
231e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    plane_0 = _mm_shuffle_epi32(plane_0, 0xd8);         //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
2324dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn    plane_1 = _mm_shuffle_epi32(plane_1, 0xd8);         //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3
2334dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn
2344dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn    temp0 = _mm_mullo_epi32(scale_val, plane_0);        //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0]
2354dc4629c415e7ca90ff146d7bb75b5646ecd8b17Darren Krahn    temp1 = _mm_mullo_epi32(scale_val, plane_1);        //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0]
236e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi
237b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn    temp0 = _mm_slli_epi32(temp0, u4_qp_div_6);
238b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn    temp1 = _mm_slli_epi32(temp1, u4_qp_div_6);
239e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi
240e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    temp0 = _mm_srai_epi32(temp0, 5);
241e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    temp1 = _mm_srai_epi32(temp1, 5);
242b180754b429c078cbc99175a6059a8b5d0491002Darren Krahn
243e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    temp0 = _mm_packs_epi32(temp0, temp1);              //Final values are 16-bits only.
244e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi
245e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
246e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi
247e7eb2bf306af6e8408cd77125861542d19e5ec6dUtkarsh Sanghi}
2482ee32a9d61896f544d87ecee24dc25cc33c9ebb3Utkarsh Sanghi