18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/******************************************************************************
28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Copyright (C) 2015 The Android Open Source Project
48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Licensed under the Apache License, Version 2.0 (the "License");
68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * you may not use this file except in compliance with the License.
78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * You may obtain a copy of the License at:
88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * http://www.apache.org/licenses/LICENSE-2.0
108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Unless required by applicable law or agreed to in writing, software
128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * distributed under the License is distributed on an "AS IS" BASIS,
138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * See the License for the specific language governing permissions and
158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * limitations under the License.
168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *****************************************************************************
188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/
/**
 *******************************************************************************
 * @file
 *  ih264_resi_trans_quant_sse42.c
 *
 * @brief
 *  Contains function definitions for the single-stage forward transform for
 *  H.264. Each function calculates the residue, applies the forward
 *  transform (cf) and then quantizes the result.
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - ih264_resi_trans_quant_4x4_sse42()
 *  - ih264_resi_trans_quant_chroma_4x4_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* System include files */
428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <stddef.h>
438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* User include files */
458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_typedefs.h"
468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_defs.h"
478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_size_defs.h"
488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_macros.h"
498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_trans_macros.h"
508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_trans_data.h"
518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_structs.h"
528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_trans_quant_itrans_iquant.h"
538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <immintrin.h>
548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/**
558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *******************************************************************************
568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @brief
588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   This function performs forward transform and quantization on a 4*4 block
598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @par Description:
618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   The function accepts source buffer and estimation buffer. From these, it
628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   computes the residue. This is residue is then transformed and quantized.
638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   The transform and quantization are in placed computed. They use the residue
648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   buffer for this.
658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu1_src
678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to source sub-block
688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu1_pred
708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to prediction sub-block
718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pi2_out
738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to residual sub-block
748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] src_strd
768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Source stride
778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pred_strd
798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Prediction stride
808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] dst_strd
828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Destination stride
838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_qbits
858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *    QP_BITS_h264_4x4 + floor(QP/6)
868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_threshold_matrix
888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to Forward Quant Threshold Matrix
898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_scale_matrix
918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to Forward Quant Scale Matrix
928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_round_factor
948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Quantization Round factor
958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[out] pu1_nnz
978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Total non-zero coefficients in the current sub-block
988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @returns
1008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
1018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @remarks
1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   None
1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *******************************************************************************
1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */
1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred,
1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                      WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                      const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix,
1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                      UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 *pu1_nnz,
1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                      WORD16 *pi2_alt_dc_addr)
1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
1138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 mask0, mask1;
1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sum0, sum1, sum2, cmp0, cmp1;
1158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
1168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp_2 = _mm_set1_epi16(2);
1178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp_1 = _mm_set1_epi16(1);
1188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src_r0, src_r1, src_r2, src_r3;
1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp0, temp1, temp2, temp3;
1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sign_reg0, sign_reg2;
1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i scalemat_r0_r1, scalemat_r2_r3;
12425e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar
12525e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar    UNUSED (pu2_threshold_matrix);
12625e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar
1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits
1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_cvtepu8_epi16(src_r0);
1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_cvtepu8_epi16(src_r1);
1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_cvtepu8_epi16(src_r2);
1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_cvtepu8_epi16(src_r3);
1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits
1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_sub_epi16(src_r3, pred_r3);
1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* Perform Forward transform */
1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*-------------------------------------------------------------*/
1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* DCT [ Horizontal transformation ]                          */
1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*-------------------------------------------------------------*/
1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Matrix transpose
1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*
1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a0 a1 a2 a3
1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  b0 b1 b2 b3
1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  c0 c1 c2 c3
1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  d0 d1 d2 d3
1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     */
1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 b0 a1 b1 a2 b2 a3 b3
1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //c0 d0 c1 d1 c2 d2 c3 d3
1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 b0 c0 d0 a1 b1 c1 d1
1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //a2 b2 c2 d2 a3 b3 c3 d3
1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 b0 c0 d0
1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //a1 b1 c1 d1
1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //a2 b2 c2 d2
1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //a3 b3 c3 d3
1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*----------------------------------------------------------*/
1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x0 = z0 + z3                                             */
1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi16(src_r0, src_r3);
1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x1 = z1 + z2                                             */
1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(src_r1, src_r2);
1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x2 = z1 - z2                                             */
1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_sub_epi16(src_r1, src_r2);
1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x3 = z0 - z3                                             */
1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_sub_epi16(src_r0, src_r3);
1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z0 = x0 + x1                                             */
1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_add_epi16(temp0, temp1);
1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z1 = (x3 << 1) + x2                                      */
1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_add_epi16(src_r1, temp2);
1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z2 = x0 - x1                                             */
1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_sub_epi16(temp0, temp1);
1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z3 = x3 - (x2 << 1)                                      */
1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_sub_epi16(temp3, src_r3);
1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Matrix transpose
1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*
1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a0 b0 c0 d0
1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a1 b1 c1 d1
2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a2 b2 c2 d2
2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a3 b3 c3 d3
2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     */
2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 a1 b0 b1 c0 c1 d0 d1
2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //a2 a3 b2 b3 c2 c3 d2 d3
2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 a1 a2 a3 b0 b1 b2 b3
2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //c0 c1 c2 c3 d0 d1 d2 d3
2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 a1 a2 a3
2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //b0 b1 b2 b3
2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //c0 c1 c2 c3
2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //d0 d1 d2 d3
2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*----------------------------------------------------------*/
2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x0 = z0 + z3                                             */
2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi16(src_r0, src_r3);
2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x1 = z1 + z2                                             */
2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(src_r1, src_r2);
2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x2 = z1 - z2                                             */
2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_sub_epi16(src_r1, src_r2);
2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x3 = z0 - z3                                             */
2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_sub_epi16(src_r0, src_r3);
2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z0 = x0 + x1                                             */
2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_add_epi16(temp0, temp1);
2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z1 = (x3 << 1) + x2                                      */
2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_add_epi16(src_r1, temp2);
2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z2 = x0 - x1                                             */
2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_sub_epi16(temp0, temp1);
2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z3 = x3 - (x2 << 1)                                      */
2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_sub_epi16(temp3, src_r3);
2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    tmp_dc = _mm_extract_epi16(src_r0,0);                       //a0
2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    *pi2_alt_dc_addr = tmp_dc;
2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);                //a0 a1 a2 a3 b0 b1 b2 b3
2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);                //c0 c1 c2 c3 d0 d1 d2 d3
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);
2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);
2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);
2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_abs_epi16(src_r0);
2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_abs_epi16(src_r2);
2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_srli_si128(src_r0, 8);
2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_cvtepu16_epi32(src_r0);
2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_cvtepu16_epi32(src_r1);
2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_srli_si128(src_r2, 8);
2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_cvtepu16_epi32(src_r2);
2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_cvtepu16_epi32(src_r3);
2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_mullo_epi32(temp0, src_r0);
2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_mullo_epi32(temp1, src_r1);
2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_mullo_epi32(temp2, src_r2);
2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_mullo_epi32(temp3, src_r3);
2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi32(temp0,rnd_fact);
2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi32(temp1,rnd_fact);
2728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi32(temp2,rnd_fact);
2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi32(temp3,rnd_fact);
2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_srli_epi32(temp0,u4_qbits);
2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srli_epi32(temp1,u4_qbits);
2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srli_epi32(temp2,u4_qbits);
2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_srli_epi32(temp3,u4_qbits);
2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 =  _mm_packs_epi32 (temp0,temp1);
2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 =  _mm_packs_epi32 (temp2,temp3);
2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 =  _mm_sign_epi16(temp0, sign_reg0);
2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 =  _mm_sign_epi16(temp2, sign_reg2);
2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mask0 = _mm_movemask_epi8(cmp0);
2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mask1 = _mm_movemask_epi8(cmp1);
2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u4_zero_coeff = 0;
2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    if(mask0)
2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if(mask0 == 0xffff)
2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff+=8;
2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        else
3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        {
3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            cmp0 = _mm_and_si128(temp_1, cmp0);
3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    if(mask1)
3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if(mask1 == 0xffff)
3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff+=8;
3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        else
3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        {
3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            cmp1 = _mm_and_si128(temp_1, cmp1);
3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* Return total nonzero coefficients in the current sub block */
3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u4_nonzero_coeff = 16 - u4_zero_coeff;
3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    *pu1_nnz =  u4_nonzero_coeff;
3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/**
3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *******************************************************************************
3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @brief
3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   This function performs forward transform and quantization on a 4*4 chroma block
3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @par Description:
3348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   The function accepts source buffer and estimation buffer. From these, it
3358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   computes the residue. This is residue is then transformed and quantized.
3368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   The transform and quantization are in placed computed. They use the residue
3378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   buffer for this.
3388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu1_src
3408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to source sub-block
3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu1_pred
3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to prediction sub-block
3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pi2_out
3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to residual sub-block
3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] src_strd
3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Source stride
3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pred_strd
3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Prediction stride
3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] dst_strd
3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Destination stride
3568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_qbits
3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *    QP_BITS_h264_4x4 + floor(QP/6)
3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_threshold_matrix
3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to Forward Quant Threshold Matrix
3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_scale_matrix
3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to Forward Quant Scale Matrix
3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_round_factor
3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Quantization Round factor
3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[out] pu1_nnz
3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Total non-zero coefficients in the current sub-block
3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @returns
3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @remarks
3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   None
3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *******************************************************************************
3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */
3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WORD16 *pi2_out,
3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                            WORD32 src_strd,WORD32 pred_strd,
3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                            const UWORD16 *pu2_scale_matrix,
3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                            const UWORD16 *pu2_threshold_matrix,
3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                            UWORD32 u4_qbits,UWORD32 u4_round_factor,
3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                            UWORD8  *pu1_nnz, WORD16 *pi2_alt_dc_addr)
3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 mask0, mask1;
3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i cmp0, cmp1, sum0, sum1, sum2;
3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp_2 = _mm_set1_epi16(2);
3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp_1 = _mm_set1_epi16(1);
3928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src_r0, src_r1, src_r2, src_r3;
3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp0, temp1, temp2, temp3;
3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sign_reg0, sign_reg2;
3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i scalemat_r0_r1, scalemat_r2_r3;
3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i chroma_mask = _mm_set1_epi16 (0xFF);
3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
40025e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar    UNUSED (pu2_threshold_matrix);
40125e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar
4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits
4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_and_si128(src_r0, chroma_mask);
4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_and_si128(src_r1, chroma_mask);
4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_and_si128(src_r2, chroma_mask);
4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_and_si128(src_r3, chroma_mask);
4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//  src_r0 = _mm_cvtepu8_epi16(src_r0);
4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//  src_r1 = _mm_cvtepu8_epi16(src_r1);
4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//  src_r2 = _mm_cvtepu8_epi16(src_r2);
4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//  src_r3 = _mm_cvtepu8_epi16(src_r3);
4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
4208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//  pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//  pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//  pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S//  pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits
4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_sub_epi16(src_r3, pred_r3);
4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* Perform Forward transform */
4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*-------------------------------------------------------------*/
4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* DCT [ Horizontal transformation ]                          */
4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*-------------------------------------------------------------*/
4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Matrix transpose
4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*
4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a0 a1 a2 a3
4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  b0 b1 b2 b3
4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  c0 c1 c2 c3
4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  d0 d1 d2 d3
4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     */
4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 b0 a1 b1 a2 b2 a3 b3
4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //c0 d0 c1 d1 c2 d2 c3 d3
4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 b0 c0 d0 a1 b1 c1 d1
4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //a2 b2 c2 d2 a3 b3 c3 d3
4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 b0 c0 d0
4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //a1 b1 c1 d1
4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //a2 b2 c2 d2
4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //a3 b3 c3 d3
4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*----------------------------------------------------------*/
4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x0 = z0 + z3                                             */
4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi16(src_r0, src_r3);
4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x1 = z1 + z2                                             */
4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(src_r1, src_r2);
4638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x2 = z1 - z2                                             */
4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_sub_epi16(src_r1, src_r2);
4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x3 = z0 - z3                                             */
4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_sub_epi16(src_r0, src_r3);
4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z0 = x0 + x1                                             */
4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_add_epi16(temp0, temp1);
4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z1 = (x3 << 1) + x2                                      */
4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_add_epi16(src_r1, temp2);
4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z2 = x0 - x1                                             */
4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_sub_epi16(temp0, temp1);
4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z3 = x3 - (x2 << 1)                                      */
4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_sub_epi16(temp3, src_r3);
4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Matrix transpose
4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*
4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a0 b0 c0 d0
4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a1 b1 c1 d1
4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a2 b2 c2 d2
4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a3 b3 c3 d3
4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     */
4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 a1 b0 b1 c0 c1 d0 d1
4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //a2 a3 b2 b3 c2 c3 d2 d3
4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 a1 a2 a3 b0 b1 b2 b3
4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //c0 c1 c2 c3 d0 d1 d2 d3
4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 a1 a2 a3
4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //b0 b1 b2 b3
4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //c0 c1 c2 c3
4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //d0 d1 d2 d3
4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*----------------------------------------------------------*/
4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x0 = z0 + z3                                             */
4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi16(src_r0, src_r3);
4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x1 = z1 + z2                                             */
5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(src_r1, src_r2);
5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x2 = z1 - z2                                             */
5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_sub_epi16(src_r1, src_r2);
5038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* x3 = z0 - z3                                             */
5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_sub_epi16(src_r0, src_r3);
5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z0 = x0 + x1                                             */
5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_add_epi16(temp0, temp1);
5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z1 = (x3 << 1) + x2                                      */
5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_add_epi16(src_r1, temp2);
5118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z2 = x0 - x1                                             */
5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_sub_epi16(temp0, temp1);
5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* z3 = x3 - (x2 << 1)                                      */
5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_sub_epi16(temp3, src_r3);
5168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    tmp_dc = _mm_extract_epi16(src_r0,0);                       //a0
5188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    *pi2_alt_dc_addr = tmp_dc;
5198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);                //a0 a1 a2 a3 b0 b1 b2 b3
5218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);                //c0 c1 c2 c3 d0 d1 d2 d3
5228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
5238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);
5248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
5268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);
5278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
5298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);
5308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_abs_epi16(src_r0);
5328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_abs_epi16(src_r2);
5338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_srli_si128(src_r0, 8);
5358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_cvtepu16_epi32(src_r0);
5368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_cvtepu16_epi32(src_r1);
5378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_srli_si128(src_r2, 8);
5388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_cvtepu16_epi32(src_r2);
5398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_cvtepu16_epi32(src_r3);
5408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
5428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
5438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
5448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
5458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
5468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
5478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_mullo_epi32(temp0, src_r0);
5498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_mullo_epi32(temp1, src_r1);
5508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_mullo_epi32(temp2, src_r2);
5518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_mullo_epi32(temp3, src_r3);
5528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi32(temp0,rnd_fact);
5548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi32(temp1,rnd_fact);
5558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi32(temp2,rnd_fact);
5568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi32(temp3,rnd_fact);
5578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_srli_epi32(temp0,u4_qbits);
5598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srli_epi32(temp1,u4_qbits);
5608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srli_epi32(temp2,u4_qbits);
5618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_srli_epi32(temp3,u4_qbits);
5628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 =  _mm_packs_epi32 (temp0,temp1);
5648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 =  _mm_packs_epi32 (temp2,temp3);
5658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 =  _mm_sign_epi16(temp0, sign_reg0);
5678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 =  _mm_sign_epi16(temp2, sign_reg2);
5688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //temp0 = _mm_insert_epi16(temp0, tmp_dc, 0);
5708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
5728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
5738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
5758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
5768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mask0 = _mm_movemask_epi8(cmp0);
5788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mask1 = _mm_movemask_epi8(cmp1);
5798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u4_zero_coeff = 0;
5808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    if(mask0)
5818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
5828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if(mask0 == 0xffff)
5838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff+=8;
5848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        else
5858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        {
5868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            cmp0 = _mm_and_si128(temp_1, cmp0);
5878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
5888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
5898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
5908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
5918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
5928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
5938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    if(mask1)
5948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
5958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if(mask1 == 0xffff)
5968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff+=8;
5978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        else
5988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        {
5998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            cmp1 = _mm_and_si128(temp_1, cmp1);
6008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
6018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
6028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
6038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
6048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
6058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
6068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* Return total nonzero coefficients in the current sub block */
6088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u4_nonzero_coeff = 16 - u4_zero_coeff;
6098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    *pu1_nnz =  u4_nonzero_coeff;
6108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
6128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/**
6158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *******************************************************************************
6168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @brief
6188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   This function performs forward hadamard transform and quantization on a 4*4 block
6198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @par Description:
6218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   The function accepts source buffer and estimation buffer. From these, it
 *   computes the residue. This residue is then transformed and quantized.
 *   The transform and quantization are computed in place. They use the residue
6248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   buffer for this.
6258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu1_src
6278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to source sub-block
6288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu1_pred
6308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to prediction sub-block
6318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pi2_out
6338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to residual sub-block
6348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] src_strd
6368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Source stride
6378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pred_strd
6398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Prediction stride
6408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] dst_strd
6428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Destination stride
6438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_qbits
6458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *    QP_BITS_h264_4x4 + floor(QP/6)
6468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_threshold_matrix
6488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to Forward Quant Threshold Matrix
6498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_scale_matrix
6518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to Forward Quant Scale Matrix
6528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_round_factor
6548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Quantization Round factor
6558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[out] pu1_nnz
6578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Total non-zero coefficients in the current sub-block
6588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @returns
6608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @remarks
6628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   None
6638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
6648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */
6658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
6678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          const UWORD16 *pu2_scale_matrix,
6688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
6698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          UWORD32 u4_round_factor,UWORD8  *pu1_nnz
6708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          )
6718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
6728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 u4_zero_coeff,u4_nonzero_coeff=0;
6738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i cmp0, cmp1, sum0, sum1, sum2;
6748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 mask0, mask1;
6758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src_r0_r1, src_r2_r3, sign_reg;
6768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src_r0, src_r1, src_r2, src_r3;
6778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero_8x16b = _mm_setzero_si128();
6788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp0, temp1, temp2, temp3;
6798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3;
6808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp_1 = _mm_set1_epi16(1);
6818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
6828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
6838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
68425e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar    UNUSED (pu2_threshold_matrix);
68525e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar
6868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
6878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
6888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
6898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg);   //a0 a1 a2 a3
6908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg);   //b0 b1 b2 b3
6918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
6928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg);   //c0 c1 c2 c3
6938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg);   //d0 d1 d2 d3
6948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* Perform Inverse transform */
6968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*-------------------------------------------------------------*/
6978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* Forward DC transform [ Horizontal transformation ]                          */
6988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*-------------------------------------------------------------*/
6998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Matrix transpose
7008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*
7018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a0 a1 a2 a3
7028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  b0 b1 b2 b3
7038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  c0 c1 c2 c3
7048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  d0 d1 d2 d3
7058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     */
7068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 b0 a1 b1
7078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //c0 d0 c1 d1
7088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //a2 b2 a3 b3
7098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 d2 c3 d3
7108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                    //a0 b0 c0 d0
7118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                    //a1 b1 c1 d1
7128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                    //a2 b2 c2 d2
7138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                    //a3 b3 c3 d3
7148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi32(src_r0, src_r3);
7168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi32(src_r1, src_r2);
7178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_sub_epi32(src_r1, src_r2);
7188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_sub_epi32(src_r0, src_r3);
7198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_add_epi32(temp0, temp1);
7218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_add_epi32(temp2, temp3);
7228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_sub_epi32(temp0, temp1);
7238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_sub_epi32(temp3, temp2);
7248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*-------------------------------------------------------------*/
7268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* Forward DC transform [ Vertical transformation ]                          */
7278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*-------------------------------------------------------------*/
7288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Matrix transpose
7298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /*
7308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a0 b0 c0 d0
7318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a1 b1 c1 d1
7328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a2 b2 c2 d2
7338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     *  a3 b3 c3 d3
7348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S     */
7358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 a1 b0 b1
7368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //a2 a3 b2 b3
7378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //c0 c1 d0 d1
7388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 c3 d2 d3
7398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                   //a0 a1 a2 a3
7408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                   //b0 b1 b2 b3
7418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                   //c0 c1 c2 c3
7428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                   //d0 d1 d2 d3
7438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi32(src_r0, src_r3);
7458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi32(src_r1, src_r2);
7468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_sub_epi32(src_r1, src_r2);
7478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_sub_epi32(src_r0, src_r3);
7488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_add_epi32(temp0, temp1);
7508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_add_epi32(temp2, temp3);
7518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_sub_epi32(temp0, temp1);
7528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_sub_epi32(temp3, temp2);
7538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_srai_epi32(src_r0, 1);
7558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_srai_epi32(src_r1, 1);
7568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_srai_epi32(src_r2, 1);
7578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_srai_epi32(src_r3, 1);
7588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Quantization
7608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0);        //Find sign of each value for later restoration
7618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1);
7628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2);
7638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3);
7648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1);      //Sign = -1 or 0 depending on <0 or >0 respectively
7668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3);
7678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);               //Sign = -2 or 0 depending on <0 or >0 respectively
7698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_slli_epi16(sign_reg2, 1);
7708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);            //Sign = -1 or 1 depending on <0 or >0 respectively
7728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);
7738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_abs_epi32(src_r0);                         //Absolute values
7758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_abs_epi32(src_r1);
7768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_abs_epi32(src_r2);
7778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_abs_epi32(src_r3);
7788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_mullo_epi32(scale_val, src_r0);             //multiply by pu2_scale_matrix[0]
7808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_mullo_epi32(scale_val, src_r1);
7818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_mullo_epi32(scale_val, src_r2);
7828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_mullo_epi32(scale_val, src_r3);
7838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi32(temp0,rnd_fact);                  //Add round factor
7858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi32(temp1,rnd_fact);
7868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi32(temp2,rnd_fact);
7878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi32(temp3,rnd_fact);
7888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_srli_epi32(temp0,u4_qbits);                 //RIght shift by qbits, unsigned variable, so shift right immediate works
7908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srli_epi32(temp1,u4_qbits);
7918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srli_epi32(temp2,u4_qbits);
7928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_srli_epi32(temp3,u4_qbits);
7938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 =  _mm_packs_epi32 (temp0,temp1);                 //Final values are 16-bits only.
7958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 =  _mm_packs_epi32 (temp2,temp3);
7968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 =  _mm_sign_epi16(temp0, sign_reg0);              //Sign restoration
7988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 =  _mm_sign_epi16(temp2, sign_reg2);
7998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
8018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2);
8028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
8048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
8058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mask0 = _mm_movemask_epi8(cmp0);
8078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mask1 = _mm_movemask_epi8(cmp1);
8088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u4_zero_coeff = 0;
8098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    if(mask0)
8108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
8118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if(mask0 == 0xffff)
8128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff+=8;
8138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        else
8148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        {
8158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            cmp0 = _mm_and_si128(temp_1, cmp0);
8168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
8178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
8188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
8198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
8208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
8218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
8228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    if(mask1)
8238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
8248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if(mask1 == 0xffff)
8258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff+=8;
8268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        else
8278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        {
8288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            cmp1 = _mm_and_si128(temp_1, cmp1);
8298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
8308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
8318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
8328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
8338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
8348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
8358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* Return total nonzero coefficients in the current sub block */
8378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u4_nonzero_coeff = 16 - u4_zero_coeff;
8388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_nnz[0] =  u4_nonzero_coeff;
8398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
8408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/**
8438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *******************************************************************************
8448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @brief
8468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   This function performs forward hadamard transform and quantization on a 2*2 block
8478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   for both U and V planes
8488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @par Description:
8508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   The function accepts source buffer and estimation buffer. From these, it
8518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   computes the residue. This is residue is then transformed and quantized.
8528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   The transform and quantization are in placed computed. They use the residue
8538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   buffer for this.
8548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu1_src
8568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to source sub-block
8578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu1_pred
8598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to prediction sub-block
8608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pi2_out
8628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to residual sub-block
8638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] src_strd
8658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Source stride
8668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pred_strd
8688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Prediction stride
8698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] dst_strd
8718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Destination stride
8728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_qbits
8748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *    QP_BITS_h264_4x4 + floor(QP/6)
8758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_threshold_matrix
8778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to Forward Quant Threshold Matrix
8788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_scale_matrix
8808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Pointer to Forward Quant Scale Matrix
8818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_round_factor
8838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Quantization Round factor
8848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[out] pu1_nnz
8868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   Total non-zero coefficients in the current sub-block
8878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @returns
8898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @remarks
8918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *   NNZ for dc is populated at 0 and 5th position of pu1_nnz
8928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
8938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */
8948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
8968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            const UWORD16 *pu2_scale_matrix,
8978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
8988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            UWORD32 u4_round_factor,UWORD8  *pu1_nnz)
8998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
900caab4fe0688e7c4b023d979b617fb16a275614f2Hamsalekha S    WORD32 val, nonzero_coeff_0=0, nonzero_coeff_1=0;
9018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i cmp, cmp0, cmp1;
9028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sum0, sum1;
9038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 mask, mask0, mask1;
9048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
9058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero_8x16b = _mm_setzero_si128();
9068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
9078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sign_reg0, sign_reg1;
9088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp_1 = _mm_set1_epi16(1);
9098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
9108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
91125e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar    UNUSED (pu2_threshold_matrix);
91225e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar
9138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *)pi2_src);          //a0 a1 a2 a3 b0 b1 b2 b3
9148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
9158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_0 = _mm_unpacklo_epi16(src, sign_reg);        //a0 a1 a2 a3 -- 32 bits
9168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_1 = _mm_unpackhi_epi16(src, sign_reg);        //b0 b1 b2 b3 -- 32 bits
9178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_hadd_epi32(plane_0, plane_1);           //a0+a1 a2+a3 b0+b1 b2+b3
9198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_hsub_epi32(plane_0, plane_1);           //a0-a1 a2-a3 b0-b1 b2-b3
9208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_0 = _mm_hadd_epi32(temp0, temp1);             //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
9228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_1 = _mm_hsub_epi32(temp0, temp1);             //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3
9238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_unpacklo_epi32(plane_0, plane_1);       //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
9258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi32(plane_0, plane_1);       //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3
9268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_0 = _mm_unpacklo_epi64(temp0, temp1);         //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
9288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_1 = _mm_unpackhi_epi64(temp0, temp1);         //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3
9298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_0 = _mm_shuffle_epi32(plane_0, 0xd8);         //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
9318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_1 = _mm_shuffle_epi32(plane_1, 0xd8);         //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3
9328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Quantization
9338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0);       //Find sign of each value for later restoration
9348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1);
9358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1);      //Sign = -1 or 0 depending on <0 or >0 respectively
9378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);               //Sign = -2 or 0 depending on <0 or >0 respectively
9388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);            //Sign = -1 or 1 depending on <0 or >0 respectively
9398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_0 = _mm_abs_epi32(plane_0);                           //Absolute values
9418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    plane_1 = _mm_abs_epi32(plane_1);
9428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_mullo_epi32(scale_val, plane_0);                //multiply by pu2_scale_matrix[0]
9448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_mullo_epi32(scale_val, plane_1);                //multiply by pu2_scale_matrix[0]
9458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_add_epi32(temp0,rnd_fact);                  //Add round factor
9478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi32(temp1,rnd_fact);
9488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 = _mm_srli_epi32(temp0,u4_qbits);                 //RIght shift by qbits, unsigned variable, so shift right immediate works
9508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srli_epi32(temp1,u4_qbits);
9518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 =  _mm_packs_epi32 (temp0,temp1);                 //Final values are 16-bits only.
9538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp0 =  _mm_sign_epi16(temp0, sign_reg0);              //Sign restoration
9548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
9568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    cmp = _mm_cmpeq_epi16(temp0, zero_8x16b);
9588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mask = _mm_movemask_epi8(cmp);
9598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mask0 = mask & 0xff;
9608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mask1 = mask>>8;
9618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    if(mask0)
9628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
9638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if(mask0 == 0xff)
9648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            nonzero_coeff_0 += 4;
9658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        else
9668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        {
9678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            cmp0 = _mm_and_si128(temp_1, cmp);
9688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
9698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
9708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            val = _mm_cvtsi128_si32(sum1);
9718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            val = val & 0xffff;
9728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            nonzero_coeff_0 += val;
9738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
9748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
9758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    if(mask1)
9768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
9778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if(mask1 == 0xff)
9788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            nonzero_coeff_1 += 4;
9798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        else
9808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        {
9818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            cmp1 = _mm_srli_si128(cmp, 8);
9828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            cmp1 = _mm_and_si128(temp_1, cmp1);
9838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
9848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
9858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            nonzero_coeff_1 += _mm_cvtsi128_si32(sum1);
9868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
9878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
9888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_nnz[0] = 4 - nonzero_coeff_0;
9908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_nnz[1] = 4 - nonzero_coeff_1;
9918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
993